<a href="https://colab.research.google.com/github/Bobbsicle27/VandanaK/blob/main/dengro_conversions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
# Install required libraries (if not already installed)
!pip install pandas matplotlib openpyxl xlsxwriter

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
from datetime import datetime

# Upload the file
print("Please upload your CSV file:")
uploaded = files.upload()

# Assuming the first (and only) uploaded file is your leads CSV
file_name = list(uploaded.keys())[0]
leads_data = pd.read_csv(file_name)

# Convert the 'Date added' column to datetime format
leads_data['Date added'] = pd.to_datetime(
    leads_data['Date added'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)

# Normalize the 'Lead Initial treatment' column
leads_data['Lead Initial treatment'] = (
    leads_data['Lead Initial treatment']
    .str.lower()
    .str.strip()
)

# --- Prompt user for date range ---
# If you want to enforce input every time:
start_date_input = ("20/12/2024")
end_date_input   = ("31/01/2025")

# Alternatively, to supply defaults if the user just presses Enter:
# default_start = "01/01/2025"
# default_end   = "15/04/2025"
# start_date_input = input(f"Enter the start date (dd/mm/yyyy) [{default_start}]: ") or default_start
# end_date_input   = input(f"Enter the end date   (dd/mm/yyyy) [{default_end}]: ")   or default_end

# Parse to datetime
start_date = datetime.strptime(start_date_input, '%d/%m/%Y')
end_date   = datetime.strptime(end_date_input,   '%d/%m/%Y')

# Filter by date range
filtered_data = leads_data[
    (leads_data['Date added'] >= start_date) &
    (leads_data['Date added'] <= end_date)
]

# --- Prompt user for treatment options ---
treatment_options_input = input(
    "Enter the treatment options (comma-separated) or leave blank for all: "
).strip()

if treatment_options_input:
    treatment_options = [
        x.strip().lower()
        for x in treatment_options_input.split(',')
    ]
    filtered_data = filtered_data[
        filtered_data['Lead Initial treatment'].isin(treatment_options)
    ]
    treatments_included = ', '.join(treatment_options)
else:
    treatments_included = "All"

# Debugging Check: Total leads after filtering
total_leads_received = len(filtered_data)

# Verify channels
unique_channels = filtered_data['Channel'].dropna().unique()
leads_by_channel = filtered_data.groupby('Channel').size().to_dict()
total_leads_from_channels = sum(leads_by_channel.values())

if total_leads_from_channels != total_leads_received:
    raise ValueError(
        "Channel filtering discrepancy detected! "
        "Total leads from channels does not match total leads received."
    )

# Create funnel‑stage flags
filtered_data['is_consultation_booked']  = filtered_data['Date booked appointment'].notnull()
filtered_data['is_consultation_attended']= filtered_data['Date attended appointment'].notnull()
filtered_data['is_treatment_started']    = filtered_data['Treatment start date'].notnull()

# Count by stage
funnel_stage_counts = {
    "New Leads": len(filtered_data[filtered_data['Journey Stage'] == "New"]),
    "Consultation Booked": filtered_data['is_consultation_booked'].sum(),
    "Consultation Attended": filtered_data['is_consultation_attended'].sum(),
    "Treatment Started": filtered_data['is_treatment_started'].sum(),
}

# Calculate percentages
funnel_stage_percentages = {
    stage: (count / total_leads_received) * 100 if total_leads_received > 0 else 0
    for stage, count in funnel_stage_counts.items()
}

# Build summary table
summary_table = pd.DataFrame({
    "Stage": list(funnel_stage_counts.keys()),
    "Count": list(funnel_stage_counts.values()),
    "Percentage (%)": list(funnel_stage_percentages.values()),
})
summary_table.loc[len(summary_table)] = ["Total Leads", total_leads_received, 100]

# Prepare data for stacked bar chart by channel
stages = ["New Leads", "Consultation Booked", "Consultation Attended", "Treatment Started"]
channel_stage_data = {ch: [] for ch in unique_channels}

for ch in unique_channels:
    ch_df = filtered_data[filtered_data['Channel'] == ch]
    channel_stage_data[ch] = [
        len(ch_df[ch_df['Journey Stage'] == "New"]),
        ch_df['is_consultation_booked'].sum(),
        ch_df['is_consultation_attended'].sum(),
        ch_df['is_treatment_started'].sum(),
    ]

# Plot the stacked bar chart
plt.figure(figsize=(12, 8))
bottom = [0] * len(stages)
for ch, counts in channel_stage_data.items():
    plt.bar(stages, counts, bottom=bottom, label=ch)
    bottom = [b + c for b, c in zip(bottom, counts)]

plt.title("Journey Stage Funnel (Stacked by Channel)", fontsize=14)
plt.xlabel("Journey Stages", fontsize=12)
plt.ylabel("Number of Leads", fontsize=12)
plt.legend(title="Channels")
plt.tight_layout()

# Save chart
chart_filename = "stacked_bar_chart_by_stages_and_channels.png"
plt.savefig(chart_filename, dpi=300)
plt.close()

# Export to Excel
output_filename = "Journey_Stage_Funnel_Monthly_BB.xlsx"
with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
    # Metadata sheet
    metadata_df = pd.DataFrame({
        "Period": [f"{start_date_input} to {end_date_input}"],
        "Treatments Included": [treatments_included],
        "Total Leads Received": [total_leads_received],
        "Leads by Channel (Sum)": [total_leads_from_channels],
    })
    metadata_df.to_excel(writer, sheet_name="Metadata", index=False)

    # Funnel summary sheet
    summary_table.to_excel(writer, sheet_name="Funnel Summary", index=False)

    # Chart sheet
    workbook  = writer.book
    worksheet = workbook.add_worksheet("Chart")
    worksheet.insert_image("B2", chart_filename)

# Trigger download in Colab
files.download(output_filename)


Please upload your CSV file:


Saving dengro-lead-export-0f226590-9ce8-477c-9893-98ac3279c98c.csv to dengro-lead-export-0f226590-9ce8-477c-9893-98ac3279c98c (5).csv
Enter the treatment options (comma-separated) or leave blank for all: 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['is_consultation_booked']  = filtered_data['Date booked appointment'].notnull()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['is_consultation_attended']= filtered_data['Date attended appointment'].notnull()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>