In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import joblib
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the task dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="Data_Analytics_Task.csv")


In [None]:
# Preview the first 5 rows, then summarise column dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
df.info()

In [None]:
# Parse both timestamp columns to datetime.
# slot_start_time is parsed strictly (unparseable input raises), while
# payment_time uses errors='coerce' so unparseable values become NaT.
df['slot_start_time'] = df['slot_start_time'].pipe(pd.to_datetime)
df['payment_time'] = df['payment_time'].pipe(pd.to_datetime, errors='coerce')

In [None]:
# New column: 1 when a payment timestamp is present, 0 otherwise.
df['converted'] = df['payment_time'].notna().astype(int)

In [None]:
def conversion_within_days(x, days):
    """Flag rows whose payment happened within ``days`` days after the slot start.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain datetime columns ``payment_time`` and ``slot_start_time``
        and a ``converted`` column holding 1 for converted rows.
    days : int or float
        Inclusive upper bound on the elapsed whole days.

    Returns
    -------
    pandas.Series of bool
        True where the row is converted and 0 <= elapsed days <= ``days``.
        Rows with a missing timestamp are False (comparisons with NaN are False).
    """
    # .dt.days is the whole-day component of the timedelta, so e.g. an elapsed
    # time of 3 days 23 hours still counts as "within 3 days".
    elapsed_days = (x['payment_time'] - x['slot_start_time']).dt.days
    # A payment timestamped before the slot has a negative elapsed time; it must
    # not be counted as a conversion of that slot (same rule as the delta_days
    # >= 0 guard used for the funnel/lead-type table).
    paid_after_slot = elapsed_days >= 0
    return paid_after_slot & (elapsed_days <= days) & (x['converted'] == 1)

# Materialise the 3-day and 7-day conversion flags as columns on df.
for window_days in (3, 7):
    df[f'conversion_{window_days}d'] = conversion_within_days(df, window_days)


In [None]:
# Build the funnel x lead_type conversion table on a copy, leaving df untouched.
df2 = df.copy()

# Normalise headers: trim whitespace, lower-case, spaces -> underscores.
df2.columns = (
    df2.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

# Resolve a 'lead_type' column: keep it if present, otherwise prefer
# 'india_vs_nri', otherwise fall back to the first available candidate.
if 'lead_type' not in df2.columns and 'india_vs_nri' in df2.columns:
    df2 = df2.rename(columns={'india_vs_nri': 'lead_type'})
elif 'lead_type' not in df2.columns:
    for cand in ['medicalconditionflag', 'target_class', 'booked_flag', 'user_id']:
        if cand in df2.columns:
            fallback = cand
            break
    else:
        fallback = None
    if not fallback:
        raise KeyError("No suitable column found to use as 'lead_type'. Please tell me which column should be used.")
    df2['lead_type'] = df2[fallback].astype(str)

# Fail early, listing what is available, if a required column is absent.
required = ['slot_start_time', 'payment_time', 'funnel']
missing = [col for col in required if col not in df2.columns]
if missing:
    raise KeyError(f"Missing required column(s): {missing}. Available columns: {df2.columns.tolist()}")

# Coerce both timestamp columns to datetime; unparseable values become NaT.
for time_col in ('slot_start_time', 'payment_time'):
    df2[time_col] = pd.to_datetime(df2[time_col], errors='coerce')

# Report how many timestamps are missing after coercion.
print("Missing slot_start_time:", df2['slot_start_time'].isna().sum())
print("Missing payment_time   :", df2['payment_time'].isna().sum())

# Whole days elapsed between slot start and payment (NaN when either is missing).
df2['delta_days'] = df2['payment_time'].sub(df2['slot_start_time']).dt.days

# 1 when the payment falls 0..N whole days after the slot (bounds inclusive), else 0.
# between() is False for NaN, so rows without a payment get 0.
df2['conversion_3d'] = np.where(df2['delta_days'].between(0, 3), 1, 0)
df2['conversion_7d'] = np.where(df2['delta_days'].between(0, 7), 1, 0)

# Mean of the 0/1 flags per (funnel, lead_type) group, i.e. the conversion rate.
conversion_stats = (
    df2.groupby(['funnel', 'lead_type'], as_index=False)[['conversion_3d', 'conversion_7d']]
    .mean()
)

print(conversion_stats)




In [None]:
# Extract the hour-of-day component of each slot start time into a new 'hour' column.
df['hour'] = df['slot_start_time'].dt.hour


In [None]:
# Conversion rate per slot hour: the mean of the 0/1 'converted' flag in each hour.
hourly_sales = df.groupby('hour', as_index=False)['converted'].mean()

In [None]:
# Line chart of conversion rate against slot hour, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=hourly_sales, x='hour', y='converted', marker='o', ax=ax)
ax.set_title("Best Hours for Conversion")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Conversion Rate")
plt.show()

In [None]:
# Conversion rate per target_class: the mean of the 0/1 'converted' flag in each class.
coach_performance = df.groupby('target_class', as_index=False)['converted'].mean()

In [None]:
# Bar chart of conversion rate per target_class, bars fixed in A-D order.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    data=coach_performance,
    x='target_class',
    y='converted',
    order=['A', 'B', 'C', 'D'],
    ax=ax,
)
ax.set_title("Coach Class vs Conversion Rate")
plt.show()

In [None]:
# Display the column labels of df.
df.columns

In [None]:
# Conversion rate per value of the 'India vs NRI' column, with the mean
# column renamed to 'conversion_rate'.
country_insights = (
    df.groupby('India vs NRI', as_index=False)['converted']
    .mean()
    .rename(columns={'converted': 'conversion_rate'})
)

print("Conversion by Country Type:\n", country_insights)

In [None]:
# Conversion rate per funnel: the mean (not the sum) of the 0/1 'converted' flag.
funnel_insights = df.groupby('funnel', as_index=False)['converted'].mean()
print("Conversion by Funnel:\n", funnel_insights)


In [None]:
# Print the recommendation list; each tuple holds the arguments of one print() call.
# Item 2 appends the three hours with the highest conversion rate.
for message_parts in (
    ("🔹 Recommendations:",),
    ("1. Focus on Funnel + Lead Types with higher 7-day conversion.",),
    (
        "2. Run campaigns during peak conversion hours:",
        hourly_sales.sort_values('converted', ascending=False).head(3),
    ),
    ("3. Assign high-value leads to 'A' & 'B' coaches for max sales.",),
    ("4. Consider special strategy for NRI vs India leads depending on performance.",),
):
    print(*message_parts)

In [None]:
# Train a random-forest conversion classifier and persist it to model.pkl.
# Features: every numeric column of df except the target ("converted") and "hour";
# errors="ignore" keeps the drop from failing if either column is absent.
X = df.select_dtypes(include="number").drop(columns=["converted", "hour"], errors="ignore")
y = df["converted"]

# Fix the seed: the forest bootstraps rows and samples features at random, so
# without random_state every run would fit (and save) a different model.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

joblib.dump(model, "model.pkl")
print(" Model saved as model.pkl")