In [4]:
import pandas as pd

# Read the lead/conversion CSV export into a DataFrame.
source_csv = "customer_conversion_testing_dataset[1].csv"
df = pd.read_csv(source_csv)

# Report (rows, columns), then preview the first five records as the cell output.
print(df.shape)
df.head()


(26145, 19)


Unnamed: 0,LeadID,Age,Gender,Location,LeadSource,TimeSpent (minutes),PagesViewed,LeadStatus,EmailSent,DeviceType,ReferralSource,FormSubmissions,Downloads,CTR_ProductPage,ResponseTime (hours),FollowUpEmails,SocialMediaEngagement,PaymentHistory,Conversion (Target)
0,1,60,Female,Lahore,Organic,46,6,Hot,10,Mobile,Facebook,2,3,0.8,11,3,54,Good,0
1,2,27,Male,Lahore,Email,42,8,Warm,6,Tablet,Direct,1,1,0.4,23,2,35,No Payment,0
2,3,21,Male,Sialkot,Email,56,11,Cold,1,Tablet,Direct,5,3,0.5,8,0,34,Good,0
3,4,37,Female,Quetta,Organic,24,15,Cold,6,Tablet,Direct,1,1,0.7,10,3,59,Good,0
4,5,35,Female,Quetta,Email,39,14,Hot,10,Desktop,Direct,1,3,0.5,13,5,35,Good,0


In [5]:
# Normalise the headers: trim surrounding whitespace, lowercase, and turn
# spaces into underscores (e.g. "TimeSpent (minutes)" -> "timespent_(minutes)").
normalized_headers = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.columns = normalized_headers


In [8]:
# Per-column count of missing values. This is printed explicitly because a
# notebook only auto-displays the *last* expression of a cell; as a bare
# statement in the middle of the cell the result was silently discarded.
print(df.isnull().sum())

# Example strategy
# 1) Remove fully duplicated rows.
df = df.drop_duplicates()

# 2) Impute the two demographic columns: the median for numeric age,
#    an explicit "Unknown" category for gender.
df['age'] = df['age'].fillna(df['age'].median())
df['gender'] = df['gender'].fillna("Unknown")


In [9]:
# Columns that are cast to plain integers, in the same order as before.
integer_columns = ('conversion_(target)', 'emailsent', 'formsubmissions', 'downloads')
for column_name in integer_columns:
    df[column_name] = df[column_name].astype(int)


In [10]:
# Weighted blend of three activity signals: 40% time on site,
# 30% pages viewed, 30% social media engagement.
time_component = df['timespent_(minutes)'] * 0.4
pages_component = df['pagesviewed'] * 0.3
social_component = df['socialmediaengagement'] * 0.3
df['engagement_score'] = time_component + pages_component + social_component


In [11]:
# A lead is flagged high-intent (1) when it has at least one form
# submission or at least one download; otherwise the flag is 0.
submitted_form = df['formsubmissions'].gt(0)
downloaded_asset = df['downloads'].gt(0)
df['high_intent_flag'] = (submitted_form | downloaded_asset).astype(int)


In [12]:
# Age bands: [18, 25], (25, 35], (35, 50], (50, 100].
bins = [18, 25, 35, 50, 100]
labels = ['18-25', '26-35', '36-50', '50+']
# include_lowest=True closes the first interval on the left so that an age of
# exactly 18 is labelled '18-25'. With pd.cut's defaults the lowest edge is
# open, and age == 18 would silently become NaN.
# Ages below 18 or above 100 still fall outside every bin and map to NaN.
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, include_lowest=True)


In [13]:
# Customer table: the lead identifier plus demographic attributes.
customer_columns = ['leadid', 'age', 'gender', 'location', 'age_group']
customers = df[customer_columns]


In [14]:
# Interaction table: the lead identifier, the three raw activity measures
# and the engagement score derived from them.
interaction_columns = [
    'leadid',
    'timespent_(minutes)',
    'pagesviewed',
    'socialmediaengagement',
    'engagement_score',
]
interactions = df[interaction_columns]


In [15]:
# Conversion table: the lead identifier, the derived intent flag and the target label.
conversion_columns = ['leadid', 'high_intent_flag', 'conversion_(target)']
conversion = df[conversion_columns]


In [17]:
import os

# Make sure the three working folders exist; exist_ok=True makes this a
# no-op for any folder that is already present.
for folder in ("raw", "processed", "output"):
    os.makedirs(folder, exist_ok=True)


In [18]:
# Write each derived table to its CSV file, without the DataFrame index.
csv_targets = {
    "processed/customers.csv": customers,
    "processed/interactions.csv": interactions,
    "processed/conversion.csv": conversion,
}
for csv_path, table in csv_targets.items():
    table.to_csv(csv_path, index=False)


In [19]:
import sqlite3

# Persist the three tables to a SQLite file. if_exists="replace" overwrites
# any table of the same name that is already in the database.
conn = sqlite3.connect("output/database.sqlite")
try:
    customers.to_sql("customers", conn, if_exists="replace", index=False)
    interactions.to_sql("interactions", conn, if_exists="replace", index=False)
    conversion.to_sql("conversion", conn, if_exists="replace", index=False)
finally:
    # Close the connection even if one of the writes raises; otherwise it
    # would stay open for the rest of the kernel session.
    conn.close()


In [20]:
# Sanity check: each derived table should have as many rows as df.
# Note that df here is the frame after drop_duplicates, not the raw load.
row_count_report = (
    ("Original rows:", df),
    ("Customer rows:", customers),
    ("Interaction rows:", interactions),
    ("Conversion rows:", conversion),
)
for caption, frame in row_count_report:
    print(caption, len(frame))


Original rows: 26145
Customer rows: 26145
Interaction rows: 26145
Conversion rows: 26145


In [21]:
import os
# Show the working directory that this notebook's relative paths resolve against.
working_dir = os.getcwd()
print(working_dir)


C:\Users\DELL
