In [3]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta


In [4]:
# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)

# Number of rows/sessions to generate
n_rows = 30000

# Pre-generate a pool of unique user IDs (simulate that some visitors have multiple sessions)
n_unique_users = 15000
user_ids = [f"U{str(i).zfill(5)}" for i in range(1, n_unique_users + 1)]

# Pre-define locales for names: US (American), IN (Indian), FR (French), JA (Japanese), ZH (Chinese)
locales = ['en_US', 'en_IN', 'fr_FR', 'ja_JP', 'zh_CN']
faker_objs = {loc: Faker(loc) for loc in locales}

# Email providers list
email_providers = ["gmail.com", "hotmail.com", "outlook.com", "yahoo.com", "protonmail.com"]

# Pre-define some city-country pairs from around the world
locations = [
    ("New York", "USA"),
    ("Los Angeles", "USA"),
    ("Chicago", "USA"),
    ("London", "UK"),
    ("Manchester", "UK"),
    ("Paris", "France"),
    ("Lyon", "France"),
    ("Mumbai", "India"),
    ("Delhi", "India"),
    ("Bangalore", "India"),
    ("Tokyo", "Japan"),
    ("Osaka", "Japan"),
    ("Sydney", "Australia"),
    ("Melbourne", "Australia"),
    ("Toronto", "Canada"),
    ("Vancouver", "Canada"),
    ("Berlin", "Germany"),
    ("Munich", "Germany"),
    ("Singapore", "Singapore"),
    ("Dubai", "UAE"),
    ("Hong Kong", "Hong Kong"),
    ("Rome", "Italy"),
    ("Madrid", "Spain"),
    ("Moscow", "Russia"),
    ("Beijing", "China"),
    ("Shanghai", "China")
]

# Device types and their probabilities
device_types = ["Mobile", "Desktop", "Tablet"]
device_probs = [0.6, 0.3, 0.1]

# Variant groups (landing page designs)
variant_groups = ["Vibrant", "Cold", "Heat"]

# Conversion probabilities by variant group
conversion_probs = {"Vibrant": 0.12, "Cold": 0.18, "Heat": 0.15}

# Sign in types with probabilities
sign_in_types = ["Email", "Guest"]
sign_in_probs = [0.7, 0.3]

# Demographic gender choices with probabilities
genders = ["Male", "Female", "NotAnswered"]
gender_probs = [0.45, 0.45, 0.10]

# Traffic sources and probabilities
traffic_sources = ["Organic", "Paid", "Social", "Referral"]
traffic_probs = [0.5, 0.2, 0.2, 0.1]

# Payment types and card types (only applicable if a Purchase is made)
payment_types = ["Card", "COD"]
payment_probs = [0.7, 0.3]
card_types = ["Amex", "Visa", "Master"]

# Product list for speakers – each is a dict with product name and price
products = [
    {"product": "JBL Flip 6", "price": 129.95},
    {"product": "JBL Charge 5", "price": 179.95},
    {"product": "JBL Go 3", "price": 49.95},
    {"product": "JBL Clip 4", "price": 79.95},
    {"product": "JBL Xtreme 3", "price": 379.95},
    {"product": "JBL Boombox 3", "price": 499.95},
    {"product": "JBL Pulse 5", "price": 249.95},
    {"product": "Sony SRS-XB23", "price": 99.99},
    {"product": "Sony SRS-XB33", "price": 149.99},
    {"product": "Sony SRS-XB43", "price": 249.99},
    {"product": "Sony SRS-XB13", "price": 59.99},
    {"product": "Sony SRS-XE200", "price": 129.99},
    {"product": "Sony SRS-XE300", "price": 199.99},
    {"product": "Sony SRS-XG300", "price": 249.99},
    {"product": "Bose SoundLink Flex", "price": 149.00},
    {"product": "Bose SoundLink Revolve+ II", "price": 329.00},
    {"product": "Bose SoundLink Micro", "price": 119.00},
    {"product": "Bose SoundLink Revolve II", "price": 199.00},
    {"product": "Ultimate Ears Boom 3", "price": 149.99},
    {"product": "Ultimate Ears Megaboom 3", "price": 199.99},
    {"product": "Ultimate Ears Wonderboom 3", "price": 99.99},
    {"product": "Ultimate Ears Hyperboom", "price": 449.99},
    {"product": "Ultimate Ears Boom 2", "price": 129.99},
    {"product": "Ultimate Ears Roll 2", "price": 99.99},
    {"product": "Anker Soundcore Motion+", "price": 99.99},
    {"product": "Anker Soundcore Motion Boom", "price": 99.99},
    {"product": "Anker Soundcore 2", "price": 39.99},
    {"product": "Anker Soundcore 3", "price": 49.99},
    {"product": "Anker Soundcore Flare 2", "price": 79.99},
    {"product": "Anker Soundcore Rave Neo", "price": 119.99},
    {"product": "Anker Soundcore Motion Boom Plus", "price": 149.99},
    {"product": "Tribit StormBox Micro 2", "price": 59.99},
    {"product": "Tribit StormBox Pro", "price": 99.99},
    {"product": "Tribit XSound Go", "price": 35.99},
    {"product": "Tribit MaxSound Plus", "price": 59.99},
    {"product": "Tribit StormBox", "price": 79.99},
    {"product": "Tribit XSound Mega", "price": 69.99},
    {"product": "Tribit StormBox Blast", "price": 159.99},
    {"product": "Marshall Emberton", "price": 149.99},
    {"product": "Marshall Stockwell II", "price": 299.99},
    {"product": "Marshall Kilburn II", "price": 299.99},
    {"product": "Marshall Tufton", "price": 449.99},
    {"product": "Marshall Middleton", "price": 299.99},
    {"product": "Marshall Willen", "price": 119.99},
    {"product": "Marshall Emberton II", "price": 169.99},
    {"product": "Bang & Olufsen Beosound A1 (2nd Gen)", "price": 250.00},
    {"product": "Bang & Olufsen Beosound A5", "price": 1099.00},
    {"product": "Bang & Olufsen Beosound Explore", "price": 199.00},
    {"product": "Bang & Olufsen Beosound A9", "price": 2499.00},
    {"product": "Bang & Olufsen Beosound Emerge", "price": 799.00},
    {"product": "Harman Kardon Onyx Studio 7", "price": 299.95},
    {"product": "Harman Kardon Onyx Studio 8", "price": 349.95},
    {"product": "Harman Kardon Go + Play", "price": 499.95},
    {"product": "Harman Kardon Aura Studio 3", "price": 349.95},
    {"product": "Harman Kardon Esquire Mini 2", "price": 149.95},
    {"product": "Sennheiser Momentum Wireless", "price": 499.95},
    {"product": "Sennheiser CX True Wireless", "price": 99.95},
    {"product": "Sennheiser AMBEO Smart Speaker", "price": 1999.95},
    {"product": "Sennheiser HD 250BT", "price": 199.95},
    {"product": "Sennheiser RS 175", "price": 299.95},
    {"product": "LG XBOOM Go PL7", "price": 199.99},
    {"product": "LG XBOOM Go PL5", "price": 149.99},
    {"product": "LG XBOOM Go PK7", "price": 299.99},
    {"product": "LG XBOOM Go PK5", "price": 199.99},
    {"product": "LG XBOOM Go XG7", "price": 399.99},
    {"product": "LG XBOOM Go XG5", "price": 299.99},
    {"product": "LG XBOOM Go XG3", "price": 199.99},
    {"product": "LG XBOOM Go XG2", "price": 149.99},
    {"product": "LG XBOOM Go XG1", "price": 99.99},
    {"product": "LG XBOOM Go XG0", "price": 79.99},
    {"product": "Philips S7505", "price": 50.00},
    {"product": "Philips S75015", "price": 50.00},
    {"product": "Philips S75029", "price": 200.00}
]

# Define the start date for sessions (from January 1, 2025) up to now
start_date = datetime(2025, 1, 1)
end_date = datetime.now()

def random_timestamp(start, end):
    """Return a random datetime between start and end."""
    return Faker().date_time_between(start_date=start, end_date=end)

# Prepare list to collect all rows
data = []

for i in range(n_rows):
    # Session and user details
    session_id = f"S{str(i+1).zfill(6)}"
    user_id = random.choice(user_ids)
    sign_in = random.choices(sign_in_types, weights=sign_in_probs, k=1)[0]
    
    # Choose a random locale for the name and generate name
    loc = random.choice(locales)
    fake = faker_objs[loc]
    name = fake.name()
    
    # Generate email based on name (if sign_in is Email) else leave blank
    name_str = ''.join(e for e in name if e.isalnum()).lower()
    if sign_in == "Email":
        email = f"{name_str}{random.randint(1,999)}@{random.choice(email_providers)}"
    else:
        email = ""  # Guest users have no registered email
    
    # Demographics
    demographic_age = random.randint(14, 80)
    demographic_gender = random.choices(genders, weights=gender_probs, k=1)[0]
    
    # Determine age group based on demographic_age
    if demographic_age < 20:
        age_group = "Teenage"
    elif demographic_age < 65:
        age_group = "Adult"
    else:
        age_group = "Old"
    
    # Location and Country (choose from predefined pairs)
    loc_choice = random.choice(locations)
    city, country = loc_choice[0], loc_choice[1]
    
    # Device and timestamp
    device_type = random.choices(device_types, weights=device_probs, k=1)[0]
    timestamp = random_timestamp(start_date, end_date)
    
    # Variant group (landing page design)
    variant_group = random.choice(variant_groups)
    
    # Engagement metrics
    time_spent = round(random.uniform(1, 20), 2)  # minutes
    pages_visited = random.randint(1, 10)
    
    # Determine conversion_flag based on variant conversion probability
    conv_prob = conversion_probs[variant_group]
    conversion_flag = np.random.binomial(1, conv_prob)
    
    # Determine conversion_type (if conversion occurred)
    conversion_type = ""
    if conversion_flag:
        if sign_in == "Email":
            # Registered users can either Signup or Purchase
            conversion_type = random.choice(["Signup", "Purchase"])
        else:
            # Guest users can only Purchase
            conversion_type = "Purchase"
    
    # Traffic source
    traffic_source = random.choices(traffic_sources, weights=traffic_probs, k=1)[0]
    
    # Initialize Purchase-related fields
    product_Purchased = ""
    revenue = 0.0
    payment_type = ""
    card_type_val = ""
    coupon_applied = ""
    
    if conversion_flag and conversion_type == "Purchase":
        # Randomly pick a product; now only include the product name (without the price)
        product_choice = random.choice(products)
        product_Purchased = product_choice['product']
        revenue = product_choice['price']
        # Payment details
        payment_type = random.choices(payment_types, weights=payment_probs, k=1)[0]
        if payment_type == "Card":
            card_type_val = random.choice(card_types)
        # Coupon applied: yes with 30% chance
        coupon_applied = random.choices(["Yes", "No"], weights=[0.3, 0.7], k=1)[0]
    
    # Bounce flag: if only one page visited then bounce else 10% chance even if >1 page
    bounce_flag = 1 if pages_visited == 1 else (np.random.binomial(1, 0.1))
    
    # Append the row dictionary
    data.append({
        "user_id": user_id,
        "session_id": session_id,
        "sign_in": sign_in,
        "name": name,
        "demographic_age": demographic_age,
        "demographic_age_group": age_group,
        "demographic_gender": demographic_gender,
        "email": email,
        "Location": city,
        "Country": country,
        "device_type": device_type,
        "timestamp": timestamp,
        "variant_group": variant_group,
        "time_spent": time_spent,
        "pages_visited": pages_visited,
        "conversion_flag": conversion_flag,
        "conversion_type": conversion_type,
        "traffic_source": traffic_source,
        "product_Purchased": product_Purchased,
        "revenue": revenue,
        "payment_type": payment_type,
        "card_type": card_type_val,
        "Coupon_applied": coupon_applied,
        "bounce_flag": bounce_flag
    })

# Create DataFrame
df = pd.DataFrame(data)

# # Save to CSV and Excel
# df.to_csv("synthetic_website_conversion_data.csv", index=False)
# df.to_excel("synthetic_website_conversion_data.xlsx", index=False)

# print("Dataset with 30,000 rows generated and saved as CSV and Excel files.")

In [5]:
df


Unnamed: 0,user_id,session_id,sign_in,name,demographic_age,demographic_age_group,demographic_gender,email,Location,Country,...,pages_visited,conversion_flag,conversion_type,traffic_source,product_Purchased,revenue,payment_type,card_type,Coupon_applied,bounce_flag
0,U10477,S000001,Email,David Lévêque,31,Adult,Female,davidlévêque251@hotmail.com,Rome,Italy,...,7,0,,Organic,,0.00,,,,1
1,U01536,S000002,Email,苏鹏,39,Adult,Female,苏鹏617@gmail.com,Madrid,Spain,...,5,0,,Social,,0.00,,,,0
2,U00107,S000003,Guest,Kala Chaudhry,68,Old,Male,,Manchester,UK,...,7,0,,Organic,,0.00,,,,0
3,U13886,S000004,Email,Émile Delaunay-Antoine,72,Old,Female,émiledelaunayantoine827@gmail.com,Sydney,Australia,...,10,0,,Social,,0.00,,,,0
4,U05926,S000005,Email,Heather Skinner,51,Adult,NotAnswered,heatherskinner47@hotmail.com,Mumbai,India,...,6,0,,Organic,,0.00,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,U08032,S029996,Email,Xavier Mani,15,Teenage,Male,xaviermani401@outlook.com,Sydney,Australia,...,1,1,Signup,Social,,0.00,,,,1
29996,U10440,S029997,Email,Henri-Éric Georges,51,Adult,Female,henriéricgeorges443@protonmail.com,Hong Kong,Hong Kong,...,4,0,,Social,,0.00,,,,0
29997,U07125,S029998,Email,Jesse Merritt,15,Teenage,Female,jessemerritt655@gmail.com,Tokyo,Japan,...,2,0,,Social,,0.00,,,,0
29998,U03820,S029999,Email,齐兰英,73,Old,Female,齐兰英771@gmail.com,Singapore,Singapore,...,8,1,Purchase,Paid,Sony SRS-XB43,249.99,COD,,No,0


In [6]:
df.head(10)

Unnamed: 0,user_id,session_id,sign_in,name,demographic_age,demographic_age_group,demographic_gender,email,Location,Country,...,pages_visited,conversion_flag,conversion_type,traffic_source,product_Purchased,revenue,payment_type,card_type,Coupon_applied,bounce_flag
0,U10477,S000001,Email,David Lévêque,31,Adult,Female,davidlévêque251@hotmail.com,Rome,Italy,...,7,0,,Organic,,0.0,,,,1
1,U01536,S000002,Email,苏鹏,39,Adult,Female,苏鹏617@gmail.com,Madrid,Spain,...,5,0,,Social,,0.0,,,,0
2,U00107,S000003,Guest,Kala Chaudhry,68,Old,Male,,Manchester,UK,...,7,0,,Organic,,0.0,,,,0
3,U13886,S000004,Email,Émile Delaunay-Antoine,72,Old,Female,émiledelaunayantoine827@gmail.com,Sydney,Australia,...,10,0,,Social,,0.0,,,,0
4,U05926,S000005,Email,Heather Skinner,51,Adult,NotAnswered,heatherskinner47@hotmail.com,Mumbai,India,...,6,0,,Organic,,0.0,,,,0
5,U05821,S000006,Email,Michelle Tanguy,35,Adult,Female,michelletanguy719@gmail.com,Mumbai,India,...,9,0,,Organic,,0.0,,,,1
6,U05314,S000007,Guest,Robert Bernard,43,Adult,Female,,Shanghai,China,...,10,1,Purchase,Social,Marshall Kilburn II,299.99,Card,Visa,No,0
7,U14984,S000008,Email,Upasna Yohannan,45,Adult,Female,upasnayohannan272@hotmail.com,Munich,Germany,...,10,0,,Organic,,0.0,,,,0
8,U03594,S000009,Guest,Reva Cherian,79,Old,Female,,Beijing,China,...,3,0,,Social,,0.0,,,,0
9,U06917,S000010,Email,藤原 翼,73,Old,Female,藤原翼391@protonmail.com,Munich,Germany,...,2,0,,Paid,,0.0,,,,0


In [7]:
df.columns


Index(['user_id', 'session_id', 'sign_in', 'name', 'demographic_age',
       'demographic_age_group', 'demographic_gender', 'email', 'Location',
       'Country', 'device_type', 'timestamp', 'variant_group', 'time_spent',
       'pages_visited', 'conversion_flag', 'conversion_type', 'traffic_source',
       'product_Purchased', 'revenue', 'payment_type', 'card_type',
       'Coupon_applied', 'bounce_flag'],
      dtype='object')

In [8]:
df.shape


(30000, 24)

In [9]:
df.dtypes

user_id                          object
session_id                       object
sign_in                          object
name                             object
demographic_age                   int64
demographic_age_group            object
demographic_gender               object
email                            object
Location                         object
Country                          object
device_type                      object
timestamp                datetime64[ns]
variant_group                    object
time_spent                      float64
pages_visited                     int64
conversion_flag                   int64
conversion_type                  object
traffic_source                   object
product_Purchased                object
revenue                         float64
payment_type                     object
card_type                        object
Coupon_applied                   object
bounce_flag                       int64
dtype: object

In [10]:
df.isnull().sum()

user_id                  0
session_id               0
sign_in                  0
name                     0
demographic_age          0
demographic_age_group    0
demographic_gender       0
email                    0
Location                 0
Country                  0
device_type              0
timestamp                0
variant_group            0
time_spent               0
pages_visited            0
conversion_flag          0
conversion_type          0
traffic_source           0
product_Purchased        0
revenue                  0
payment_type             0
card_type                0
Coupon_applied           0
bounce_flag              0
dtype: int64

In [11]:
df.duplicated().sum()

0

In [12]:
print(df["email"].unique())

['davidlévêque251@hotmail.com' '苏鹏617@gmail.com' '' ...
 'jessemerritt655@gmail.com' '齐兰英771@gmail.com' '梅冬梅735@yahoo.com']


In [51]:
df["email"] = df["email"].replace(["", "NULL", "-", "N/A"], pd.NA)


In [53]:
print(df["email"].unique())

['davidlévêque251@hotmail.com' '苏鹏617@gmail.com' <NA> ...
 'jessemerritt655@gmail.com' '齐兰英771@gmail.com' '梅冬梅735@yahoo.com']


In [55]:
df.isnull().sum()

user_id                     0
session_id                  0
sign_in                     0
name                        0
demographic_age             0
demographic_age_group       0
demographic_gender          0
email                    9025
Location                    0
Country                     0
device_type                 0
timestamp                   0
variant_group               0
time_spent                  0
pages_visited               0
conversion_flag             0
conversion_type             0
traffic_source              0
product_Purchased           0
revenue                     0
payment_type                0
card_type                   0
Coupon_applied              0
bounce_flag                 0
dtype: int64

In [57]:
df["conversion_type"] = df["conversion_type"].replace(["", "NULL", "-", "N/A"], pd.NA)

In [59]:
df.isnull().sum()

user_id                      0
session_id                   0
sign_in                      0
name                         0
demographic_age              0
demographic_age_group        0
demographic_gender           0
email                     9025
Location                     0
Country                      0
device_type                  0
timestamp                    0
variant_group                0
time_spent                   0
pages_visited                0
conversion_flag              0
conversion_type          25465
traffic_source               0
product_Purchased            0
revenue                      0
payment_type                 0
card_type                    0
Coupon_applied               0
bounce_flag                  0
dtype: int64

In [61]:
df["product_Purchased"] = df["product_Purchased"].replace(["", "NULL", "-", "N/A"], pd.NA)

In [63]:
df.isnull().sum()

user_id                      0
session_id                   0
sign_in                      0
name                         0
demographic_age              0
demographic_age_group        0
demographic_gender           0
email                     9025
Location                     0
Country                      0
device_type                  0
timestamp                    0
variant_group                0
time_spent                   0
pages_visited                0
conversion_flag              0
conversion_type          25465
traffic_source               0
product_Purchased        27008
revenue                      0
payment_type                 0
card_type                    0
Coupon_applied               0
bounce_flag                  0
dtype: int64

In [75]:
df["card_type"] = df["card_type"].replace(["", "NULL", "-", "N/A"], pd.NA)
df["payment_type"] = df["payment_type"].replace(["", "NULL", "-", "N/A"], pd.NA)
df["Coupon_applied"] = df["Coupon_applied"].replace(["", "NULL", "-", "N/A"], pd.NA)

In [77]:
df.isnull().sum()

user_id                      0
session_id                   0
sign_in                      0
name                         0
demographic_age              0
demographic_age_group        0
demographic_gender           0
email                     9025
Location                     0
Country                      0
device_type                  0
timestamp                    0
variant_group                0
time_spent                   0
pages_visited                0
conversion_flag              0
conversion_type          25465
traffic_source               0
product_Purchased        27008
revenue                      0
payment_type             27008
card_type                27905
Coupon_applied           27008
bounce_flag                  0
card_type                27905
dtype: int64

In [81]:
del df['card_type ']

In [83]:
df.isnull().sum()

user_id                      0
session_id                   0
sign_in                      0
name                         0
demographic_age              0
demographic_age_group        0
demographic_gender           0
email                     9025
Location                     0
Country                      0
device_type                  0
timestamp                    0
variant_group                0
time_spent                   0
pages_visited                0
conversion_flag              0
conversion_type          25465
traffic_source               0
product_Purchased        27008
revenue                      0
payment_type             27008
card_type                27905
Coupon_applied           27008
bounce_flag                  0
dtype: int64

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   user_id                30000 non-null  object        
 1   session_id             30000 non-null  object        
 2   sign_in                30000 non-null  object        
 3   name                   30000 non-null  object        
 4   demographic_age        30000 non-null  int64         
 5   demographic_age_group  30000 non-null  object        
 6   demographic_gender     30000 non-null  object        
 7   email                  20975 non-null  object        
 8   Location               30000 non-null  object        
 9   Country                30000 non-null  object        
 10  device_type            30000 non-null  object        
 11  timestamp              30000 non-null  datetime64[ns]
 12  variant_group          30000 non-null  object        
 13  t

In [87]:
df.head(10)

Unnamed: 0,user_id,session_id,sign_in,name,demographic_age,demographic_age_group,demographic_gender,email,Location,Country,...,pages_visited,conversion_flag,conversion_type,traffic_source,product_Purchased,revenue,payment_type,card_type,Coupon_applied,bounce_flag
0,U10477,S000001,Email,David Lévêque,31,Adult,Female,davidlévêque251@hotmail.com,Rome,Italy,...,7,0,,Organic,,0.0,,,,1
1,U01536,S000002,Email,苏鹏,39,Adult,Female,苏鹏617@gmail.com,Madrid,Spain,...,5,0,,Social,,0.0,,,,0
2,U00107,S000003,Guest,Kala Chaudhry,68,Old,Male,,Manchester,UK,...,7,0,,Organic,,0.0,,,,0
3,U13886,S000004,Email,Émile Delaunay-Antoine,72,Old,Female,émiledelaunayantoine827@gmail.com,Sydney,Australia,...,10,0,,Social,,0.0,,,,0
4,U05926,S000005,Email,Heather Skinner,51,Adult,NotAnswered,heatherskinner47@hotmail.com,Mumbai,India,...,6,0,,Organic,,0.0,,,,0
5,U05821,S000006,Email,Michelle Tanguy,35,Adult,Female,michelletanguy719@gmail.com,Mumbai,India,...,9,0,,Organic,,0.0,,,,1
6,U05314,S000007,Guest,Robert Bernard,43,Adult,Female,,Shanghai,China,...,10,1,Purchase,Social,Marshall Kilburn II,299.99,Card,Visa,No,0
7,U14984,S000008,Email,Upasna Yohannan,45,Adult,Female,upasnayohannan272@hotmail.com,Munich,Germany,...,10,0,,Organic,,0.0,,,,0
8,U03594,S000009,Guest,Reva Cherian,79,Old,Female,,Beijing,China,...,3,0,,Social,,0.0,,,,0
9,U06917,S000010,Email,藤原 翼,73,Old,Female,藤原翼391@protonmail.com,Munich,Germany,...,2,0,,Paid,,0.0,,,,0
