In [1]:
import sys
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append("..")
from src.data_loader import load_data
# Hand the project-local loader the paths to the email, opened, and clicked CSVs.
df = load_data(
    "../Data/email_table.csv",
    "../Data/email_opened_table.csv",
    "../Data/link_clicked_table.csv",
)
# Preview the first rows as this cell's displayed result.
df.head()


Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,opened,clicked
0,85120,short_email,personalized,2,Sunday,US,5,0,0
1,966622,long_email,personalized,12,Sunday,UK,2,1,1
2,777221,long_email,personalized,11,Wednesday,US,2,0,0
3,493711,short_email,generic,6,Monday,UK,1,0,0
4,106887,long_email,generic,14,Monday,US,6,0,0


In [2]:
# --- Schema overview ---
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None" line.
print("Email Table Info:")
df.info()

# --- Completeness and key uniqueness ---
print("\nMissing Values in Email Table:")
print(df.isnull().sum())
print("\nDuplicates in Email Table:")
# Counts repeated email_id values (each repeat after the first occurrence).
print(df['email_id'].duplicated().sum())

# --- Value ranges of the numeric columns ---
print("\nHour Range:", df['hour'].min(), "to", df['hour'].max())
print("User Past Purchases Range:", df['user_past_purchases'].min(), "to", df['user_past_purchases'].max())

# --- Distinct levels of each categorical column ---
print("Unique Values in Categorical Columns:")
print("Email Text:", df['email_text'].unique())
print("Email Version:", df['email_version'].unique())
print("Weekday:", df['weekday'].unique())
print("User Country:", df['user_country'].unique())

Email Table Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   email_id             100000 non-null  int64 
 1   email_text           100000 non-null  object
 2   email_version        100000 non-null  object
 3   hour                 100000 non-null  int64 
 4   weekday              100000 non-null  object
 5   user_country         100000 non-null  object
 6   user_past_purchases  100000 non-null  int64 
 7   opened               100000 non-null  int64 
 8   clicked              100000 non-null  int64 
dtypes: int64(5), object(4)
memory usage: 6.9+ MB
None

Missing Values in Email Table:
email_id               0
email_text             0
email_version          0
hour                   0
weekday                0
user_country           0
user_past_purchases    0
opened                 0
clicked                0
dtype: int6

In [3]:
# Column totals for the two outcome columns; the sums read as counts assuming
# the columns hold 0/1 indicators (as the previewed rows suggest).
print("\nOpen and Click Counts:")
for label, column in (("Emails Opened:", 'opened'), ("Links Clicked:", 'clicked')):
    print(label, df[column].sum())


Open and Click Counts:
Emails Opened: 10345
Links Clicked: 2119


In [4]:
# Numeric summaries first, then the level counts of every categorical column.
print("\nBasic Statistics:")
print("Mean User Past Purchases:", df['user_past_purchases'].mean())
print("Hour Distribution:\n", df['hour'].describe())

print("Categorical Frequencies:")
categorical_headings = (
    ("Email Text:\n", 'email_text'),
    ("Email Version:\n", 'email_version'),
    ("Weekday:\n", 'weekday'),
    ("User Country:\n", 'user_country'),
)
for heading, column in categorical_headings:
    print(heading, df[column].value_counts())


Basic Statistics:
Mean User Past Purchases: 3.87845
Hour Distribution:
 count    100000.000000
mean          9.059300
std           4.439637
min           1.000000
25%           6.000000
50%           9.000000
75%          12.000000
max          24.000000
Name: hour, dtype: float64
Categorical Frequencies:
Email Text:
 email_text
long_email     50276
short_email    49724
Name: count, dtype: int64
Email Version:
 email_version
generic         50209
personalized    49791
Name: count, dtype: int64
Weekday:
 weekday
Saturday     14569
Sunday       14387
Monday       14363
Thursday     14277
Friday       14177
Tuesday      14143
Wednesday    14084
Name: count, dtype: int64
User Country:
 user_country
US    60099
UK    19939
FR     9995
ES     9967
Name: count, dtype: int64


In [5]:
# Overall campaign rates, expressed as a percentage of all emails in df.
total_emails = df.shape[0]

opened_count = df['opened'].sum()
clicked_count = df['clicked'].sum()

# Both rates share the same denominator: every email, not only opened ones.
open_rate = (opened_count / total_emails) * 100
ctr = (clicked_count / total_emails) * 100

print(f"Open Rate: {open_rate:.2f}%")
print(f"Click-Through Rate: {ctr:.2f}%")

Open Rate: 10.35%
Click-Through Rate: 2.12%


In [6]:
# List the columns available for the segment analysis that follows.
available_columns = df.columns
print(available_columns)


Index(['email_id', 'email_text', 'email_version', 'hour', 'weekday',
       'user_country', 'user_past_purchases', 'opened', 'clicked'],
      dtype='object')


To check whether the categorical campaign segments (email version, email text, weekday, and user country) are significantly associated with user behavior (opens and clicks), I conducted Chi-Square tests of independence.

In [7]:
from scipy.stats import chi2_contingency
import pandas as pd
def chi_square_test(df, segment, target):
    """Return the p-value of a chi-square test of independence.

    Cross-tabulates ``df[segment]`` against ``df[target]`` and passes the
    resulting contingency table to ``scipy.stats.chi2_contingency``.

    Args:
        df: DataFrame containing both columns.
        segment: Name of the grouping column (rows of the table).
        target: Name of the outcome column (columns of the table).

    Returns:
        The p-value of the test. The statistic, degrees of freedom, and
        expected frequencies are discarded.
    """
    observed = pd.crosstab(df[segment], df[target])
    # The result unpacks as (statistic, p-value, dof, expected); keep index 1.
    return chi2_contingency(observed)[1]

# For every campaign segment, test its association with opens and with clicks
# and report each p-value alongside a verdict at the 0.05 level.
for seg in ('email_version', 'email_text', 'weekday', 'user_country'):
    # Opens are tested first, then clicks.
    p_open, p_click = (
        chi_square_test(df, seg, outcome) for outcome in ('opened', 'clicked')
    )
    # One print call carrying the same five lines (leading blank line included).
    print(
        f"\nChi-Square Test for {seg}:\n"
        f"Open Rate: p-value = {p_open:.4f}\n"
        f"CTR: p-value = {p_click:.4f}\n"
        f"Significant for Open Rate: {'Yes' if p_open < 0.05 else 'No'}\n"
        f"Significant for CTR: {'Yes' if p_click < 0.05 else 'No'}"
    )


Chi-Square Test for email_version:
Open Rate: p-value = 0.0000
CTR: p-value = 0.0000
Significant for Open Rate: Yes
Significant for CTR: Yes

Chi-Square Test for email_text:
Open Rate: p-value = 0.0000
CTR: p-value = 0.0000
Significant for Open Rate: Yes
Significant for CTR: Yes

Chi-Square Test for weekday:
Open Rate: p-value = 0.0000
CTR: p-value = 0.0000
Significant for Open Rate: Yes
Significant for CTR: Yes

Chi-Square Test for user_country:
Open Rate: p-value = 0.0000
CTR: p-value = 0.0000
Significant for Open Rate: Yes
Significant for CTR: Yes


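All tested segments are statistically significant for both open rates and CTRs: every p-value prints as 0.0000, i.e. below 0.0001 and far under the 0.05 threshold. This indicates that email version, text content, weekday sent, and user location are each associated with whether recipients open emails and click links. Note that with 100,000 emails even small differences reach significance, so the p-values alone do not show how large each effect is.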