In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings 
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('email_table.csv')
df.head(5)

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases
0,85120,short_email,personalized,2,Sunday,US,5
1,966622,long_email,personalized,12,Sunday,UK,2
2,777221,long_email,personalized,11,Wednesday,US,2
3,493711,short_email,generic,6,Monday,UK,1
4,106887,long_email,generic,14,Monday,US,6


In [3]:
email_opened = pd.read_csv('email_opened_table.csv')
email_opened.head()

Unnamed: 0,email_id
0,284534
1,609056
2,220820
3,905936
4,164034


In [4]:
email_opened.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10345 entries, 0 to 10344
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   email_id  10345 non-null  int64
dtypes: int64(1)
memory usage: 80.9 KB


In [5]:
link = pd.read_csv('link_clicked_table.csv')
link.head()

Unnamed: 0,email_id
0,609056
1,870980
2,935124
3,158501
4,177561


In [6]:
link.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2119 entries, 0 to 2118
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   email_id  2119 non-null   int64
dtypes: int64(1)
memory usage: 16.7 KB


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   email_id             100000 non-null  int64 
 1   email_text           100000 non-null  object
 2   email_version        100000 non-null  object
 3   hour                 100000 non-null  int64 
 4   weekday              100000 non-null  object
 5   user_country         100000 non-null  object
 6   user_past_purchases  100000 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 5.3+ MB


In [8]:
df['email_text'].value_counts()

email_text
long_email     50276
short_email    49724
Name: count, dtype: int64

In [9]:
df['email_version'].value_counts()

email_version
generic         50209
personalized    49791
Name: count, dtype: int64

In [10]:
df['hour'].value_counts()

hour
9     8529
8     8398
7     8204
10    8180
11    7483
6     7465
5     6551
12    6508
4     5622
13    5581
3     4610
14    4580
2     3676
15    3493
16    2759
1     2703
17    1893
18    1421
19     905
20     656
21     365
22     204
23     145
24      69
Name: count, dtype: int64

In [11]:
df['weekday'].value_counts()

weekday
Saturday     14569
Sunday       14387
Monday       14363
Thursday     14277
Friday       14177
Tuesday      14143
Wednesday    14084
Name: count, dtype: int64

In [12]:
df['user_country'].value_counts()

user_country
US    60099
UK    19939
FR     9995
ES     9967
Name: count, dtype: int64

In [13]:
df['user_past_purchases'].value_counts()

user_past_purchases
0     13877
1     13751
2     13036
3     12077
4     10743
5      9042
6      7518
7      6051
8      4393
9      3296
10     2363
11     1553
12      944
13      578
14      362
15      188
16      102
17       60
18       35
19       15
20       11
21        4
22        1
Name: count, dtype: int64

## setting the target variable

In [14]:
df['opened'] = df['email_id'].isin(email_opened['email_id']).astype(int)

In [15]:
df['clicked'] = df['email_id'].isin(link['email_id']).astype(int)

In [16]:
df.head()

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,opened,clicked
0,85120,short_email,personalized,2,Sunday,US,5,0,0
1,966622,long_email,personalized,12,Sunday,UK,2,1,1
2,777221,long_email,personalized,11,Wednesday,US,2,0,0
3,493711,short_email,generic,6,Monday,UK,1,0,0
4,106887,long_email,generic,14,Monday,US,6,0,0


## converting categorical to numerical

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()

# Label Encoding for binary categories
le_text = LabelEncoder()
le_version = LabelEncoder()

df_encoded['email_text'] = le_text.fit_transform(df_encoded['email_text'])         # short_email = 0, long_email = 1
df_encoded['email_version'] = le_version.fit_transform(df_encoded['email_version']) # generic = 0, personalized = 1

# One-Hot Encoding for non-binary categories
df_encoded = pd.get_dummies(df_encoded, columns=['weekday', 'user_country'], drop_first=True)

# Feature columns (all except email_id, opened, clicked)
#feature_cols = [col for col in df_encoded.columns if col not in ['email_id', 'opened', 'clicked']]


In [18]:
df_encoded.head()

Unnamed: 0,email_id,email_text,email_version,hour,user_past_purchases,opened,clicked,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,user_country_FR,user_country_UK,user_country_US
0,85120,1,1,2,5,0,0,False,False,True,False,False,False,False,False,True
1,966622,0,1,12,2,1,1,False,False,True,False,False,False,False,True,False
2,777221,0,1,11,2,0,0,False,False,False,False,False,True,False,False,True
3,493711,1,0,6,1,0,0,True,False,False,False,False,False,False,True,False
4,106887,0,0,14,6,0,0,True,False,False,False,False,False,False,False,True


In [19]:
df = df_encoded
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   email_id             100000 non-null  int64
 1   email_text           100000 non-null  int64
 2   email_version        100000 non-null  int64
 3   hour                 100000 non-null  int64
 4   user_past_purchases  100000 non-null  int64
 5   opened               100000 non-null  int64
 6   clicked              100000 non-null  int64
 7   weekday_Monday       100000 non-null  bool 
 8   weekday_Saturday     100000 non-null  bool 
 9   weekday_Sunday       100000 non-null  bool 
 10  weekday_Thursday     100000 non-null  bool 
 11  weekday_Tuesday      100000 non-null  bool 
 12  weekday_Wednesday    100000 non-null  bool 
 13  user_country_FR      100000 non-null  bool 
 14  user_country_UK      100000 non-null  bool 
 15  user_country_US      100000 non-null  bool 
dtypes: 

In [20]:
## upsampling
df_minority=df[df['clicked']==1]
df_majority=df[df['clicked']==0]

from sklearn.utils import resample
df_minority_upsampled=resample(df_minority,replace=True, #Sample With replacement
         n_samples=len(df_majority),
         random_state=42
        )

df_upsampled = pd.concat([df_majority,df_minority_upsampled])


In [21]:
df = df_upsampled

In [33]:
df.shape

(195762, 16)

In [23]:
feature_cols = [col for col in df_encoded.columns if col not in ['email_id', 'opened', 'clicked']]


In [24]:
feature_cols

['email_text',
 'email_version',
 'hour',
 'user_past_purchases',
 'weekday_Monday',
 'weekday_Saturday',
 'weekday_Sunday',
 'weekday_Thursday',
 'weekday_Tuesday',
 'weekday_Wednesday',
 'user_country_FR',
 'user_country_UK',
 'user_country_US']

In [25]:

# X and y
X = df[feature_cols]
y = df['clicked']

# Split the data (stratify to keep 1/0 click ratio balanced)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [26]:
X.shape, y.shape

((195762, 13), (195762,))

# logistic

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)
y_prob = log_reg.predict_proba(X_test)[:, 1]  # Probabilities for the ROC AUC score

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


Accuracy: 0.6787219370163206
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.69      0.68     19558
           1       0.68      0.67      0.68     19595

    accuracy                           0.68     39153
   macro avg       0.68      0.68      0.68     39153
weighted avg       0.68      0.68      0.68     39153

ROC AUC Score: 0.7438828408934676


In [31]:
#Predict train data
from sklearn.model_selection import cross_val_score

ypred_train = log_reg.predict(X_train)
print('Train Accuracy :', accuracy_score(y_train,ypred_train))

print('CV Score : ', cross_val_score(log_reg, X_train,y_train, cv=5, scoring='accuracy').mean())

#Predict & Evaluation on test data
ypred_test = log_reg.predict(X_test)
print('Test Accuracy : ', accuracy_score(y_test, ypred_test))

Train Accuracy : 0.6728668211916301
CV Score :  0.6726050268460895
Test Accuracy :  0.6787219370163206


In [32]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,ypred_test))

[[13461  6097]
 [ 6482 13113]]


In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Define the parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Type of regularization
    'solver': ['liblinear', 'lbfgs', 'newton-cg']  # Optimization algorithms
}

# Create the Logistic Regression model
logreg = LogisticRegression(class_weight='balanced', max_iter=1000)

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5, scoring='roc_auc')

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model to make predictions on the test set
best_logreg = grid_search.best_estimator_
y_pred = best_logreg.predict(X_test)
y_prob = best_logreg.predict_proba(X_test)[:, 1]  # For ROC AUC

# Evaluate the model

print("Accuracy:", accuracy_score(y_test, y_pred))

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


Best Hyperparameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 0.6799223558858836
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.69      0.68     19558
           1       0.68      0.67      0.68     19595

    accuracy                           0.68     39153
   macro avg       0.68      0.68      0.68     39153
weighted avg       0.68      0.68      0.68     39153

ROC AUC Score: 0.7441583256360046
