## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [108]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings('ignore')

In [121]:
df_train=pd.read_csv('data/train_F3fUq2S.csv')
df_test=pd.read_csv('data/test_Bk2wfZ3.csv')

In [26]:
df_train.head(5)

Unnamed: 0,campaign_id,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day,category,product,...,is_image,is_personalised,is_quote,is_timer,is_emoticons,is_discount,is_price,is_urgency,target_audience,click_rate
0,1,3,76,10439,39,5,1,Noon,6,26,...,0,0,0,0,0,0,0,0,14,0.103079
1,2,3,54,2570,256,5,1,Morning,2,11,...,0,0,0,0,0,0,0,0,10,0.7
2,3,3,59,12801,16,5,1,Noon,2,11,...,1,0,1,0,0,0,0,0,16,0.002769
3,4,3,74,11037,30,4,0,Evening,15,9,...,0,0,0,0,0,0,0,0,10,0.010868
4,5,3,80,10011,27,5,1,Noon,6,26,...,0,0,1,0,0,0,0,0,14,0.142826


### Data Preprocessing

From the  data analysis made we came to know that some features are dominating in train dataset.
Also if we can check statistically those features having very correlation with target features.
Those features can ve dropeed from train and test data set.

In [122]:
df_train.drop(['is_personalised','is_timer','is_emoticons','is_discount','is_price','is_urgency','sender','campaign_id'], axis =1, inplace = True)
df_test.drop(['is_personalised','is_timer','is_emoticons','is_discount','is_price','is_urgency','sender','campaign_id'], axis =1, inplace = True)

In [123]:
def handle_outliers(df,feature):

    IQR=(df[feature].quantile(0.75))-(df[feature].quantile(0.25))
    lower_limit=df[feature].quantile(0.25)-(IQR * 1.5)
    upper_limit=df[feature].quantile(0.75)+(IQR * 1.5)

    df_train.loc[df[feature]>=upper_limit,feature]=upper_limit
    df_train.loc[df[feature]<=lower_limit,feature]=lower_limit

    return df[feature]

In [124]:
outlier_features=['no_of_CTA','mean_paragraph_len','body_len','subject_len']

for i in range(len(outlier_features)):
    handle_outliers(df_train,outlier_features[i])

In [125]:
import category_encoders as ce


def target_encoders(feature):
    
    encoder=ce.TargetEncoder(cols=feature) 
    encoder.fit(df_train[feature],df_train['click_rate'])

    df_train[feature]= encoder.transform(df_train[feature])
    df_test[feature]= encoder.transform(df_test[feature])


In [126]:
target_encoding_features=['product','category','target_audience']

for i in range(len(target_encoding_features)):
    target_encoders(target_encoding_features[i])


In [128]:
def onehot_encoders(feature, train_data, test_data):
    encoder_ = ce.OneHotEncoder(cols=[feature],handle_unknown='ignore')
    train_encoded = encoder_.fit_transform(train_data)
    print(train_encoded)
    # test_encoded = encoder_.transform(test_data)

    # Update the original DataFrames with the encoded values
    train_data.drop(columns=[feature], inplace=True)
    train_data[encoder_.get_feature_names_out([feature])] = train_encoded

# Apply one-hot encoding to both training and test sets
onehot_encoding_features = ['day_of_week', 'times_of_day']

for feature in onehot_encoding_features:
    onehot_encoders(feature, df_train, df_test)

      subject_len  body_len  mean_paragraph_len  day_of_week_1  day_of_week_2  \
0            76.0     10439                39.0              1              0   
1            54.0      2570                68.5              1              0   
2            59.0     12801                16.0              1              0   
3            74.0     11037                30.0              0              1   
4            80.0     10011                27.0              1              0   
...           ...       ...                 ...            ...            ...   
1883         88.0      1451                68.5              0              0   
1884         58.0     10537                40.0              0              0   
1885         89.0     11050                26.0              0              0   
1886         58.0     10537                40.0              0              0   
1887         89.0     11050                26.0              0              0   

      day_of_week_3  day_of

In [129]:
df_train

Unnamed: 0,subject_len,body_len,mean_paragraph_len,is_weekend,category,product,no_of_CTA,mean_CTA_len,is_image,is_quote,...,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7,times_of_day_1,times_of_day_2,times_of_day_3
0,76.0,10439,39.0,1,0.067375,0.060249,3.0,29,0,0,...,1,0,0,0,0,0,0,1,0,0
1,54.0,2570,68.5,1,0.079360,0.079360,0.0,22,0,0,...,1,0,0,0,0,0,0,0,1,0
2,59.0,12801,16.0,1,0.079360,0.079360,3.0,23,1,1,...,1,0,0,0,0,0,0,1,0,0
3,74.0,11037,30.0,0,0.023565,0.018442,4.0,24,0,0,...,0,1,0,0,0,0,0,0,0,1
4,80.0,10011,27.0,1,0.067375,0.060249,3.0,31,0,1,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1883,88.0,1451,68.5,0,0.079360,0.079360,0.0,22,0,1,...,0,0,0,1,0,0,0,1,0,0
1884,58.0,10537,40.0,0,0.079360,0.079360,5.0,27,0,0,...,0,0,0,1,0,0,0,0,0,1
1885,89.0,11050,26.0,0,0.023565,0.018442,4.0,28,0,0,...,0,0,0,0,1,0,0,0,0,1
1886,58.0,10537,40.0,0,0.079360,0.079360,5.0,27,0,0,...,0,0,0,0,1,0,0,0,0,1


#### Model Training-:


In [130]:
X=df_train.drop(['click_rate'] ,axis=1)
Y=df_train['click_rate']

In [131]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid=train_test_split(X,Y,random_state=42,test_size=0.2)

In [132]:
print('Train', X_train.shape, y_train.shape)
print('Validation', X_valid.shape, y_valid.shape)

Train (1510, 21) (1510,)
Validation (378, 21) (378,)


In [133]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

X_train=scaler.fit_transform(X_train)

X_valid=scaler.transform(X_valid)

In [134]:
X_train

array([[-0.6215672 ,  1.78139195, -0.98050813, ..., -0.55848249,
        -0.25471575,  0.65258981],
       [-1.24797418,  0.29026212, -0.98050813, ..., -0.55848249,
        -0.25471575,  0.65258981],
       [ 0.33892351, -1.07534759,  1.0651123 , ..., -0.55848249,
        -0.25471575,  0.65258981],
       ...,
       [ 1.13237236, -0.07365375, -0.45260609, ..., -0.55848249,
        -0.25471575,  0.65258981],
       [-1.24797418, -0.95784594,  1.52702659, ..., -0.55848249,
        -0.25471575,  0.65258981],
       [-0.16220207, -0.04620586, -0.25464282, ..., -0.55848249,
        -0.25471575,  0.65258981]])

In [135]:
#### base model

mean_pred=np.repeat(y_train.mean(),len(y_valid))
r2_score(y_valid,mean_pred)

-0.012711622261606603

In [136]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    r2score=r2_score(true,predicted)
    return mae,mse,r2score

In [137]:
model={
    "Linear Regresion":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "DecisionTree":DecisionTreeRegressor(),
    "Random Forest":RandomForestRegressor(),
    "XGB Regressor":XGBRegressor(),
    "Catboot Rgressor":CatBoostRegressor(verbose=False),
    "AdaBoost Regressor":AdaBoostRegressor()
}

model_list=[]
r2_list=[]
for model_name,model in model.items():
    model.fit(X_train,y_train)

    y_train_pred=model.predict(X_train)
    y_valid_pred=model.predict(X_valid)

     # Evaluate Train and Test dataset
    model_train_mae, model_train_mse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae, model_test_mse, model_test_r2 = evaluate_model(y_valid, y_valid_pred)

    print(model_name)
    model_list.append(model_name)
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_mse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Validation set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_mse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('=' * 35)
    print('\n')

Linear Regresion
Model performance for Training set
- Root Mean Squared Error: 0.0061
- Mean Absolute Error: 0.0446
- R2 Score: 0.2142
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0036
- Mean Absolute Error: 0.0411
- R2 Score: 0.1833


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.0077
- Mean Absolute Error: 0.0496
- R2 Score: 0.0000
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0045
- Mean Absolute Error: 0.0439
- R2 Score: -0.0127


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.0061
- Mean Absolute Error: 0.0445
- R2 Score: 0.2143
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0036
- Mean Absolute Error: 0.0409
- R2 Score: 0.1854


KNeighborsRegressor
Model performance for Training set
- Root Mean Squared Error: 0.0042
- Mean Absolute Error: 0.0306
- R2 Score: 0.4567
-------

Random Forest
Model performance for Training set
- Root Mean Squared Error: 0.0005
- Mean Absolute Error: 0.0113
- R2 Score: 0.9298
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0025
- Mean Absolute Error: 0.0286
- R2 Score: 0.4288


XGB Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0033
- R2 Score: 0.9967
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0045
- Mean Absolute Error: 0.0324
- R2 Score: -0.0146


Catboot Rgressor
Model performance for Training set
- Root Mean Squared Error: 0.0003
- Mean Absolute Error: 0.0119
- R2 Score: 0.9557
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0024
- Mean Absolute Error: 0.0279
- R2 Score: 0.4636


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0077
- Mean Absolute Error: 0.0783
- R2 Score:

Linear Regresion
Model performance for Training set
- Root Mean Squared Error: 0.0061
- Mean Absolute Error: 0.0446
- R2 Score: 0.2132
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0036
- Mean Absolute Error: 0.0407
- R2 Score: 0.1896
===================================


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.0077
- Mean Absolute Error: 0.0496
- R2 Score: 0.0000
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0045
- Mean Absolute Error: 0.0439
- R2 Score: -0.0127
===================================


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.0061
- Mean Absolute Error: 0.0446
- R2 Score: 0.2132
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0036
- Mean Absolute Error: 0.0407
- R2 Score: 0.1898
===================================


KNeighborsRegressor
Model performance for Training set
- Root Mean Squared Error: 0.0039
- Mean Absolute Error: 0.0287
- R2 Score: 0.4974
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0039
- Mean Absolute Error: 0.0337
- R2 Score: 0.1292
===================================


DecisionTree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0085
- Mean Absolute Error: 0.0396
- R2 Score: -0.9220
===================================


Random Forest
Model performance for Training set
- Root Mean Squared Error: 0.0005
- Mean Absolute Error: 0.0112
- R2 Score: 0.9356
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0025
- Mean Absolute Error: 0.0284
- R2 Score: 0.4293
===================================


XGB Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0037
- R2 Score: 0.9955
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0038
- Mean Absolute Error: 0.0312
- R2 Score: 0.1486
===================================


Catboot Rgressor
Model performance for Training set
- Root Mean Squared Error: 0.0004
- Mean Absolute Error: 0.0122
- R2 Score: 0.9537
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0024
- Mean Absolute Error: 0.0276
- R2 Score: 0.4674
===================================


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0070
- Mean Absolute Error: 0.0741
- R2 Score: 0.0906
----------------------------------
Model performance for Validation set
- Root Mean Squared Error: 0.0080
- Mean Absolute Error: 0.0784
- R2 Score: -0.8029
===================================

In [139]:
df_train

Unnamed: 0,subject_len,body_len,mean_paragraph_len,is_weekend,category,product,no_of_CTA,mean_CTA_len,is_image,is_quote,...,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7,times_of_day_1,times_of_day_2,times_of_day_3
0,76.0,10439,39.0,1,0.067375,0.060249,3.0,29,0,0,...,1,0,0,0,0,0,0,1,0,0
1,54.0,2570,68.5,1,0.079360,0.079360,0.0,22,0,0,...,1,0,0,0,0,0,0,0,1,0
2,59.0,12801,16.0,1,0.079360,0.079360,3.0,23,1,1,...,1,0,0,0,0,0,0,1,0,0
3,74.0,11037,30.0,0,0.023565,0.018442,4.0,24,0,0,...,0,1,0,0,0,0,0,0,0,1
4,80.0,10011,27.0,1,0.067375,0.060249,3.0,31,0,1,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1883,88.0,1451,68.5,0,0.079360,0.079360,0.0,22,0,1,...,0,0,0,1,0,0,0,1,0,0
1884,58.0,10537,40.0,0,0.079360,0.079360,5.0,27,0,0,...,0,0,0,1,0,0,0,0,0,1
1885,89.0,11050,26.0,0,0.023565,0.018442,4.0,28,0,0,...,0,0,0,0,1,0,0,0,0,1
1886,58.0,10537,40.0,0,0.079360,0.079360,5.0,27,0,0,...,0,0,0,0,1,0,0,0,0,1
