## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warnings Library.

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings('ignore')

In [59]:
# Load the training and test CSV files into DataFrames.
train_csv, test_csv = 'data/train_F3fUq2S.csv', 'data/test_Bk2wfZ3.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [26]:
# Preview the first five training rows (five is the default for head()).
df_train.head()

Unnamed: 0,campaign_id,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day,category,product,...,is_image,is_personalised,is_quote,is_timer,is_emoticons,is_discount,is_price,is_urgency,target_audience,click_rate
0,1,3,76,10439,39,5,1,Noon,6,26,...,0,0,0,0,0,0,0,0,14,0.103079
1,2,3,54,2570,256,5,1,Morning,2,11,...,0,0,0,0,0,0,0,0,10,0.7
2,3,3,59,12801,16,5,1,Noon,2,11,...,1,0,1,0,0,0,0,0,16,0.002769
3,4,3,74,11037,30,4,0,Evening,15,9,...,0,0,0,0,0,0,0,0,10,0.010868
4,5,3,80,10011,27,5,1,Noon,6,26,...,0,0,1,0,0,0,0,0,14,0.142826


### Data Preprocessing

From the data analysis, we found that some features are dominated by a single value in the train dataset.
Statistical checks also show that those features have very low correlation with the target feature.
Those features can be dropped from the train and test datasets.

In [60]:
# Columns removed from both frames; the same list is applied to train and test
# so the two stay aligned.
columns_to_drop = [
    'is_personalised',
    'is_timer',
    'is_emoticons',
    'is_discount',
    'is_price',
    'is_urgency',
    'sender',
    'campaign_id',
]
df_train.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

In [61]:
def handle_outliers(df,feature):
    """Cap the outliers of one column in place using the 1.5 * IQR rule.

    Values at or above ``Q3 + 1.5 * IQR`` are set to that upper limit and
    values at or below ``Q1 - 1.5 * IQR`` are set to that lower limit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is capped. The frame passed in is the one that is
        modified (not a module-level frame).
    feature : str
        Name of the numeric column to cap.

    Returns
    -------
    pandas.Series
        The capped column, ``df[feature]``.
    """
    q1 = df[feature].quantile(0.25)
    q3 = df[feature].quantile(0.75)
    IQR = q3 - q1
    lower_limit = q1 - (IQR * 1.5)
    upper_limit = q3 + (IQR * 1.5)

    # An empty or all-NaN column yields NaN limits; there is nothing to cap,
    # and clip() rejects NaN thresholds, so leave the column untouched.
    if pd.isna(lower_limit) or pd.isna(upper_limit):
        return df[feature]

    # Write back to the frame that was passed in. clip() returns a new Series
    # (upcasting an integer column to float when a fractional limit is used),
    # so no float is forced into an integer column through .loc.
    df[feature] = df[feature].clip(lower=lower_limit, upper=upper_limit)

    return df[feature]

In [62]:
# Numeric columns whose extreme values are capped with the IQR rule.
outlier_features = ['no_of_CTA', 'mean_paragraph_len', 'body_len', 'subject_len']

for column_name in outlier_features:
    handle_outliers(df_train, column_name)

In [63]:
import category_encoders as ce


def encoders(feature):
    """Target-encode one column of the module-level ``df_train`` and ``df_test``.

    A ``ce.TargetEncoder`` is fitted on ``df_train[feature]`` against
    ``df_train['click_rate']``; the fitted encoder then overwrites the column
    in both ``df_train`` and ``df_test``.

    Parameters
    ----------
    feature : str
        Name of the column to encode.
    """
    target_encoder = ce.TargetEncoder(cols=feature)
    target_encoder.fit(df_train[feature], df_train['click_rate'])

    # Train is transformed first, then test, both with the same fitted encoder.
    for frame in (df_train, df_test):
        frame[feature] = target_encoder.transform(frame[feature])


In [64]:
# Columns replaced by their target encoding.
target_encoding_features = ['product', 'category', 'target_audience']

for column_name in target_encoding_features:
    encoders(column_name)


In [65]:
# encoding times_of_day feature using one hot encoding

# Fit the encoder once, on the training features, and reuse that fitted
# mapping for the test set. Calling fit_transform on df_test would learn a
# separate category -> column mapping, so times_of_day_N could mean a
# different time of day in test than in train.
encoder_=ce.OneHotEncoder(cols=['times_of_day'])

# The target is held out of the fit so the encoder's expected input columns
# are the feature columns only.
# NOTE(review): assumes df_test has the same columns as df_train minus
# 'click_rate' -- confirm against the test CSV.
train_target=df_train['click_rate']
df_train=encoder_.fit_transform(df_train.drop(columns=['click_rate']))
df_train['click_rate']=train_target  # re-attach the target as the last column

df_test=encoder_.transform(df_test)


In [66]:
# Display the training frame after the encoding steps.
df_train

Unnamed: 0,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day_1,times_of_day_2,times_of_day_3,category,product,no_of_CTA,mean_CTA_len,is_image,is_quote,target_audience,click_rate
0,76.0,10439,39.0,5,1,1,0,0,0.067375,0.060249,3.0,29,0,0,0.091743,0.103079
1,54.0,2570,68.5,5,1,0,1,0,0.079360,0.079360,0.0,22,0,0,0.040149,0.700000
2,59.0,12801,16.0,5,1,1,0,0,0.079360,0.079360,3.0,23,1,1,0.006948,0.002769
3,74.0,11037,30.0,4,0,0,0,1,0.023565,0.018442,4.0,24,0,0,0.040149,0.010868
4,80.0,10011,27.0,5,1,1,0,0,0.067375,0.060249,3.0,31,0,1,0.091743,0.142826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1883,88.0,1451,68.5,2,0,1,0,0,0.079360,0.079360,0.0,22,0,1,0.040149,0.350746
1884,58.0,10537,40.0,2,0,0,0,1,0.079360,0.079360,5.0,27,0,0,0.030297,0.004728
1885,89.0,11050,26.0,1,0,0,0,1,0.023565,0.018442,4.0,28,0,0,0.045027,0.008289
1886,58.0,10537,40.0,1,0,0,0,1,0.079360,0.079360,5.0,27,0,0,0.006948,0.012014


#### Model Training


In [71]:
# Separate the target column from the feature matrix.
target_column = 'click_rate'
Y = df_train[target_column]
X = df_train.drop(columns=[target_column])

In [72]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [74]:
# Report the feature and target shapes of each split.
for split_name, split_X, split_y in (
    ('Train', X_train, y_train),
    ('Validation', X_valid, y_valid),
):
    print(split_name, split_X.shape, split_y.shape)

Train (1510, 15) (1510,)
Validation (378, 15) (378,)


In [75]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

In [79]:
# Inspect the scaled training features (the scaler returns a NumPy array).
X_train

array([[-0.6215672 ,  1.78139195, -0.98050813, ...,  0.08575301,
         1.10057071,  0.06745325],
       [-1.24797418,  0.29026212, -0.98050813, ...,  0.08575301,
        -0.80632748,  0.06745325],
       [ 0.33892351, -1.07534759,  1.0651123 , ..., -1.07038135,
         0.14712162,  0.06745325],
       ...,
       [ 1.13237236, -0.07365375, -0.45260609, ..., -1.07038135,
         2.0540198 ,  0.06745325],
       [-1.24797418, -0.95784594,  1.52702659, ..., -1.07038135,
        -0.80632748,  0.06745325],
       [-0.16220207, -0.04620586, -0.25464282, ...,  0.08575301,
        -0.80632748,  0.06745325]])

In [81]:
#### base model

# Baseline: predict the training-set mean of the target for every validation row.
baseline_value = y_train.mean()
mean_pred = np.full(len(y_valid), baseline_value)
r2_score(y_valid, mean_pred)

-0.012711622261606603

Linear regression

In [82]:
# Fit a linear regression on the training split (fit() returns the estimator).
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_valid)

# R^2 on the validation split
r2_score(y_true=y_valid, y_pred=lr_pred)

0.1895841352556139

RandomForest

In [83]:
# Random forest with 400 trees, using all cores; the seed fixes the forest.
rf = RandomForestRegressor(
    n_estimators=400,
    n_jobs=-1,
    random_state=10,
)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_valid)

# R^2 on the validation split
r2_score(y_true=y_valid, y_pred=rf_pred)

0.43270144686578804