In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import StackingClassifier
import pickle
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Register the Jedha colour palette as a plotly template and make it the default one
_jedha_colorway = [
    "#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6",
    "#2A7FAF", "#23B1AB", "#0E3449", "#015955",
]
pio.templates["jedha"] = go.layout.Template(layout_colorway=_jedha_colorway)
pio.templates.default = "jedha"
# Renderer used to display figures: to be replaced by "iframe" if working on JULIE
pio.renderers.default = "vscode"
from IPython.display import display

In [2]:
# Load the labelled dataset (used for both the train and the test split)
_labelled_csv_path = 'data/conversion_data_train.csv'
data = pd.read_csv(_labelled_csv_path)
print('Set with labels (our train+test) :', data.shape)

Set with labels (our train+test) : (284580, 6)


In [3]:
# Separate the column to predict from the explanatory columns
target_variable = 'converted'

Y = data[target_variable]
X = data.drop(columns=target_variable)

print('Explanatory variables : ', X.columns)
print('Target variable :', target_variable)
print()

Explanatory variables :  Index(['country', 'age', 'new_user', 'source', 'total_pages_visited'], dtype='object')
Target variable : converted



In [4]:
# Train set & Test set
# 80/20 split, stratified on the target, with a fixed seed so the split is repeatable
print("Dividing into train and test sets...")
_split_options = dict(test_size=0.2, random_state=0, stratify=Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **_split_options)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [5]:
# Column names routed to the numeric and the categorical preprocessing pipelines
numeric_features = ['age', 'total_pages_visited']
categorical_features = ['new_user', 'country', 'source']

for _label, _columns in (
    ('Found numeric features ', numeric_features),
    ('Found categorical features ', categorical_features),
):
    print(_label, _columns)

Found numeric features  ['age', 'total_pages_visited']
Found categorical features  ['new_user', 'country', 'source']


In [6]:
# Numeric features: impute missing values with KNNImputer, then standardise
_numeric_steps = [
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=_numeric_steps)

# Categorical features: replace missing values by the most frequent value, then
# one-hot encode, dropping the first column to avoid creating correlations between features
_categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
]
categorical_transformer = Pipeline(steps=_categorical_steps)

# The ColumnTransformer describes all the treatments to be done, per group of columns
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [7]:
# Preprocessings on train set
# NOTE: X_train and X_test are rebound to the transformed output below, so this cell
# cannot be re-run as-is (the .head() calls expect the original DataFrames);
# re-run the train/test split cell first.
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)  # the preprocessor is fitted on the train set only
print('...Done.')
print(X_train[:5])
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)  # Don't fit again !!
print('...Done.')
print(X_test[:5, :])

Performing preprocessings on train set...
       country  age  new_user  source  total_pages_visited
137434   China   19         1     Seo                    1
112323      US   33         1  Direct                    5
143261      US   51         1     Ads                    2
162328   China   17         0     Seo                    1
158039   China   28         1     Seo                    5
...Done.
[[-1.3990984  -1.15935344  1.          0.          0.          0.
   0.          1.        ]
 [ 0.29299544  0.03743241  1.          0.          0.          1.
   1.          0.        ]
 [ 2.46854467 -0.86015697  1.          0.          0.          1.
   0.          0.        ]
 [-1.64082609 -1.15935344  0.          0.          0.          0.
   0.          1.        ]
 [-0.31132378  0.03743241  1.          0.          0.          0.
   0.          1.        ]]

Performing preprocessings on test set...
       country  age  new_user  source  total_pages_visited
138303      UK   34         

In [8]:
def get_f1_score(model):
    """Print the class name of ``model`` and its f1-score on the train and test sets.

    The notebook-level ``X_train``, ``X_test``, ``Y_train`` and ``Y_test``
    variables are used as data; ``model`` only needs a ``predict`` method.
    Nothing is returned.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    # Here, the f1-score will be used to assess the performances on the leaderboard
    train_score = f1_score(Y_train, train_predictions)
    test_score = f1_score(Y_test, test_predictions)

    print(model.__class__.__name__)
    print("f1-score on train set : ", train_score)
    print("f1-score on test set : ", test_score)

#### Import saved models

In [9]:
def _load_pickled_model(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened through a context manager so its handle is closed as
    soon as loading finishes (or fails).
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load model files you trust.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Previously trained models, reused below as base estimators
grid_logreg_v_22 = _load_pickled_model('models/grid_logreg_v_22')
random_forest_opt_v_20 = _load_pickled_model('models/random_forest_opt_v_20')
xgboost_opt_v_18 = _load_pickled_model('models/xgboost_opt_v_18')
catboost_v_21 = _load_pickled_model('models/catboost_v_21')
lgbm_v_23 = _load_pickled_model('models/lgbm_v_23')

#### Stacking

In [43]:
# Stacking variant: all five saved models, with a LogisticRegression as final estimator
stacking_classifier = StackingClassifier(
    estimators=[
        ('grid_logreg_v_22', grid_logreg_v_22),
        ('random_forest_opt_v_20', random_forest_opt_v_20),
        ('xgboost_opt_v_18', xgboost_opt_v_18),
        ('catboost_v_21', catboost_v_21),
        ('lgbm_v_23', lgbm_v_23),
    ],
    final_estimator=LogisticRegression(),
)

In [36]:
# Stacking variant: random forest, xgboost, catboost and lgbm, with a LogisticRegression as final estimator
stacking_classifier = StackingClassifier(
    estimators=[
        ('random_forest_opt_v_20', random_forest_opt_v_20),
        ('xgboost_opt_v_18', xgboost_opt_v_18),
        ('catboost_v_21', catboost_v_21),
        ('lgbm_v_23', lgbm_v_23),
    ],
    final_estimator=LogisticRegression(),
)

In [44]:
# Stacking variant: xgboost, catboost and lgbm, with a LogisticRegression as final estimator
stacking_classifier = StackingClassifier(
    estimators=[
        ('xgboost_opt_v_18', xgboost_opt_v_18),
        ('catboost_v_21', catboost_v_21),
        ('lgbm_v_23', lgbm_v_23),
    ],
    final_estimator=LogisticRegression(),
)

In [49]:
# Stacking variant: xgboost and catboost, with a LogisticRegression as final estimator
stacking_classifier = StackingClassifier(
    estimators=[
        ('xgboost_opt_v_18', xgboost_opt_v_18),
        ('catboost_v_21', catboost_v_21),
    ],
    final_estimator=LogisticRegression(),
)

In [54]:
# Stacking variant: xgboost and lgbm, with a LogisticRegression as final estimator
stacking_classifier = StackingClassifier(
    estimators=[
        ('xgboost_opt_v_18', xgboost_opt_v_18),
        ('lgbm_v_23', lgbm_v_23),
    ],
    final_estimator=LogisticRegression(),
)

In [10]:
# Stacking variant: xgboost and random forest, with a LogisticRegression as final estimator
stacking_classifier = StackingClassifier(
    estimators=[
        ('xgboost_opt_v_18', xgboost_opt_v_18),
        ('random_forest_opt_v_20', random_forest_opt_v_20),
    ],
    final_estimator=LogisticRegression(),
)

In [11]:
# Fit the stacked ensemble on the preprocessed train set
stacking_classifier.fit(X=X_train, y=Y_train)

StackingClassifier(estimators=[('xgboost_opt_v_18',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None, gamma=0,
                                              gpu_id=-1,
                                              grow_policy='depthwise',
                                              importance_type=None,
                                              interaction_constraints='',
                                              learning_rate=0.30000...
                                              m

In [12]:
# Report train / test f1-scores of the fitted stacking classifier
get_f1_score(model=stacking_classifier)

StackingClassifier
f1-score on train set :  0.7694404482471415
f1-score on test set :  0.744228432563791


### Submission

In [14]:
def retrain_and_submit(model, version=1):
    """Refit ``model`` on all labelled data and write predictions for the unlabelled set.

    The notebook-level ``X_train``, ``X_test``, ``Y_train``, ``Y_test`` and the
    already-fitted ``preprocessor`` are read as inputs. ``model`` is refitted in
    place, so the object passed in is modified by this call.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    version : value inserted in the name of the submission file (default 1).

    Returns
    -------
    None. The predictions are written to
    ``submissions/conversion_data_test_predictions_Alexon_V{version}.csv``.
    """
    import os  # local import: keeps this cell independent from the import cell

    print(model)

    # Create the output folder up front, so a missing directory cannot make the
    # final to_csv fail after the (potentially long) refit has already run.
    output_path = f'submissions/conversion_data_test_predictions_Alexon_V{version}.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Concatenate our train and test set to train the classifier on all data with labels
    X_all = np.append(X_train, X_test, axis=0)
    Y_all = np.append(Y_train, Y_test)

    model.fit(X_all, Y_all)

    # Read data without labels
    data_without_labels = pd.read_csv('data/conversion_data_test.csv')
    print('Prediction set (without labels) :', data_without_labels.shape)

    # Apply the preprocessing (transform only: the preprocessor is not fitted again)
    X_without_labels = preprocessor.transform(data_without_labels)

    Y_predictions = pd.DataFrame({'converted': model.predict(X_without_labels)})
    Y_predictions.to_csv(output_path, index=False)
    print('Done ....')

In [30]:
retrain_and_submit(stacking_classifier, version=24)
# Save the refitted model; the context manager flushes and closes the file once the dump is done
with open('models/stacking_classifier_v_24', 'wb') as model_file:
    pickle.dump(stacking_classifier, model_file)

StackingClassifier(estimators=[('grid_logreg_v_22', LogisticRegression(C=3)),
                               ('random_forest_opt_v_20',
                                RandomForestClassifier(max_depth=1200,
                                                       min_samples_leaf=14,
                                                       min_samples_split=8,
                                                       n_estimators=125)),
                               ('xgboost_opt_v_18',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              early_stoppi...
                                              max_delta_step=0, max_depth=14,
                                 

In [35]:
retrain_and_submit(stacking_classifier, version=25)
# Save the refitted model; the context manager flushes and closes the file once the dump is done
with open('models/stacking_classifier_v_25', 'wb') as model_file:
    pickle.dump(stacking_classifier, model_file)

StackingClassifier(estimators=[('grid_logreg_v_22', LogisticRegression(C=3)),
                               ('random_forest_opt_v_20',
                                RandomForestClassifier(max_depth=1200,
                                                       min_samples_leaf=14,
                                                       min_samples_split=8,
                                                       n_estimators=125)),
                               ('xgboost_opt_v_18',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              early_stoppi...
                                              max_delta_step=0, max_depth=14,
                                 

In [41]:
retrain_and_submit(stacking_classifier, version=26)
# Save the refitted model; the context manager flushes and closes the file once the dump is done
with open('models/stacking_classifier_v_26', 'wb') as model_file:
    pickle.dump(stacking_classifier, model_file)

StackingClassifier(estimators=[('random_forest_opt_v_20',
                                RandomForestClassifier(max_depth=1200,
                                                       min_samples_leaf=14,
                                                       min_samples_split=8,
                                                       n_estimators=125)),
                               ('xgboost_opt_v_18',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_...
                                              max_

In [47]:
retrain_and_submit(stacking_classifier, version=27)
# Save the refitted model; the context manager flushes and closes the file once the dump is done
with open('models/stacking_classifier_v_27', 'wb') as model_file:
    pickle.dump(stacking_classifier, model_file)

StackingClassifier(estimators=[('xgboost_opt_v_18',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None, gamma=0,
                                              gpu_id=-1,
                                              grow_policy='depthwise',
                                              importance_type=None,
                                              interaction_constraints='',
                                              learning_rate=0.30000...
                                              m

In [52]:
retrain_and_submit(stacking_classifier, version=28)
# Save the refitted model; the context manager flushes and closes the file once the dump is done
with open('models/stacking_classifier_v_28', 'wb') as model_file:
    pickle.dump(stacking_classifier, model_file)

StackingClassifier(estimators=[('xgboost_opt_v_18',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None, gamma=0,
                                              gpu_id=-1,
                                              grow_policy='depthwise',
                                              importance_type=None,
                                              interaction_constraints='',
                                              learning_rate=0.30000...
                                              m

In [57]:
retrain_and_submit(stacking_classifier, version=29)
# Save the refitted model; the context manager flushes and closes the file once the dump is done
with open('models/stacking_classifier_v_29', 'wb') as model_file:
    pickle.dump(stacking_classifier, model_file)

StackingClassifier(estimators=[('xgboost_opt_v_18',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None, gamma=0,
                                              gpu_id=-1,
                                              grow_policy='depthwise',
                                              importance_type=None,
                                              interaction_constraints='',
                                              learning_rate=0.300000012,
                                             

In [15]:
retrain_and_submit(stacking_classifier, version=31)
# Save the refitted model; the context manager flushes and closes the file once the dump is done
with open('models/stacking_classifier_v_31', 'wb') as model_file:
    pickle.dump(stacking_classifier, model_file)

StackingClassifier(estimators=[('xgboost_opt_v_18',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None, gamma=0,
                                              gpu_id=-1,
                                              grow_policy='depthwise',
                                              importance_type=None,
                                              interaction_constraints='',
                                              learning_rate=0.30000...
                                              m