In [1]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import StackingClassifier
import pickle
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
pio.renderers.default = "vscode" # to be replaced by "iframe" if working on JULIE
from IPython.display import display

In [2]:
data = pd.read_csv('data/conversion_data_train.csv')
print('Set with labels (our train+test) :', data.shape)

Set with labels (our train+test) : (284580, 6)


In [3]:
target_variable = 'converted'

X = data.drop(target_variable, axis=1)
Y = data.loc[:, target_variable]

print('Explanatory variables : ', X.columns)
print('Target variable :', target_variable)
print()

Explanatory variables :  Index(['country', 'age', 'new_user', 'source', 'total_pages_visited'], dtype='object')
Target variable : converted



In [4]:
# Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [5]:
numeric_features = ['age', 'total_pages_visited']
categorical_features = ['new_user', 'country', 'source']

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['age', 'total_pages_visited']
Found categorical features  ['new_user', 'country', 'source']


In [6]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()), 
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [7]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) 
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! 
print('...Done.')
print(X_test[0:5,:])

Performing preprocessings on train set...
       country  age  new_user  source  total_pages_visited
137434   China   19         1     Seo                    1
112323      US   33         1  Direct                    5
143261      US   51         1     Ads                    2
162328   China   17         0     Seo                    1
158039   China   28         1     Seo                    5
...Done.
[[-1.3990984  -1.15935344  1.          0.          0.          0.
   0.          1.        ]
 [ 0.29299544  0.03743241  1.          0.          0.          1.
   1.          0.        ]
 [ 2.46854467 -0.86015697  1.          0.          0.          1.
   0.          0.        ]
 [-1.64082609 -1.15935344  0.          0.          0.          0.
   0.          1.        ]
 [-0.31132378  0.03743241  1.          0.          0.          0.
   0.          1.        ]]

Performing preprocessings on test set...
       country  age  new_user  source  total_pages_visited
138303      UK   34         

In [56]:
clf = CatBoostClassifier(
    iterations=50,
    random_seed=42,
    learning_rate=0.5,
    loss_function='Logloss',
    verbose=True,
)

0:	learn: 0.3328925	total: 13.9ms	remaining: 679ms
1:	learn: 0.1977779	total: 24.8ms	remaining: 596ms
2:	learn: 0.1299968	total: 34.7ms	remaining: 543ms
3:	learn: 0.0929800	total: 43.6ms	remaining: 501ms
4:	learn: 0.0720916	total: 49.9ms	remaining: 449ms
5:	learn: 0.0595848	total: 55.9ms	remaining: 410ms
6:	learn: 0.0518894	total: 62ms	remaining: 381ms
7:	learn: 0.0475659	total: 67.9ms	remaining: 357ms
8:	learn: 0.0447604	total: 73.7ms	remaining: 336ms
9:	learn: 0.0432637	total: 81.3ms	remaining: 325ms
10:	learn: 0.0422424	total: 89.7ms	remaining: 318ms
11:	learn: 0.0416564	total: 95.6ms	remaining: 303ms
12:	learn: 0.0413263	total: 101ms	remaining: 289ms
13:	learn: 0.0412523	total: 107ms	remaining: 275ms
14:	learn: 0.0411131	total: 118ms	remaining: 275ms
15:	learn: 0.0410626	total: 128ms	remaining: 273ms
16:	learn: 0.0409493	total: 138ms	remaining: 268ms
17:	learn: 0.0409305	total: 144ms	remaining: 256ms
18:	learn: 0.0408789	total: 150ms	remaining: 245ms
19:	learn: 0.0408535	total: 156

<catboost.core.CatBoostClassifier at 0x7f8c69bd9f40>

In [9]:
clf = CatBoostClassifier(
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=True,
)

In [10]:
clf.fit(X_train, Y_train)

0:	learn: 0.4731252	total: 70.9ms	remaining: 1m 10s
1:	learn: 0.3440819	total: 81.2ms	remaining: 40.5s
2:	learn: 0.2476283	total: 92.9ms	remaining: 30.9s
3:	learn: 0.1769926	total: 104ms	remaining: 25.9s
4:	learn: 0.1303245	total: 115ms	remaining: 22.9s
5:	learn: 0.1018797	total: 128ms	remaining: 21.2s
6:	learn: 0.0841670	total: 152ms	remaining: 21.5s
7:	learn: 0.0741186	total: 163ms	remaining: 20.2s
8:	learn: 0.0673791	total: 173ms	remaining: 19.1s
9:	learn: 0.0624869	total: 185ms	remaining: 18.3s
10:	learn: 0.0586489	total: 197ms	remaining: 17.7s
11:	learn: 0.0556407	total: 209ms	remaining: 17.2s
12:	learn: 0.0535759	total: 220ms	remaining: 16.7s
13:	learn: 0.0509006	total: 243ms	remaining: 17.1s
14:	learn: 0.0490034	total: 257ms	remaining: 16.9s
15:	learn: 0.0474436	total: 268ms	remaining: 16.5s
16:	learn: 0.0462869	total: 280ms	remaining: 16.2s
17:	learn: 0.0455765	total: 295ms	remaining: 16.1s
18:	learn: 0.0449842	total: 306ms	remaining: 15.8s
19:	learn: 0.0442731	total: 319ms	rem

<catboost.core.CatBoostClassifier at 0x7f9a68cad8b0>

In [11]:
def get_f1_score(model):
  Y_train_pred = model.predict(X_train)
  Y_test_pred = model.predict(X_test)

  # Here, the f1-score will be used to assess the performances on the leaderboard
  print(model.__class__.__name__)
  print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
  print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

In [12]:
get_f1_score(clf)

CatBoostClassifier
f1-score on train set :  0.7841284128412842
f1-score on test set :  0.7473053892215569


### Submissions

In [13]:
X = np.append(X_train,X_test,axis=0)
Y = np.append(Y_train,Y_test)

In [14]:
clf.fit(X, Y)

0:	learn: 0.4725738	total: 19.9ms	remaining: 19.9s
1:	learn: 0.3439859	total: 34.2ms	remaining: 17.1s
2:	learn: 0.2359434	total: 47.1ms	remaining: 15.7s
3:	learn: 0.1664424	total: 60.5ms	remaining: 15.1s
4:	learn: 0.1269544	total: 73.7ms	remaining: 14.7s
5:	learn: 0.1001209	total: 92.5ms	remaining: 15.3s
6:	learn: 0.0843424	total: 113ms	remaining: 16.1s
7:	learn: 0.0743304	total: 127ms	remaining: 15.7s
8:	learn: 0.0675003	total: 140ms	remaining: 15.4s
9:	learn: 0.0608943	total: 155ms	remaining: 15.3s
10:	learn: 0.0559631	total: 168ms	remaining: 15.1s
11:	learn: 0.0523154	total: 182ms	remaining: 15s
12:	learn: 0.0502727	total: 210ms	remaining: 15.9s
13:	learn: 0.0484603	total: 225ms	remaining: 15.9s
14:	learn: 0.0476349	total: 243ms	remaining: 15.9s
15:	learn: 0.0465052	total: 259ms	remaining: 16s
16:	learn: 0.0459196	total: 276ms	remaining: 16s
17:	learn: 0.0446388	total: 315ms	remaining: 17.2s
18:	learn: 0.0442208	total: 333ms	remaining: 17.2s
19:	learn: 0.0437916	total: 350ms	remaini

<catboost.core.CatBoostClassifier at 0x7f9a68cad8b0>

In [15]:
# Read data without labels
data_without_labels = pd.read_csv('data/conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

Prediction set (without labels) : (31620, 5)


In [16]:
# apply the preprocessing
X_without_labels = preprocessor.transform(data_without_labels)

data = {
    # 'converted': automl.predict(data_without_labels)
    'converted': clf.predict(X_without_labels)
}

Y_predictions = pd.DataFrame(columns=['converted'],data=data)
Y_predictions.to_csv('submissions/conversion_data_test_predictions_Alexon_V30.csv', index=False)
# V21, V30

In [64]:
pickle.dump(clf, open('models/catboost_v_21', 'wb'))

In [17]:
pickle.dump(clf, open('models/catboost_v_30', 'wb'))