In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.io as pio
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
# import ensemble methods
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    confusion_matrix
) 

In [2]:
df = pd.read_csv('conversion_data_train.csv')
print('Set with labels (our train+test) :', df.shape)

Set with labels (our train+test) : (284580, 6)


In [3]:
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [4]:
df.describe(include="all")

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
count,284580,284580.0,284580.0,284580,284580.0,284580.0
unique,4,,,3,,
top,US,,,Seo,,
freq,160124,,,139477,,
mean,,30.564203,0.685452,,4.873252,0.032258
std,,8.266789,0.464336,,3.341995,0.176685
min,,17.0,0.0,,1.0,0.0
25%,,24.0,0.0,,2.0,0.0
50%,,30.0,1.0,,4.0,0.0
75%,,36.0,1.0,,7.0,0.0


In [5]:
data = df.sample(frac=0.2, random_state=42)

In [6]:
# Convertir les colonnes catégorielles en variables numériques
data['country'] = data['country'].astype('category').cat.codes
data['new_user'] = data['new_user'].astype('category').cat.codes
data['source'] = data['source'].astype('category').cat.codes
data['converted'] = data['converted'].astype('category').cat.codes

# Calculer la matrice de corrélation
correlation_matrix = data.corr()

# Convertir la matrice de corrélation en DataFrame pour Plotly
correlation_df = correlation_matrix.reset_index().melt(id_vars='index')

# Renommer les colonnes pour Plotly
correlation_df.columns = ['Feature1', 'Feature2', 'Correlation']

# Créer la heatmap avec Plotly Express
fig = px.imshow(
    correlation_matrix,
    labels=dict(x="Features", y="Features", color="Correlation"),
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    color_continuous_scale=px.colors.diverging.RdBu,
    zmin=-1, zmax=1
)

fig.update_layout(
    title="Matrice de Corrélation",
    autosize=False,
    width=800,
    height=600,
    margin=dict(l=50, r=50, b=100, t=100),
)

fig.show()

On peut voir grâce à cette matrice de confusion que les features les plus correlés avec la colonne "converted" sont :
* total_pages_visited
* new_user
* age

In [7]:
# Graphique box plot pour converted vs total_pages_visited
fig4 = px.box(data, x='converted', y='total_pages_visited', 
              title="Relation entre 'converted' et 'total_pages_visited'", 
              labels={'total_pages_visited': 'Total Pages Visited', 'converted': 'Converted'})
fig4.show()


On remarque que plus le nombre de page visitées est elevés, plus la possibilité de conversion est élevés. 

In [8]:
fig = px.pie(data, values="converted", 
             names="new_user", 
             title="répartition des conversions si c'est un nouvel utilisateur")
fig.show()

In [9]:
# Graphique box plot pour converted vs age
fig6 = px.box(data, x='converted', y='age', 
              title="Relation entre 'converted' et 'age'", 
              labels={'age': 'Age', 'converted': 'Converted'})
fig6.show()


# MODEL SELECTION

In [10]:
features_list = ['age', 'new_user', 'total_pages_visited']
numeric_indices = [0, 1, 2]
target_variable = 'converted'

In [11]:
X = data.loc[:, features_list]
y = data.loc[:, target_variable]

print('Explanatory variables : ', X.columns)
print()

Explanatory variables :  Index(['age', 'new_user', 'total_pages_visited'], dtype='object')



In [12]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [13]:
# Create pipeline for numeric features
numeric_features = ['age', 'new_user', 'total_pages_visited']  # Names of numeric columns in X_train/X_test
numeric_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="mean"),
        ),  # missing values will be replaced by columns' median
        ("scaler", StandardScaler()),
    ]
)

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features)
    ]
)

In [14]:
X_train_df = pd.DataFrame(X_train)

# Convertir X_test en DataFrame Pandas
X_test_df = pd.DataFrame(X_test)

X_train = preprocessor.fit_transform(X_train_df)
X_test = preprocessor.transform(X_test_df) 

In [15]:
scores_df = pd.DataFrame(columns=['model', 'accuracy', 'set', 'f1_score'])

In [16]:
# Perform grid search
print("Grid search...")
logistic_regression = LogisticRegression(max_iter=1000)  # max_iter changed because of convergence warning
model = BaggingClassifier(estimator=logistic_regression)

# Grid of values to be tested
params = {
    "estimator__C": [ 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0],  # base_estimator__ prefix because C is a parameter from LogisticRegression!
    "n_estimators": [ 5, 10, 20, 40, 60, 80, 100],  # n_estimators is a hyperparameter of the ensemble method
}
print(params)
gridsearch = GridSearchCV(model, param_grid=params, cv=3)  # cv : the number of folds to be used for CV
gridsearch.fit(X_train, y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)
print()
print("Accuracy on training set : ", gridsearch.score(X_train, y_train))
print("Accuracy on test set : ", gridsearch.score(X_test, y_test))

Grid search...
{'estimator__C': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0], 'n_estimators': [5, 10, 20, 40, 60, 80, 100]}
...Done.
Best hyperparameters :  {'estimator__C': 0.1, 'n_estimators': 5}
Best validation accuracy :  0.9850873917434164

Accuracy on training set :  0.984933673021172
Accuracy on test set :  0.9851546029515109


In [17]:
# Predictions on training set
print("Predictions on training set...")
y_train_pred_bag = gridsearch.predict(X_train)
print("...Done.")
print(y_train_pred_bag)
print()

# Predictions on test set
print("Predictions on test set...")
y_test_pred_bag = gridsearch.predict(X_test)
print("...Done.")
print(y_test_pred_bag)
print()

# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(y_train, y_train_pred_bag))
print("f1-score on test set : ", f1_score(y_test, y_test_pred_bag))

train_scores_df = pd.DataFrame({'model': ['bagging_class'], 'accuracy': [gridsearch.score(X_train, y_train)], 'set': ['train'], 'f1_score': [f1_score(y_train, y_train_pred_bag)]})
test_scores_df = pd.DataFrame({'model': ['bagging_class'], 'accuracy': [gridsearch.score(X_test, y_test)], 'set': ['test'], 'f1_score': [f1_score(y_test, y_test_pred_bag)]})
scores_df = pd.concat([scores_df, train_scores_df, test_scores_df], ignore_index=True)

scores_df


Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on train set :  0.7419112114371708
f1-score on test set :  0.7466266866566716


Unnamed: 0,model,accuracy,set,f1_score
0,bagging_class,0.984934,train,0.741911
1,bagging_class,0.985155,test,0.746627


In [18]:
print("Grid search...")
classifier = RandomForestClassifier()

# Grid of values to be tested
params = {
    "max_depth": [2, 4, 6, 8, 10],
    "min_samples_leaf": [1, 2, 5],
    "min_samples_split": [2, 4, 8],
    "n_estimators": [10, 20, 40, 60, 80, 100],
}
gridsearch = GridSearchCV(
    classifier, param_grid=params, cv=3
)  # cv : the number of folds to be used for CV
gridsearch.fit(X_train, y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)
print("Accuracy on training set : ", gridsearch.score(X_train, y_train))
print("Accuracy on test set : ", gridsearch.score(X_test, y_test))

Grid search...
...Done.
Best hyperparameters :  {'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 10}
Best validation accuracy :  0.9844724492681213
Accuracy on training set :  0.9850434859000263
Accuracy on test set :  0.9843640196767393


In [19]:
# Predictions on training set
print("Predictions on training set...")
y_train_pred_random = gridsearch.predict(X_train)
print("...Done.")
print(y_train_pred_random)
print()

# Predictions on test set
print("Predictions on test set...")
y_test_pred_random = gridsearch.predict(X_test)
print("...Done.")
print(y_test_pred_random)
print()

# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(y_train, y_train_pred_random))
print("f1-score on test set : ", f1_score(y_test, y_test_pred_random))

train_scores_df = pd.DataFrame({'model': ['random_forest'], 'accuracy': [gridsearch.score(X_train, y_train)], 'set': ['train'], 'f1_score': [f1_score(y_train, y_train_pred_random)]})
test_scores_df = pd.DataFrame({'model': ['random_forest'], 'accuracy': [gridsearch.score(X_test, y_test)], 'set': ['test'], 'f1_score': [f1_score(y_test, y_test_pred_random)]})
scores_df = pd.concat([scores_df, train_scores_df, test_scores_df], ignore_index=True)

scores_df

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on train set :  0.7442733758918513
f1-score on test set :  0.7327327327327328


Unnamed: 0,model,accuracy,set,f1_score
0,bagging_class,0.984934,train,0.741911
1,bagging_class,0.985155,test,0.746627
2,random_forest,0.985043,train,0.744273
3,random_forest,0.984364,test,0.732733


In [20]:
# Perform grid search
print("Grid search...")
logistic_regression = LogisticRegression(max_iter=1000)  # max_iter changed because of convergence warning
model = AdaBoostClassifier(estimator=logistic_regression)

# Grid of values to be tested
params = {
    "estimator__C": [ 4.0, 4.5, 5.0, 5.5, 6.0, 6.5],  # base_estimator__ prefix because C is a parameter from LogisticRegression!
    "n_estimators": [ 45, 50, 55, 60, 65, 70, 75],  # n_estimators is a hyperparameter of the ensemble method
}
print(params)
gridsearch = GridSearchCV(model, param_grid=params, cv=3)  # cv : the number of folds to be used for CV
gridsearch.fit(X_train, y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)
print()
print("Accuracy on training set : ", gridsearch.score(X_train, y_train))
print("Accuracy on test set : ", gridsearch.score(X_test, y_test))

Grid search...
{'estimator__C': [4.0, 4.5, 5.0, 5.5, 6.0, 6.5], 'n_estimators': [45, 50, 55, 60, 65, 70, 75]}
...Done.
Best hyperparameters :  {'estimator__C': 4.0, 'n_estimators': 45}
Best validation accuracy :  0.9848677727383137

Accuracy on training set :  0.984933673021172
Accuracy on test set :  0.9853302881236824


In [21]:
# Predictions on training set
print("Predictions on training set...")
y_train_pred_adaboost = gridsearch.predict(X_train)
print("...Done.")
print(y_train_pred_adaboost)
print()

# Predictions on test set
print("Predictions on test set...")
y_test_pred_adaboost = gridsearch.predict(X_test)
print("...Done.")
print(y_test_pred_adaboost)
print()

# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(y_train, y_train_pred_adaboost))
print("f1-score on test set : ", f1_score(y_test, y_test_pred_adaboost))

train_scores_df = pd.DataFrame({'model': ['adaboost_baglog'], 'accuracy': [gridsearch.score(X_train, y_train)], 'set': ['train'], 'f1_score': [f1_score(y_train, y_train_pred_adaboost)]})
test_scores_df = pd.DataFrame({'model': ['adaboost_baglog'], 'accuracy': [gridsearch.score(X_test, y_test)], 'set': ['test'], 'f1_score': [f1_score(y_test, y_test_pred_adaboost)]})
scores_df = pd.concat([scores_df, train_scores_df, test_scores_df], ignore_index=True)

scores_df

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on train set :  0.7436472346786248
f1-score on test set :  0.7518573551263001


Unnamed: 0,model,accuracy,set,f1_score
0,bagging_class,0.984934,train,0.741911
1,bagging_class,0.985155,test,0.746627
2,random_forest,0.985043,train,0.744273
3,random_forest,0.984364,test,0.732733
4,adaboost_baglog,0.984934,train,0.743647
5,adaboost_baglog,0.98533,test,0.751857


In [22]:
print("Grid search...")
model = GradientBoostingClassifier()

# Grid of values to be tested
params = {
    "max_depth": [8, 10, 12],  # no base_estimator_ prefix because these are all arguments of GradientBoostingClassifier
    "min_samples_leaf": [1, 2, 3],
    "min_samples_split": [6, 8, 10],
    "n_estimators": [2, 4, 6, 8, 10, 12],
}
print(params)
gridsearch = GridSearchCV(
    model, param_grid=params, cv=3
)  # cv : the number of folds to be used for CV
gridsearch.fit(X_train, y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)
print()
print("Accuracy on training set : ", gridsearch.score(X_train, y_train))
print("Accuracy on test set : ", gridsearch.score(X_test, y_test))

Grid search...
{'max_depth': [8, 10, 12], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [6, 8, 10], 'n_estimators': [2, 4, 6, 8, 10, 12]}
...Done.
Best hyperparameters :  {'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 12}
Best validation accuracy :  0.9834841492748346

Accuracy on training set :  0.9859878766581744
Accuracy on test set :  0.9835734364019677


In [23]:
# Predictions on training set
print("Predictions on training set...")
y_train_pred_gradient = gridsearch.predict(X_train)
print("...Done.")
print(y_train_pred_gradient)
print()

# Predictions on test set
print("Predictions on test set...")
y_test_pred_gradient = gridsearch.predict(X_test)
print("...Done.")
print(y_test_pred_gradient)
print()

# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(y_train, y_train_pred_gradient))
print("f1-score on test set : ", f1_score(y_test, y_test_pred_gradient))

train_scores_df = pd.DataFrame({'model': ['gradient_boost'], 'accuracy': [gridsearch.score(X_train, y_train)], 'set': ['train'], 'f1_score': [f1_score(y_train, y_train_pred_gradient)]})
test_scores_df = pd.DataFrame({'model': ['gradient_boost'], 'accuracy': [gridsearch.score(X_test, y_test)], 'set': ['test'], 'f1_score': [f1_score(y_test, y_test_pred_gradient)]})
scores_df = pd.concat([scores_df, train_scores_df, test_scores_df], ignore_index=True)

scores_df


Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on train set :  0.747026169706582
f1-score on test set :  0.7045813586097945


Unnamed: 0,model,accuracy,set,f1_score
0,bagging_class,0.984934,train,0.741911
1,bagging_class,0.985155,test,0.746627
2,random_forest,0.985043,train,0.744273
3,random_forest,0.984364,test,0.732733
4,adaboost_baglog,0.984934,train,0.743647
5,adaboost_baglog,0.98533,test,0.751857
6,gradient_boost,0.985988,train,0.747026
7,gradient_boost,0.983573,test,0.704581


In [24]:
# You can also check more performance metrics to better understand what your model is doing
print("Confusion matrix on train set : ")
print(confusion_matrix(y_train, y_train_pred_adaboost))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(y_test, y_test_pred_adaboost))
print()

Confusion matrix on train set : 
[[43851   195]
 [  491   995]]

Confusion matrix on test set : 
[[10964    48]
 [  119   253]]



In [28]:
# Calculer les matrices de confusion
cm_train = confusion_matrix(y_train, y_train_pred_adaboost)
cm_test = confusion_matrix(y_test, y_test_pred_adaboost)

# Fonction pour créer une heatmap avec Plotly
def plot_confusion_matrix(cm, title):
    fig = go.Figure(data=go.Heatmap(
        z=cm,
        x=['Predicted Negative', 'Predicted Positive'],
        y=['Actual Negative', 'Actual Positive'],
        colorscale='Viridis',
        showscale=True
    ))

    # Ajouter des annotations
    annotations = []
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            annotations.append(
                go.layout.Annotation(
                    text=str(cm[i][j]),
                    x=j,
                    y=i,
                    xref='x1',
                    yref='y1',
                    showarrow=False,
                    font=dict(color="white" if cm[i][j] > cm.max() / 2 else "black")
                )
            )

    fig.update_layout(
        title=title,
        xaxis=dict(title='Predicted label'),
        yaxis=dict(title='True label'),
        autosize=False,
        width=500,
        height=500,
        annotations=annotations
    )

    fig.show()

# Visualiser les matrices de confusion
plot_confusion_matrix(cm_train, "Confusion Matrix on Train set")
plot_confusion_matrix(cm_test, "Confusion Matrix on Test set")

In [29]:
# Concatenate our train and test set to train your best classifier on all data with labels
X = np.append(X_train,X_test,axis=0)
y = np.append(y_train,y_test)

classifier.fit(X,y)

In [30]:
# Read data without labels
data_without_labels = pd.read_csv('conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

# Warning : check consistency of features_list (must be the same than the features 
# used by your best classifier)
features_list = ['age', 'new_user', 'total_pages_visited']
X_without_labels = data_without_labels.loc[:, features_list]

# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_without_labels = X_without_labels.values
print("...Done")

print(X_without_labels[0:5,:])

Prediction set (without labels) : (31620, 5)
Convert pandas DataFrames to numpy arrays...
...Done
[[28  0 16]
 [22  1  5]
 [32  1  1]
 [32  1  6]
 [25  0  3]]


In [31]:
# Sanity check (no target)
data_without_labels.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited
0,UK,28,0,Seo,16
1,UK,22,1,Direct,5
2,China,32,1,Seo,1
3,US,32,1,Ads,6
4,China,25,0,Seo,3


In [32]:
# Create pipeline for numeric features
numeric_features = [0, 1, 2]  # Names of numeric columns in X_train/X_test
numeric_transformer = Pipeline(
    steps=[  # missing values will be replaced by columns' median
        ("scaler", StandardScaler()),
    ]
)

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features)
    ]
)

# WARNING : PUT HERE THE SAME PREPROCESSING AS FOR YOUR TEST SET
# CHECK YOU ARE USING X_without_labels
print("Encoding categorical features and standardizing numerical features...")
preprocessor.fit(X_without_labels)

# Transform the data without labels
X_without_labels_preprocessed = preprocessor.transform(X_without_labels)

# Print the preprocessed data
print(X_without_labels_preprocessed)


Encoding categorical features and standardizing numerical features...
[[-0.31512217 -1.47663353  3.33913917]
 [-1.03657046  0.6772161   0.03888347]
 [ 0.16584336  0.6772161  -1.16120951]
 ...
 [ 0.28608475  0.6772161   0.03888347]
 [-0.67584631  0.6772161   2.73909268]
 [-1.03657046  0.6772161  -0.86118627]]


In [34]:
# Make predictions and dump to file
# WARNING : MAKE SURE THE FILE IS A CSV WITH ONE COLUMN NAMED 'converted' AND NO INDEX !
# WARNING : FILE NAME MUST HAVE FORMAT 'conversion_data_test_predictions_[name].csv'
# where [name] is the name of your team/model separated by a '-'
# For example : [name] = AURELIE-model1
data = {
    'converted': gridsearch.predict(X_without_labels_preprocessed)
}

Y_predictions = pd.DataFrame(columns=['converted'],data=data)
Y_predictions.to_csv('conversion_data_test_predictions_Clement-baglogreg_final.csv', index=False)
