# Challenge : predict conversions 🏆🏆

This is the template that shows the different steps of the challenge. In this notebook, all the training/predictions steps are implemented for a very basic model (logistic regression with only one variable). Please use this template and feel free to change the preprocessing/training steps to get the model with the best f1-score ! May the force be with you 🧨🧨  

**For a detailed description of this project, please refer to *02-Conversion_rate_challenge.ipynb*.**

# Import libraries

In [1]:
!pip install plotly



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix,roc_curve
from sklearn.model_selection import cross_val_score, GridSearchCV

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
pio.renderers.default = "iframe" # to be replaced by "iframe" if working on JULIE
from IPython.display import display

# Read file with labels

In [3]:
dataset = pd.read_csv('conversion_data_train.csv')
print('Set with labels (our train+test) :', dataset.shape)

Set with labels (our train+test) : (284580, 6)


In [4]:
dataset.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [5]:
desc = dataset.describe(include='all')
display(desc)
print(dataset.shape)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
count,284580,284580.0,284580.0,284580,284580.0,284580.0
unique,4,,,3,,
top,US,,,Seo,,
freq,160124,,,139477,,
mean,,30.564203,0.685452,,4.873252,0.032258
std,,8.266789,0.464336,,3.341995,0.176685
min,,17.0,0.0,,1.0,0.0
25%,,24.0,0.0,,2.0,0.0
50%,,30.0,1.0,,4.0,0.0
75%,,36.0,1.0,,7.0,0.0


(284580, 6)


In [6]:
display(100*dataset.isnull().sum()/dataset.shape[0]) #no missing values

country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64

In [7]:
#Removing outlier from age
dataset = dataset.loc[dataset['age'] < 70,:]

dataset['new_user'].replace({1 : 'Yes', 0 : 'No'}, inplace = True)

# Explore dataset

In [8]:
# The dataset is quite big : you must create a sample of the dataset before making any visualizations !
#data_sample = data.sample(10000)

In [9]:
data=dataset.sample(1000)

In [10]:
#Répartition des utilisateurs par pays
plt.figure(figsize=(16,5))
plt.subplot(121) 
ax1 = sns.countplot(x='country', data=data, palette='mako')
ax1.set_title('Répartition des utilisateurs par pays', fontsize = 12,fontweight='bold' )
ax1.set_xlabel('Pays', fontsize = 11)
ax1.set_ylabel('Nb utilisateurs', fontsize = 11)
for rect in ax1.patches:
    ax1.text (rect.get_x() + rect.get_width()  / 2,rect.get_height()+ 5,rect.get_height(),horizontalalignment='center', fontsize = 10)
    
plt.subplot(122)
ax2 = sns.barplot(x='country', y='converted', data=data, palette='mako')
ax2.set_title('% des visiteurs convertis par pays', fontsize = 12,fontweight='bold')
ax2.set_xlabel('Pays', fontsize = 13)
ax2.set_ylabel('Convertion, %')

SyntaxError: invalid syntax (<ipython-input-10-db23614dda80>, line 7)

In [None]:
#Répartition des utilisateurs par source
plt.figure(figsize=(16,5))
plt.subplot(121) 
ax1 = sns.countplot(x='source', data=data, palette='Spectral')
ax1.set_title('Répartition des utilisateurs par source',fontsize = 12, fontweight='bold' )
ax1.set_xlabel('Source', fontsize = 11)
ax1.set_ylabel('Nb utilisateurs', fontsize = 11)
for rect in ax1.patches:
    ax1.text (rect.get_x() + rect.get_width()  / 2,rect.get_height()+ 5,rect.get_height(),horizontalalignment='center', fontsize = 10)
    
plt.subplot(122)
ax2 = sns.barplot(x="source", y='converted', data=data,  palette='Spectral')
ax2.set_title('% des visiteurs convertis par source ',fontsize = 11, fontweight='bold')
ax2.set_xlabel('Source', fontsize = 13)
ax2.set_ylabel('Convertion, %')
 

Visualisation des variables

In [None]:
fig, ax = plt.subplots(figsize=(16,5))
sns.distplot(data.age, ax=ax,norm_hist=True, bins=10)
plt.title("Distribution de l'âge des utilisateurs", fontsize=15, color='b')
plt.show()

In [None]:
plt.figure(figsize=(18,6))
ax = sns.countplot(x = data["total_pages_visited"], palette='Spectral');
ax.set_ylabel("Nb de personnes")
ax.set_title("Repartition des pages visitées")
ax.set_xlabel("Total de pages visitées", fontsize=13)
ax.set_xticklabels(ax.get_xticklabels(), 
                    fontsize=10, 
                    rotation=0)
for rect in ax.patches:
    ax.text (rect.get_x() + rect.get_width()  / 2,rect.get_height()+ 2,rect.get_height(),horizontalalignment='center', fontsize = 10)
plt.show();

In [None]:
# Correlation matrix
plt.figure(figsize= (8,5))
sns.heatmap(data.corr(), annot= True,);

# Make your model

## Choose variables to use in the model, and create train and test sets
**From the EDA, we know that the most useful feature is total_pages_visited. Let's create a baseline model by using at first only this feature : in the next cells, we'll make preprocessings and train a simple (univariate) logistic regression.**

In [None]:
target_variable = 'converted'
features_list = ['country','age','new_user','source','total_pages_visited']
numeric_indices = [1,4]
categorical_indices = [0,2,3]
target_variable = 'converted'

In [None]:
# Separate target variable Y from features X
X = data.loc[:, features_list]
Y = data[target_variable]

print('Explanatory variables : ', X.columns)
print()

In [None]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

In [None]:
# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.to_list()
Y_test = Y_test.to_list()
print("...Done")

print(X_train[0:5,:])
print(X_test[0:2,:])
print()
print(Y_train[0:5])
print(Y_test[0:2])

## Training pipeline

In [None]:
# Encoding categorical features and standardizing numerical features
print("Encoding categorical features and standardizing numerical features...")
print()
print(X_train[0:5,:])

# Normalization
numeric_transformer = StandardScaler()

# OHE / dummyfication
categorical_transformer = OneHotEncoder(drop='first')

featureencoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_indices),    
        ('num', numeric_transformer, numeric_indices)
        ]
    )

X_train = featureencoder.fit_transform(X_train)
X_test = featureencoder.transform(X_test)
print("...Done")



In [None]:
# Perform 3-fold cross-validation to evaluate the generalized R2 score 
print("3-fold cross-validation...")
regressor = LogisticRegression()
scores = cross_val_score(regressor, X_train, Y_train, cv=3)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

In [None]:
# Perform grid search
print("Grid search...")

# Grid of values to be tested
params = {
    'C': np.arange(1,3,0.1) # 0 corresponds to no regularization
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 3,scoring="f1") # cv : the number of folds (repetitions) to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
# Predictions on training set
# The model has already be re-trained on all the training set at the end of the grid search, so we can directly use it !
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print(sklearn.metrics.classification_report(Y_train, gridsearch.predict(X_train)))

In [None]:
## Test pipeline

In [None]:
# Visualize ROC curves
probas_train = gridsearch.predict_proba(X_train)[:,1]
fpr, tpr, thresholds = roc_curve(Y_train, probas_train)
fig = go.Figure(
    data = go.Scatter(
        name = 'train',
        x = fpr, 
        y = tpr, 
        mode = 'lines'
    ),
    layout = go.Layout(
        title = go.layout.Title(text = "ROC curve", x = 0.5),
        xaxis = go.layout.XAxis(title = 'False Positive Rate'),
        yaxis = go.layout.YAxis(title = 'True Positive Rate')
    )
)

probas_test = gridsearch.predict_proba(X_test)[:,1]
precisions, recalls, thresholds = roc_curve(Y_test, probas_test)
fig.add_trace(go.Scatter(
    name = 'test',
    x = recalls, 
    y = precisions, 
    mode = 'lines'
    )
)
fig.show()

In [None]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print(sklearn.metrics.classification_report(Y_test, gridsearch.predict(X_test)))

**Our baseline model reaches a f1-score of almost 70%. Now, feel free to refine your model and try to beat this score ! 🚀🚀**

# Train best classifier on all data and use it to make predictions on X_without_labels
**Before making predictions on the file conversion_data_test.csv, let's train our model on ALL the data that was in conversion_data_train.csv. Sometimes, this allows to make tiny improvements in the score because we're using more examples to train the model.**

In [None]:
# Concatenate our train and test set to train your best classifier on all data with labels
X = np.append(X_train,X_test,axis=0)
Y = np.append(Y_train,Y_test)

gridsearch.fit(X,Y)

In [None]:
# Read data without labels
data_without_labels = pd.read_csv('conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

# Warning : check consistency of features_list (must be the same than the features 
# used by your best classifier)
features_list = ['country','age','new_user','source','total_pages_visited']
X_without_labels = data_without_labels.loc[:, features_list]

# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_without_labels = X_without_labels.values
print("...Done")

print(X_without_labels[0:5,:])

In [None]:
# WARNING : PUT HERE THE SAME PREPROCESSING AS FOR YOUR TEST SET
# CHECK YOU ARE USING X_without_labels
print("Encoding categorical features and standardizing numerical features...")

X_without_labels = featureencoder.transform(X_without_labels)
print("...Done")
print(X_without_labels[0:5,:])

In [None]:
# Make predictions and dump to file
# WARNING : MAKE SURE THE FILE IS A CSV WITH ONE COLUMN NAMED 'converted' AND NO INDEX !
# WARNING : FILE NAME MUST HAVE FORMAT 'conversion_data_test_predictions_[name].csv'
# where [name] is the name of your team/model separated by a '-'
# For example : [name] = AURELIE-model1
data = {
    'converted': gridsearch.predict(X_without_labels)
}

Y_predictions = pd.DataFrame(columns=['converted'],data=data)
Y_predictions.to_csv('conversion_data_test_predictions_EXAMPLE.csv', index=False)
