In [1]:
!pip install plotly



In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
pio.renderers.default = "svg" # to be replaced by "iframe" if working on JULIE
from IPython.display import display

In [3]:
data = pd.read_csv('Data/conversion_data_train.csv')
print('Set with labels (our train+test) :', data.shape)
data.head()

Set with labels (our train+test) : (284580, 6)


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


# 1. Logistic Regression with 2 features
###### "total_pages_visited" & "new_user"

###### f1-score on train set :  0.6938517686692869
###### f1-score on test set :  0.7060240963855423

In [4]:
features_list1 = ['total_pages_visited', 'new_user']
target_variable = 'converted'

X1 = data.loc[:, features_list1]
Y1 = data.loc[:, target_variable]

print(X1.head())
print()

# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, test_size=0.1, random_state=0)
print("...Done.")
print()

num_features1 = ["total_pages_visited"]
num_transformer1 = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Put here all the preprocessings
print("Encoding categorical features and standardizing numerical features...")

preprocessor = ColumnTransformer(transformers =[
    ("num_transformer", num_transformer1, num_features1)
])

X_train1 = preprocessor.fit_transform(X_train1)
X_train1[:5]

# Train model
print("Train model...")
classifier = LogisticRegression() # 
classifier.fit(X_train1, Y_train1)
print("...Done.")

# Predictions on training set
print("Predictions on training set...")
Y_train_pred1 = classifier.predict(X_train1)
print("...Done.")
print(Y_train_pred1)
print()

# Use X_test, and the same preprocessings as in training pipeline, 
# but call "transform()" instead of "fit_transform" methods (see example below)

print("Encoding categorical features and standardizing numerical features...")

X_test1 = preprocessor.transform(X_test1)
print("...Done")
print(X_test1[0:5,:])

# Predictions on test set
print("Predictions on test set...")
Y_test_pred1 = classifier.predict(X_test1)
print("...Done.")
print(Y_test_pred1)
print()

# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train1, Y_train_pred1))
print("f1-score on test set : ", f1_score(Y_test1, Y_test_pred1))

   total_pages_visited  new_user
0                    2         1
1                    3         1
2                   14         0
3                    3         1
4                    3         1

Dividing into train and test sets...
...Done.

Encoding categorical features and standardizing numerical features...
Train model...
...Done.
Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Encoding categorical features and standardizing numerical features...
...Done
[[ 0.63778957]
 [ 0.03879562]
 [-0.26070136]
 [-0.26070136]
 [ 0.63778957]]
Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on train set :  0.6938517686692869
f1-score on test set :  0.7060240963855423


# 2. Logistic Regression with 2 features
###### "total_pages_visited" & "country"

###### f1-score on train set :  0.7180902442389955
###### f1-score on test set :  0.7223880597014924

In [5]:
features_list2 = ['total_pages_visited', 'country']
target_variable = 'converted'

X2 = data.loc[:, features_list2]
Y2 = data.loc[:, target_variable]

print(X2.head())
print()

# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, test_size=0.1, random_state=0)
print("...Done.")
print()

num_features2 = ["total_pages_visited"]
num_transformer2 = Pipeline(steps=[
    ('scaler', StandardScaler())
])

cat_features2 = ["country"]
cat_transformer2 = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop="first"))
])

# Put here all the preprocessings
print("Encoding categorical features and standardizing numerical features...")

preprocessor2 = ColumnTransformer(transformers =[
    ("num_transformer", num_transformer2, num_features2),
    ("cat_transformer", cat_transformer2, cat_features2)
])

X_train2 = preprocessor2.fit_transform(X_train2)
X_train2[0:5,:]

# Train model
print("Train model...")
classifier2 = LogisticRegression() # 
classifier2.fit(X_train2, Y_train2)
print("...Done.")

# Predictions on training set
print("Predictions on training set...")
Y_train_pred2 = classifier2.predict(X_train2)
print("...Done.")
print(Y_train_pred2)
print()

# Use X_test, and the same preprocessings as in training pipeline, 
# but call "transform()" instead of "fit_transform" methods (see example below)

print("Encoding categorical features and standardizing numerical features...")

X_test2 = preprocessor2.transform(X_test2)
print("...Done")
print(X_test2[0:5,:])

# Predictions on test set
print("Predictions on test set...")
Y_test_pred2 = classifier2.predict(X_test2)
print("...Done.")
print(Y_test_pred2)
print()

# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train2, Y_train_pred2))
print("f1-score on test set : ", f1_score(Y_test2, Y_test_pred2))

   total_pages_visited  country
0                    2    China
1                    3       UK
2                   14  Germany
3                    3       US
4                    3       US

Dividing into train and test sets...
...Done.

Encoding categorical features and standardizing numerical features...
Train model...
...Done.
Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Encoding categorical features and standardizing numerical features...
...Done
[[ 0.63778957  0.          0.          1.        ]
 [ 0.03879562  0.          0.          1.        ]
 [-0.26070136  0.          0.          1.        ]
 [-0.26070136  1.          0.          0.        ]
 [ 0.63778957  0.          0.          1.        ]]
Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on train set :  0.7180902442389955
f1-score on test set :  0.7223880597014924


# 2. Logistic Regression with 2 features
###### "total_pages_visited" & "age"

###### f1-score on train set : 0.7049306088290676 
###### f1-score on test set : 0.7150635208711434 

In [8]:
features_list3 = ['total_pages_visited', 'age']
target_variable = 'converted'


X3 = data.loc[:, features_list3]
Y3 = data.loc[:, target_variable]

print('Explanatory variables : ', X3.columns)
print()

# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X3, Y3, test_size=0.1, random_state=0)
print("...Done.")
print()


num_features3 = ["total_pages_visited", "age"]
num_transformer3 = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Put here all the preprocessings
print("Encoding categorical features and standardizing numerical features...")

preprocessor3 = ColumnTransformer(transformers =[
    ("num_transformer", num_transformer3, num_features3)
])

X_train3 = preprocessor3.fit_transform(X_train3)
X_train3[:5]

# Train model
print("Train model...")
classifier3 = LogisticRegression() # 
classifier3.fit(X_train3, Y_train3)
print("...Done.")

# Predictions on training set
print("Predictions on training set...")
Y_train_pred3 = classifier3.predict(X_train3)
print("...Done.")
print(Y_train_pred3)
print()

# Use X_test, and the same preprocessings as in training pipeline, 
# but call "transform()" instead of "fit_transform" methods (see example below)

print("Encoding categorical features and standardizing numerical features...")

X_test3 = preprocessor3.transform(X_test3)
print("...Done")
print(X_test3[0:5,:])

# Predictions on test set
print("Predictions on test set...")
Y_test_pred3 = classifier3.predict(X_test3)
print("...Done.")
print(Y_test_pred3)
print()

# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train3, Y_train_pred3))
print("f1-score on test set : ", f1_score(Y_test3, Y_test_pred3))

Explanatory variables :  Index(['total_pages_visited', 'age'], dtype='object')

Dividing into train and test sets...
...Done.

Encoding categorical features and standardizing numerical features...
Train model...
...Done.
Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Encoding categorical features and standardizing numerical features...
...Done
[[ 0.63778957 -1.27809213]
 [ 0.03879562  0.05264881]
 [-0.26070136 -0.31028053]
 [-0.26070136 -0.67320988]
 [ 0.63778957  1.62534265]]
Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on train set :  0.7049306088290676
f1-score on test set :  0.7150635208711434


# 4. Logistic Regression with 2 features
###### "total_pages_visited" & "source"

###### f1-score on train set : 0.6938517686692869
###### f1-score on test set : 0.7060240963855423 

In [9]:
features_list4 = ['total_pages_visited', 'source']
target_variable = 'converted'


X4 = data.loc[:, features_list4]
Y4 = data.loc[:, target_variable]

print('Explanatory variables : ', X4.columns)
print()

# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(X4, Y4, test_size=0.1, random_state=0)
print("...Done.")
print()


num_features4 = ["total_pages_visited"]
num_transformer4 = Pipeline(steps=[
    ('scaler', StandardScaler())
])

cat_features4 = ["source"]
cat_transformer4 = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop="first"))
])


# Put here all the preprocessings
print("Encoding categorical features and standardizing numerical features...")

preprocessor4 = ColumnTransformer(transformers =[
    ("num_transformer", num_transformer4, num_features4),
    ("cat_transformer", cat_transformer4, cat_features4)
])

X_train4 = preprocessor4.fit_transform(X_train4)
X_train4[:5]

# Train model
print("Train model...")
classifier4 = LogisticRegression() # 
classifier4.fit(X_train4, Y_train4)
print("...Done.")

# Predictions on training set
print("Predictions on training set...")
Y_train_pred4 = classifier4.predict(X_train4)
print("...Done.")
print(Y_train_pred4)
print()

# Use X_test, and the same preprocessings as in training pipeline, 
# but call "transform()" instead of "fit_transform" methods (see example below)

print("Encoding categorical features and standardizing numerical features...")

X_test4 = preprocessor4.transform(X_test4)
print("...Done")
print(X_test4[0:5,:])

# Predictions on test set
print("Predictions on test set...")
Y_test_pred4 = classifier4.predict(X_test4)
print("...Done.")
print(Y_test_pred4)
print()

# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train4, Y_train_pred4))
print("f1-score on test set : ", f1_score(Y_test4, Y_test_pred4))

Explanatory variables :  Index(['total_pages_visited', 'source'], dtype='object')

Dividing into train and test sets...
...Done.

Encoding categorical features and standardizing numerical features...
Train model...
...Done.
Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Encoding categorical features and standardizing numerical features...
...Done
[[ 0.63778957  0.          0.        ]
 [ 0.03879562  0.          1.        ]
 [-0.26070136  0.          1.        ]
 [-0.26070136  0.          0.        ]
 [ 0.63778957  0.          0.        ]]
Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on train set :  0.6938517686692869
f1-score on test set :  0.7060240963855423


# 5. Decision Tree on all features

###### f1-score on train set : 0.7983466594389484
###### f1-score on test set : 0.7222562844880442  

In [13]:
# Separate target variable Y from features X
#print("Separating labels from features...")
#features_list = ["total_pages_visited"]
#target_variable = "converted"

X5 = data.iloc[:,:-1]
Y5 = data.iloc[:,-1:]

print("...Done.")
print()

print('Y : ')
print(Y5.head())
print()
print('X :')
print(X5.head())


# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train5, X_test5, Y_train5, Y_test5 = train_test_split(X5, Y5, test_size=0.1, random_state=0, stratify=Y5)
print("...Done.")
print()


num_features5 = ["age", "total_pages_visited"]
num_transformer5 = Pipeline(steps=[
    ('scaler', StandardScaler())
])

cat_features5 = ["country", "new_user", "source"]
cat_transformer5 = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop="first"))
])

# Put here all the preprocessings
print("Encoding categorical features and standardizing numerical features...")

preprocessor5 = ColumnTransformer(transformers =[
    ("num_transformer", num_transformer5, num_features5),
    ("cat_transformer", cat_transformer5, cat_features5)
])

X_train5 = preprocessor5.fit_transform(X_train5)
X_train5[:5]

# Train model
print("Train model...")
classifier5 = DecisionTreeClassifier() # 
classifier5.fit(X_train5, Y_train5)
print("...Done.")

# Predictions on training set
print("Predictions on training set...")
Y_train_pred5 = classifier5.predict(X_train5)
print("...Done.")
print(Y_train_pred5)
print()

# Use X_test, and the same preprocessings as in training pipeline, 
# but call "transform()" instead of "fit_transform" methods (see example below)

print("Encoding categorical features and standardizing numerical features...")

X_test5 = preprocessor5.transform(X_test5)
print("...Done")
print(X_test5[0:5,:])

# Predictions on test set
print("Predictions on test set...")
Y_test_pred5 = classifier5.predict(X_test5)
print("...Done.")
print(Y_test_pred5)
print()

# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train5, Y_train_pred5))
print("f1-score on test set : ", f1_score(Y_test5, Y_test_pred5))

...Done.

Y : 
   converted
0          0
1          0
2          1
3          0
4          0

X :
   country  age  new_user  source  total_pages_visited
0    China   22         1  Direct                    2
1       UK   21         1     Ads                    3
2  Germany   20         0     Seo                   14
3       US   23         1     Seo                    3
4       US   28         1  Direct                    3
Dividing into train and test sets...
...Done.

Encoding categorical features and standardizing numerical features...
Train model...
...Done.
Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Encoding categorical features and standardizing numerical features...
...Done
[[-0.79365434  2.73051047  0.          0.          0.          1.
   0.          0.        ]
 [ 0.53653379  0.03808136  0.          0.          1.          0.
   1.          0.        ]
 [-0.18902337 -0.26107743  0.          0.          1.          0.
   1.          0.        ]
 [ 0.05282902  