In [60]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score # confusion_matrix, 
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler     #, LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV



from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm          import LinearSVC, SVC
from xgboost              import XGBClassifier   




from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from sklearn.neighbors import NearestCentroid
# from sklearn.impute import KNNImputer


# A essayer dans cet ordre : lazypredict
# https://lazypredict.readthedocs.io/en/latest/
# https://pub.towardsai.net/how-to-use-the-lazypredict-python-library-to-select-the-best-machine-learning-model-in-one-line-4c9e730058b

# J'ai pas les paramètres
# LinearSVC : https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
# Perceptron
# LogisiticRegression : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# XGBClassifier
# SVC                 : https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html


k_target = "converted"
k_samples_ratio = 10/100  # percentage of observation to be taken into account. Pass 100/100 for final testing 
k_test_size     = 20/100  # see train_test_split
k_random_state  = 42   # you know why...




In [61]:
# LinearSVC defaut
# f1 		     precision 	  recall
# 0.752855 	 0.858605 	  0.670298

# LinearSVC suite lecture https://pub.towardsai.net/how-to-use-the-lazypredict-python-library-to-select-the-best-machine-learning-model-in-one-line-4c9e730058b
# f1 		 precision 	 recall
# 0.753464 	 0.858403 	 0.671387






# Perceptron defaut
# f1 		      precision 	recall
# 0.649931 	  0.888539 	  0.512346


# Regression logistic near par defaut
# f1 		      precision 	recall
# 0.758731 	  0.848294 	  0.686275

# SVC defaut - 6 min
# f1 		 precision 	 recall
# 0.743723 	 0.867797 	 0.650690



# XGBClassifier 
# 27 min
# params = {
    # "max_depth"         : [1,2,5,10,20,50],  
    # "min_child_weight"  : [1,2,5,10,20,50],              # similar to min_samples_leaf and min_samples_split
    # "n_estimators"      : [10, 20, 50, 100, 200, 500],  
# }

# Best hyperparameters       :  {'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 500}
# Accuracy on training set   :  0.9865616497495056
# Accuracy on test set :      0.9856396561013891

# f1 		 precision 	 recall
# 0.753815 	 0.843217 	 0.681554


In [62]:
df = pd.read_csv('./assets/conversion_data_train.csv')
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [63]:

print(f"shape : {df.shape}\n")

print(df.info(), "\n")

print(f"# of null val :")
print(100 * df.isnull().sum() / df.shape[0])

# display(df.head())
# print(df.describe(include="all").T)
# print(df.duplicated().sum())
# print (df.isnull().any().any())

print()
print("Unique countries : ", df["country"].unique())
print("Unique sources   : ", df["source"].unique())
# print(df["col_name"].value_counts())
# print(df.isnull().sum().sort_values(ascending=False).head(11))
# df[k_target].value_counts()


shape : (284580, 6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284580 entries, 0 to 284579
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   country              284580 non-null  object
 1   age                  284580 non-null  int64 
 2   new_user             284580 non-null  int64 
 3   source               284580 non-null  object
 4   total_pages_visited  284580 non-null  int64 
 5   converted            284580 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 13.0+ MB
None 

# of null val :
country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64

Unique countries :  ['China' 'UK' 'Germany' 'US']
Unique sources   :  ['Direct' 'Ads' 'Seo']


In [64]:
# df = df.sample(int(k_samples_ratio*len(df)))
df_nouveau = df.iloc[:int(k_samples_ratio*len(df))]

X = df.loc[:, df.columns != k_target]
y = df[k_target]

print("X :")
print(X.head())
print(X.shape)
print()

print("y :")
print(y.head())




X :
   country  age  new_user  source  total_pages_visited
0    China   22         1  Direct                    2
1       UK   21         1     Ads                    3
2  Germany   20         0     Seo                   14
3       US   23         1     Seo                    3
4       US   28         1  Direct                    3
(284580, 5)

y :
0    0
1    0
2    1
3    0
4    0
Name: converted, dtype: int64


In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=k_test_size, random_state=k_random_state, stratify = y)

In [66]:
numeric_features = X.select_dtypes(include="number").columns
print(numeric_features)

categorical_features = X.select_dtypes(exclude="number").columns
print(categorical_features)

numeric_transformer = Pipeline(
  steps=[
    ("imputer_num", SimpleImputer()),
    ("scaler_num", StandardScaler()),
  ]
)

categorical_transformer = Pipeline(
  steps=[
      ("imputer_cat", SimpleImputer(fill_value="missing", strategy="constant")),  
      # ("encoder_cat", OneHotEncoder(drop="first")),                 
      ("encoder_cat", OneHotEncoder(handle_unknown='ignore', sparse=False)),                 
    ]
  )

preprocessor = ColumnTransformer(
  transformers=[
      ("num", numeric_transformer,     numeric_features),
      ("cat", categorical_transformer, categorical_features),
    ]
  )




X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
X_train[0:5]


Index(['age', 'new_user', 'total_pages_visited'], dtype='object')
Index(['country', 'source'], dtype='object')




array([[-1.27650481,  0.6761303 , -0.2618471 ,  0.        ,  0.        ,
         0.        ,  1.        ,  1.        ,  0.        ,  0.        ],
       [-0.18867057,  0.6761303 , -0.56090876,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ],
       [ 0.65742272, -1.47900486, -0.56090876,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ],
       [-0.9138934 ,  0.6761303 ,  0.93439955,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ],
       [ 1.26177508,  0.6761303 , -0.56090876,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ]])

In [67]:

# clf = XGBClassifier()

# params = {
#     "max_depth"         : [1,2,5,10,20,50],  
#     "min_child_weight"  : [1,2,5,10,20,50],              # similar to min_samples_leaf and min_samples_split
#     "n_estimators"      : [10, 20, 50, 100, 200, 500],  
# }
# gridsearch = GridSearchCV(clf, param_grid=params, verbose = 0, n_jobs = -1, cv = 5) 

# gridsearch.fit(X_train, y_train)


# print("Best hyperparameters       : ", gridsearch.best_params_)
# # print("Best validation accuracy : ", gridsearch.best_score_)
# print("Accuracy on training set   : ", gridsearch.score(X_train, y_train))
# print("Accuracy on test set :     ", gridsearch.score(X_test, y_test))


# y_train_pred = gridsearch.predict(X_train)
# y_test_pred = gridsearch.predict(X_test)




# A essayer dans cet ordre
# J'ai pas les paramètres
# LinearSVC : https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
# Perceptron
# LogisiticRegression : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# XGBClassifier
# SVC                 : https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html


# clf = LinearSVC(random_state=42)
# clf = Perceptron() #(tol=1e-3, random_state=0)
# perceptron ????
clf = LogisticRegression() # 
# clf = SVC() # 
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)



In [68]:
print(f"f1 \t\t precision \t recall")
print(f"{f1_score(y_test,  y_test_pred):.6f} \t {precision_score(y_test,  y_test_pred):.6f} \t {recall_score(y_test,  y_test_pred):.6f}")

f1 		 precision 	 recall
0.768252 	 0.865529 	 0.690632


In [69]:
# print(f"f1 on train : {f1_score(y_train, y_train_pred):.3f}" )
# print(f"f1 on test  : {f1_score(y_test,  y_test_pred):.3f}")

In [70]:
# print(f"f1 on test  : {precision_score(y_test,  y_test_pred):.3f}")
# print(f"recall on test  : {recall_score(y_test,  y_test_pred):.3f}")

In [71]:
# print("train:")
# print(confusion_matrix(y_train, y_train_pred))
# print()

# print("test:")
# print(confusion_matrix(y_test, y_test_pred))


In [72]:
# Pipeline(steps=[('preprocessor',
#                  ColumnTransformer(transformers=[('numeric',
#                                                   Pipeline(steps=[('imputer',
#                                                                    SimpleImputer()),
#                                                                   ('scaler',
#                                                                    StandardScaler())]),
#                                                   Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], dtype='int64')),
#                                                  ('categorical_low',
#                                                   Pipeline(steps=[('imputer',
#                                                                    SimpleImputer(fill_value='missing',
#                                                                                  strategy='constant')),
#                                                                   ('encoding',
#                                                                    OneHotEncoder(handle_unknown='ignore',
#                                                                                  sparse=False))]),
#                                                   Index([], dtype='int64')),
#                                                  ('categorical_high',
#                                                   Pipeline(steps=[('imputer',
#                                                                    SimpleImputer(fill_value='missing',
#                                                                                  strategy='constant')),
#                                                                   ('encoding',
#                                                                    OrdinalEncoder())]),
#                                                   Index([], dtype='int64'))])),
#                 ('classifier', LinearSVC(random_state=42))])
