## IMPORTS

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import uniform, randint

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('src/conversion_data_train.csv')

## FEATURE ENGINEERING

In [3]:
data

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0
...,...,...,...,...,...,...
284575,US,36,1,Ads,1,0
284576,US,31,1,Seo,2,0
284577,US,41,1,Seo,5,0
284578,US,31,1,Direct,4,0


## PREPROCESSING

In [4]:
features_list = data.drop('converted', axis = 1).columns
numeric_indices = [1, 4]
categorical_indices = [0, 2, 3]
target_variable = 'converted'

In [5]:
X = data.loc[:, features_list]
Y = data.loc[:, target_variable]

print('Explanatory variables : ', X.columns)
print()

Explanatory variables :  Index(['country', 'age', 'new_user', 'source', 'total_pages_visited'], dtype='object')



In [6]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42, stratify = Y)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [7]:
# Put here all the preprocessings
print("Encoding categorical features and standardizing numerical features...")
numeric_transformer = StandardScaler()
numeric_features = X.iloc[:,numeric_indices].columns

categorical_transformer = OneHotEncoder(drop='first')
categorical_features = X.iloc[:,categorical_indices].columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train)

X_test = preprocessor.fit_transform(X_test)
print("...Done")
print(X_train[0:5,:])
print()
print(X_test[0:5,:])

Encoding categorical features and standardizing numerical features...


...Done
[[-1.27650481 -0.2618471   0.          0.          1.          1.
   0.          0.        ]
 [-0.18867057 -0.56090876  0.          0.          0.          1.
   0.          0.        ]
 [ 0.65742272 -0.56090876  0.          0.          1.          0.
   0.          1.        ]
 [-0.9138934   0.93439955  0.          0.          1.          1.
   0.          1.        ]
 [ 1.26177508 -0.56090876  0.          1.          0.          1.
   0.          0.        ]]

[[ 0.17264791 -0.25909297  0.          0.          1.          1.
   0.          1.        ]
 [-0.07005486  0.04077964  0.          1.          0.          1.
   0.          1.        ]
 [ 0.05129652  0.04077964  0.          0.          0.          1.
   0.          1.        ]
 [ 2.47832424  0.04077964  0.          0.          1.          0.
   0.          1.        ]
 [-1.28356872 -0.55896557  0.          0.          1.          1.
   1.          0.        ]]


## TRAINING MODEL

### XGBOOST

In [10]:
# Train model XGBoost

classifier = XGBClassifier()

f1_scorer = make_scorer(f1_score)

# params = {
#     'max_depth': [8, 9, 10],
#     'min_child_weight': [8, 9, 10],
#     'n_estimators': [20, 30, 40],
#     'reg_alpha': [0, 0.1, 0.2],
#     'reg_lambda': [2, 2.5, 3]
# }

# classifier = GridSearchCV(classifier, 
#                           param_grid = params, 
#                           cv = 5,
#                           scoring = f1_scorer,
#                           n_jobs = -1)

param_dist = {
    'learning_rate': uniform(0.01, 0.3),
    'n_estimators': randint(50, 500),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 2),
    'reg_lambda': uniform(0, 2),
    'gamma': uniform(0, 0.2),
    'scale_pos_weight': uniform(1, 5),
    'max_delta_step': uniform(0, 5),
}

classifier = RandomizedSearchCV(estimator = classifier, 
                                param_distributions = param_dist, 
                                cv = 5,
                                scoring = f1_scorer,
                                n_iter = 50,
                                n_jobs = -1)


classifier.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", classifier.best_params_)
print("Best validation f1-score : ", classifier.best_score_)
print()

# Predictions on training set
Y_train_pred = classifier.predict(X_train)

# Predictions on test set
Y_test_pred = classifier.predict(X_test)

# F1-score
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

# Confusion matrix
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

...Done.
Best hyperparameters :  {'colsample_bytree': 0.7718089712165844, 'gamma': 0.1917536675673416, 'learning_rate': 0.20634355033000254, 'max_delta_step': 1.6112364315351608, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 178, 'reg_alpha': 0.1697110029051221, 'reg_lambda': 0.14190563929250422, 'scale_pos_weight': 1.7249311828776523, 'subsample': 0.8066239464686527}
Best validation f1-score :  0.766656111913971

f1-score on train set :  0.7733074361820199
f1-score on test set :  0.7667766776677669

Confusion matrix on train set : 
[[218822   1498]
 [  1770   5574]]

Confusion matrix on test set : 
[[54674   406]
 [  442  1394]]



### SGD CLASSIFIER

In [20]:
# Train model SGDClassifier

classifier = SGDClassifier()

f1_scorer = make_scorer(f1_score)

# params = {
# }

# classifier = GridSearchCV(classifier, 
#                           param_grid = params, 
#                           cv = 5,
#                           scoring = f1_scorer,
#                           n_jobs = -1)

param_dist = {
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': np.logspace(-4, 4, 100),
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': np.logspace(-4, 0, 100),
    'max_iter': [1000, 2000, 3000],
    'tol': np.logspace(-5, -1, 100),
}

classifier = RandomizedSearchCV(estimator = classifier, 
                                param_distributions = param_dist, 
                                cv = 5,
                                scoring = f1_scorer,
                                n_iter = 100,
                                random_state = 0,
                                n_jobs = -1)


classifier.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", classifier.best_params_)
print("Best validation f1-score : ", classifier.best_score_)
print()

# Predictions on training set
Y_train_pred = classifier.predict(X_train)

# Predictions on test set
Y_test_pred = classifier.predict(X_test)

# F1-score
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

# Confusion matrix
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

...Done.
Best hyperparameters :  {'tol': 8.497534359086438e-05, 'penalty': 'l2', 'max_iter': 1000, 'loss': 'modified_huber', 'learning_rate': 'adaptive', 'eta0': 0.002848035868435802, 'alpha': 0.0005336699231206312}
Best validation f1-score :  0.7570075249045034

f1-score on train set :  0.7582806963487907
f1-score on test set :  0.7536054004295796

Confusion matrix on train set : 
[[219604    730]
 [  2408   4922]]

Confusion matrix on test set : 
[[54885   181]
 [  622  1228]]



### Bagging Logistic Regression

In [8]:
# Train model bagging logistic regression

logistic_regression = LogisticRegression(max_iter = 1000)
classifier = BaggingClassifier(logistic_regression)

f1_scorer = make_scorer(f1_score)

# params = {
#     'base_estimator__C': [10, 15], 
#     'n_estimators': [30, 40] 
# }

# classifier = GridSearchCV(classifier, 
#                           param_grid = params, 
#                           cv = 5,
#                           scoring = f1_scorer,
#                           n_jobs = -1)

param_dist = {
    'base_estimator__C': randint(1, 20),
    'n_estimators': randint(10, 100),
}

classifier = RandomizedSearchCV(estimator = classifier, 
                                param_distributions = param_dist, 
                                cv = 5,
                                scoring = f1_scorer,
                                n_iter = 10,
                                random_state = 42,
                                n_jobs = -1)

classifier.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", classifier.best_params_)
print("Best validation f1-score : ", classifier.best_score_)
print()

# Predictions on training set
Y_train_pred = classifier.predict(X_train)

# Predictions on test set
Y_test_pred = classifier.predict(X_test)

# F1-score
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

# Confusion matrix
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

...Done.
Best hyperparameters :  {'base_estimator__C': 6, 'n_estimators': 11}
Best validation f1-score :  0.7646726428387098

f1-score on train set :  0.7650479571029379
f1-score on test set :  0.7588075880758809

Confusion matrix on train set : 
[[219488    846]
 [  2265   5065]]

Confusion matrix on test set : 
[[54855   211]
 [  590  1260]]



### AdaBoost Logistic Regression

In [9]:
# Train model bagging logistic regression

logistic_regression = LogisticRegression(max_iter = 1000)
classifier = AdaBoostClassifier(logistic_regression)

f1_scorer = make_scorer(f1_score)

# params = {
#     'base_estimator__C': [1, 5, 10], 
#     'n_estimators': [20, 30, 40] 
# }

# classifier = GridSearchCV(classifier, 
#                           param_grid = params, 
#                           cv = 5,
#                           scoring = f1_scorer,
#                           n_jobs = -1)

param_dist = {
    'base_estimator__C': randint(1, 20),
    'n_estimators': randint(10, 200),
}

classifier = RandomizedSearchCV(estimator = classifier, 
                                param_distributions = param_dist, 
                                cv = 5,
                                scoring = f1_scorer,
                                n_iter = 10,
                                random_state = 42,
                                n_jobs = -1)

classifier.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", classifier.best_params_)
print("Best validation f1-score : ", classifier.best_score_)
print()

# Predictions on training set
Y_train_pred = classifier.predict(X_train)

# Predictions on test set
Y_test_pred = classifier.predict(X_test)

# F1-score
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

# Confusion matrix
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

...Done.
Best hyperparameters :  {'base_estimator__C': 7, 'n_estimators': 189}
Best validation f1-score :  0.7640180246934657

f1-score on train set :  0.7638668779714739
f1-score on test set :  0.7600721587492484

Confusion matrix on train set : 
[[219474    860]
 [  2269   5061]]

Confusion matrix on test set : 
[[54854   212]
 [  586  1264]]



### Gradient Boosting

In [10]:
# Train model RandomForest

classifier = GradientBoostingClassifier()

f1_scorer = make_scorer(f1_score)

# params = {
#     'max_depth': [10, 11, 12],
#     'min_samples_leaf': [9, 10, 11],
#     'min_samples_split': [10, 11, 12],
#     'n_estimators': [20, 30, 40]
# }

# classifier = GridSearchCV(classifier, 
#                           param_grid = params, 
#                           cv = 5,
#                           scoring = f1_scorer,
#                           n_jobs = -1)

param_dist = {
    'learning_rate': uniform(0.01, 0.3),
    'n_estimators': randint(10, 100),
    'max_depth': randint(3, 10),
    'min_samples_leaf': randint(1, 20),
    'min_samples_split': randint(2, 20),
}

classifier = RandomizedSearchCV(estimator = classifier, 
                                param_distributions = param_dist, 
                                cv = 5,
                                scoring = f1_scorer,
                                n_iter = 10,
                                random_state = 0,
                                n_jobs = -1)

classifier.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", classifier.best_params_)
print("Best validation f1-score : ", classifier.best_score_)
print()

# Predictions on training set
Y_train_pred = classifier.predict(X_train)

# Predictions on test set
Y_test_pred = classifier.predict(X_test)

# F1-score
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

# Confusion matrix
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

...Done.
Best hyperparameters :  {'learning_rate': 0.27753190023462393, 'max_depth': 3, 'min_samples_leaf': 13, 'min_samples_split': 3, 'n_estimators': 49}
Best validation f1-score :  0.7634423993794718

f1-score on train set :  0.7651727004761545
f1-score on test set :  0.7566425120772946

Confusion matrix on train set : 
[[219495    839]
 [  2268   5062]]

Confusion matrix on test set : 
[[54857   209]
 [  597  1253]]



### RANDOM FOREST

In [13]:
# Train model RandomForest

classifier = RandomForestClassifier()

f1_scorer = make_scorer(f1_score)

# params = {
#     'max_depth': [10, 11, 12],
#     'min_samples_leaf': [9, 10, 11],
#     'min_samples_split': [10, 11, 12],
#     'n_estimators': [20, 30, 40]
# }

# classifier = GridSearchCV(classifier, 
#                           param_grid = params, 
#                           cv = 5,
#                           scoring = f1_scorer,
#                           n_jobs = -1)

param_dist = {
    'n_estimators': randint(10, 200),
    'max_depth': randint(3, 20),
    'min_samples_leaf': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False], 
    'criterion': ['gini', 'entropy'],
}

classifier = RandomizedSearchCV(estimator = classifier, 
                                param_distributions = param_dist, 
                                cv = 5,
                                scoring = f1_scorer,
                                n_iter = 10,
                                random_state = 0,
                                n_jobs = -1)

classifier.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", classifier.best_params_)
print("Best validation f1-score : ", classifier.best_score_)
print()

# Predictions on training set
Y_train_pred = classifier.predict(X_train)

# Predictions on test set
Y_test_pred = classifier.predict(X_test)

# F1-score
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

# Confusion matrix
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

...Done.
Best hyperparameters :  {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 9, 'min_samples_split': 19, 'n_estimators': 89}
Best validation f1-score :  0.7581701326685064

f1-score on train set :  0.76501325255585
f1-score on test set :  0.7543859649122807

Confusion matrix on train set : 
[[219510    824]
 [  2279   5051]]

Confusion matrix on test set : 
[[54857   209]
 [  603  1247]]



### SVC

In [10]:
# Train model SVC

classifier = SVC()

f1_scorer = make_scorer(f1_score)

# params = {
#     'C': [0.1, 1.0, 10],
#     'gamma': [0.1, 1.0, 10],
#     'kernel': ['linear']
# }

# classifier = GridSearchCV(classifier, 
#                           param_grid = params, 
#                           cv = 5,
#                           scoring = f1_scorer,
#                           n_jobs = -1)

param_dist = {
    'C': uniform(0.1, 10), 
    'kernel': ['linear'],
    'degree': randint(1, 10),
    'gamma': ['scale', 'auto', uniform(0.01, 1.0)],
    'coef0': uniform(-1, 1),
    'shrinking': [True, False],
}

classifier = RandomizedSearchCV(estimator = classifier, 
                                param_distributions = param_dist, 
                                cv = 5,
                                scoring = f1_scorer,
                                n_iter = 1,
                                random_state = 0,
                                n_jobs = -1)

classifier.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", classifier.best_params_)
print("Best validation f1-score : ", classifier.best_score_)
print()

# Predictions on training set
Y_train_pred = classifier.predict(X_train)

# Predictions on test set
Y_test_pred = classifier.predict(X_test)

# F1-score
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

# Confusion matrix
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

...Done.
Best hyperparameters :  {'C': 5.588135039273247, 'coef0': -0.2848106336275805, 'degree': 4, 'gamma': 'auto', 'kernel': 'linear', 'shrinking': False}
Best validation f1-score :  0.7602562032982104

f1-score on train set :  0.7598686722150111
f1-score on test set :  0.7550151975683891

Confusion matrix on train set : 
[[219543    791]
 [  2354   4976]]

Confusion matrix on test set : 
[[54868   198]
 [  608  1242]]



## TRAIN ON ALL DATA

In [11]:
# # Concatenate our train and test set to train your best classifier on all data with labels
# X = np.append(X_train,X_test,axis=0)
# Y = np.append(Y_train,Y_test)

# classifier.best_estimator_.fit(X,Y)

In [12]:
# # Read data without labels
# data_without_labels = pd.read_csv('src/conversion_data_test.csv')
# print('Prediction set (without labels) :', data_without_labels.shape)

# # Warning : check consistency of features_list (must be the same than the features 
# # used by your best classifier)
# # features_list = ['total_pages_visited']
# X_without_labels = data_without_labels.loc[:, features_list]

# # Convert pandas DataFrames to numpy arrays before using scikit-learn
# # print("Convert pandas DataFrames to numpy arrays...")
# # X_without_labels = X_without_labels.values
# # print("...Done")

# # print(X_without_labels[0:5,:])

Prediction set (without labels) : (31620, 5)


In [13]:
# # WARNING : PUT HERE THE SAME PREPROCESSING AS FOR YOUR TEST SET
# # CHECK YOU ARE USING X_without_labels
# print("Encoding categorical features and standardizing numerical features...")

# X_without_labels = preprocessor.transform(X_without_labels)
# print("...Done")
# print(X_without_labels[0:5,:])

Encoding categorical features and standardizing numerical features...
...Done
[[-0.31275763  3.3393783   0.          1.          0.          0.
   0.          1.        ]
 [-1.04086595  0.04077964  0.          1.          0.          1.
   1.          0.        ]
 [ 0.17264791 -1.15871078  0.          0.          0.          1.
   0.          1.        ]
 [ 0.17264791  0.34065224  0.          0.          1.          1.
   0.          0.        ]
 [-0.67681179 -0.55896557  0.          0.          0.          0.
   0.          1.        ]]


In [14]:
# # Make predictions and dump to file
# # WARNING : MAKE SURE THE FILE IS A CSV WITH ONE COLUMN NAMED 'converted' AND NO INDEX !
# # WARNING : FILE NAME MUST HAVE FORMAT 'conversion_data_test_predictions_[name].csv'
# # where [name] is the name of your team/model separated by a '-'
# # For example : [name] = AURELIE-model1
# data = {
#     'converted': classifier.best_estimator_.predict(X_without_labels)
# }

# Y_predictions = pd.DataFrame(columns=['converted'],data=data)
# Y_predictions.to_csv('src/conversion_data_test_predictions_EXAMPLE.csv', index=False)