In [373]:
import pandas as pd
import numpy as np
from datetime import datetime


from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier 
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures   
from sklearn.feature_selection import SelectKBest, f_classif, chi2


k_result_file   = "results"
k_target        = "converted"
k_samples_ratio = 10/100  # percentage of observation to be taken into account. Pass 100/100 for final testing 
k_test_size     = 20/100  # see train_test_split
k_random_state  = 42      # you know why...
k_header        = "conversion_data_test_predictions_"
k_author        = "PHILIPPE"


## Get the data

In [374]:
df = pd.read_csv('./assets/conversion_data_train.csv')
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


## Quick'n dirty EDA

In [375]:
print(f"shape : {df.shape}\n")
print()

display(df.describe(include="all").T)


shape : (284580, 6)




Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
country,284580.0,4.0,US,160124.0,,,,,,,
age,284580.0,,,,30.564203,8.266789,17.0,24.0,30.0,36.0,123.0
new_user,284580.0,,,,0.685452,0.464336,0.0,0.0,1.0,1.0,1.0
source,284580.0,3.0,Seo,139477.0,,,,,,,
total_pages_visited,284580.0,,,,4.873252,3.341995,1.0,2.0,4.0,7.0,29.0
converted,284580.0,,,,0.032258,0.176685,0.0,0.0,0.0,0.0,1.0


In [376]:
print(df.info(), "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284580 entries, 0 to 284579
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   country              284580 non-null  object
 1   age                  284580 non-null  int64 
 2   new_user             284580 non-null  int64 
 3   source               284580 non-null  object
 4   total_pages_visited  284580 non-null  int64 
 5   converted            284580 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 13.0+ MB
None 



In [377]:
print(f"Number of null val :")
print(100 * df.isnull().sum() / df.shape[0])
# print (df.isnull().any().any())

Number of null val :
country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64


In [378]:
print("Duplicates     : ", df.duplicated().sum())
print("Col duplicated : ", df.columns.duplicated() )

print()
print("Unique countries : ", df["country"].unique())
print("Unique sources   : ", df["source"].unique())


Duplicates     :  268769
Col duplicated :  [False False False False False False]

Unique countries :  ['China' 'UK' 'Germany' 'US']
Unique sources   :  ['Direct' 'Ads' 'Seo']


## Pre-preprocessing on df

In [379]:
bPrepocess_df = True            # this flab indicates to do pre-preprocessing or not

In [381]:

def pre_preprocessing(df):
  df['poids'] = df.groupby(df.columns.tolist(), sort=False).transform('size')
  return df


In [380]:

# It is pre_preprocessing since it impact the number of rows
# This mean it MUST be done on df 
# Indeed later, preprocessing has acces to X only and so rows cannot be changed

def pre_preprocessing2(df):
  
  # print(f"shape : {df.shape}\n")
  # df.drop_duplicates(inplace=True)
  # print(f"shape : {df.shape}\n")

  # print(f"shape : {df.shape}")

  # Créer une colonne avec l poids des doublons
  # Supprimer les doublons
  df_no_duplicates = df.drop_duplicates()

  # Compter le nombre d'occurrences de chaque ligne dans le DataFrame d'origine
  counts = df.groupby(df.columns.tolist()).size().reset_index(name='poids')

  # Fusionner la colonne occurences avec le df sans doublons
  df = pd.merge(df_no_duplicates, counts, on=df.columns.tolist(), how='left')

  # print(f"shape : {df.shape}")


  # sample va faire un échantillonnage de m lignes dans df
  # ATTENTION ca n'aide pas à comparer les résultats
  # df = df.sample(int(k_samples_ratio*len(df)))

  # Un peu dans le même esprit
  # Afin d'accélerer les tests preliminaires on peut ne retenir que les m 1ere lignes de df
  # Les les résultats sont constants car on prend toujours les mêmes lignes
  # df_nouveau = df.iloc[:int(k_samples_ratio*len(df))]



  return df  


In [382]:
if(bPrepocess_df):
  print(f"shape : {df.shape}")
  df = pre_preprocessing(df)  
  print(f"shape : {df.shape}")


shape : (284580, 6)
shape : (284580, 7)


In [383]:
# X = df.drop(k_target, axis=1)
X = df.drop(columns=k_target)

y = df[k_target]


In [384]:
print("X :")
print(X.head())
print(X.shape)
print()

print("y :")
print(y.head())

X :
   country  age  new_user  source  total_pages_visited  poids
0    China   22         1  Direct                    2     71
1       UK   21         1     Ads                    3     44
2  Germany   20         0     Seo                   14      6
3       US   23         1     Seo                    3    253
4       US   28         1  Direct                    3    151
(284580, 6)

y :
0    0
1    0
2    1
3    0
4    0
Name: converted, dtype: int64


In [385]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=k_test_size, random_state=k_random_state, stratify = y)


In [386]:

# TODO Faire une fonction
print(f"Shape of X_train : {X_train.shape}")
print()
print(f"Shape of X_train : {X_test.shape}")
print()
print(f"Shape of X_train : {y_train.shape}")
print()
print(f"Shape of X_train : {y_test.shape}")
print()


Shape of X_train : (227664, 6)

Shape of X_train : (56916, 6)

Shape of X_train : (227664,)

Shape of X_train : (56916,)



In [387]:
numeric_features = X.select_dtypes(include="number").columns
print(numeric_features)

categorical_features = X.select_dtypes(exclude="number").columns
print(categorical_features)



Index(['age', 'new_user', 'total_pages_visited', 'poids'], dtype='object')
Index(['country', 'source'], dtype='object')


In [388]:

numeric_transformer = Pipeline(
  steps=[
    # ("imputer_num", SimpleImputer()),
    ("imputer_num", SimpleImputer(strategy="mean")),
    ("scaler_num", StandardScaler()),
  ]
)

categorical_transformer = Pipeline(
  steps=[
      # ("imputer_cat", SimpleImputer(fill_value="missing", strategy="constant")),  
      ("imputer_cat", SimpleImputer(strategy="most_frequent")),  
      ("encoder_cat", OneHotEncoder(drop="first")),                 
      # ("encoder_cat", OneHotEncoder(handle_unknown='ignore', sparse=False)),                 
    ]
  )

preprocessor = ColumnTransformer(
  transformers=[
      ("num", numeric_transformer,     numeric_features),
      ("cat", categorical_transformer, categorical_features),
    ]
  )


In [389]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
print(X_train[0:5].round(3))

# ! IMPORTANT because we work with df NOT nd array most of the time
X_train = pd.DataFrame(X_train, columns=preprocessor.get_feature_names_out())
display(X_train.head())

X_test = pd.DataFrame(X_test, columns=preprocessor.get_feature_names_out())

[[-1.277  0.676 -0.262  0.23   0.     0.     1.     0.     0.   ]
 [-0.189  0.676 -0.561  0.266  0.     0.     0.     0.     0.   ]
 [ 0.657 -1.479 -0.561  0.364  0.     0.     1.     0.     1.   ]
 [-0.914  0.676  0.934  0.327  0.     0.     1.     0.     1.   ]
 [ 1.262  0.676 -0.561 -0.784  0.     1.     0.     0.     0.   ]]


Unnamed: 0,num__age,num__new_user,num__total_pages_visited,num__poids,cat__country_Germany,cat__country_UK,cat__country_US,cat__source_Direct,cat__source_Seo
0,-1.276505,0.67613,-0.261847,0.229605,0.0,0.0,1.0,0.0,0.0
1,-0.188671,0.67613,-0.560909,0.26625,0.0,0.0,0.0,0.0,0.0
2,0.657423,-1.479005,-0.560909,0.36397,0.0,0.0,1.0,0.0,1.0
3,-0.913893,0.67613,0.9344,0.327325,0.0,0.0,1.0,0.0,1.0
4,1.261775,0.67613,-0.560909,-0.784242,0.0,1.0,0.0,0.0,0.0


In [390]:
def apply_feature_engineering(data, strategy='None', **kwargs):

  """
  Applies a feature engineering strategy to the data.      

  Args: 
  - data (DataFrame)  : The DataFrame containing the initial data.     
  - strategy (str)    : The feature engineering strategy to apply.     
  - kwargs            : Parameters specific to the feature engineering strategy.      

  Returns: DataFrame  : The DataFrame containing the transformed data.     
  """


  match strategy:

    case 'None':
      transformed_df = data.copy()
     
    case 'polynomial_features':
      degree = kwargs.get('degree', 2)          # 2 by default
      poly = PolynomialFeatures(degree=degree)
      transformed_data = poly.fit_transform(data)

      original_feature_names = data.columns
      feature_combinations = poly.powers_

      # Generate names for the new features
      feature_names = [""]
      for feature_combination in feature_combinations[1:]:
          new_feature_name = "*".join([f"{orig_feature}^{power}" if power > 1 else orig_feature for orig_feature, power in zip(original_feature_names, feature_combination)])
          feature_names.append(new_feature_name)

      # new df - transformed features and their names
      transformed_df = pd.DataFrame(transformed_data, columns=feature_names)


      # columns = poly.get_feature_names_out(data.columns)
      # transformed_df = pd.DataFrame(transformed_data, columns=columns)

    case 'log_transform':
      features_to_transform = kwargs.get('features_to_transform', [])
      transformed_df = data.copy()
      transformed_df[features_to_transform] = np.log(data[features_to_transform] + 1) # log neperien

    case 'custom_feature_engineering':
      # Design your own pizza
      # One can use kwargs
      transformed_df = data.copy()
      # Ajouter ici votre logique de feature engineering personnalisée

    case _:
      raise ValueError("Feature engineering strategy not recognized.")

  return transformed_df

In [391]:
def apply_feature_selection(X_train, y_train, X_test, feature_selection_method='None', **kwargs):

  match feature_selection_method:
      case "None":
        X_train_selected_df = X_train
        X_test_selected_df = X_test
               
      case 'SelectKBest':
        k = kwargs.get('k', 10)                            # 10 by default
        if df.shape[1]<k:
          k = df.shape[1]
        selector = SelectKBest(score_func=f_classif, k=k)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)

        X_train_selected_df = pd.DataFrame(X_train_selected, columns=X_train.columns[selector.get_support()])
        X_test_selected_df = pd.DataFrame(X_test_selected, columns=X_train.columns[selector.get_support()])



      case 'chi2':
        selector = SelectKBest(score_func=chi2)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)

        X_train_selected_df = pd.DataFrame(X_train_selected, columns=X_train.columns[selector.get_support()])
        X_test_selected_df = pd.DataFrame(X_test_selected, columns=X_train.columns[selector.get_support()])

      case 'custom_feature_selection':
        # Design your own pizza
        # One can use kwargs
        X_train_selected_df = X_train
        X_test_selected_df = X_test
      
      case _:
        raise ValueError("Feature selection method not recognized.")

  return X_train_selected_df, X_test_selected_df


In [392]:
def evaluate_model_scores(model, params, X_train, y_train, X_test, y_test):

  grid_search = GridSearchCV(model, params, cv=5, scoring='f1', n_jobs=-1)
  grid_search.fit(X_train, y_train)

  best_params = grid_search.best_params_

  model.set_params(**best_params)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  scores = {
    'accuracy'  : accuracy_score(y_test, y_pred),
    'precision' : precision_score(y_test, y_pred),
    'recall'    : recall_score(y_test, y_pred),
    'f1'        : f1_score(y_test, y_pred)
  }

  return scores


In [393]:
# DataFrame to store results
results_df = pd.DataFrame(columns=['Feature_Engineering', 'Feature_Selection', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1'])



# Features Engineering strategies

In [394]:
# Strategies for feature engineering
feature_engineering_strategies = [
  'None',
  # 'polynomial_features',         # degree
  # 'log_transform',             # features_to_transform
]

engineering_params = {
  'None'                : {},
  'polynomial_features' : {'degree':2},                         
  'log_transform'       : {'features_to_transform': [0, 1, 2]}, # ! PAS TESTE !!!!!!!!!!!!!!!!!!!! 
}


# Features Selection Strategies

In [395]:
# Strategies for feature selection
feature_selection_strategies = [
  ('None', 'None'),
  # ('SelectKBest_2', "SelectKBest"), 
  # ("SelectKBest_1", 'SelectKBest') , 
  # 'chi2',                            # !!! PAS TESTE
]

selection_params_sets = {
  'None'          : {},
  'SelectKBest_1' : {'k':1},                         
  'SelectKBest_2' : {'k':2},                         
  'chi2'          : {}, 
}


# Models & Hyperparameters

In [396]:
# A model = a model id and a model function
models = [
    #("LogisticRegression_0", LogisticRegression()),
    # ("LogisticRegression_1", LogisticRegression()),
    # ("LogisticRegression_2", LogisticRegression()),
    #('Random Forest', RandomForestClassifier()),
    #('XGBoost', XGBClassifier()),
    ("Gradient Boost Clf", GradientBoostingClassifier())
    # ('SVM', SVC()),
    # ('KNN', KNeighborsClassifier()),
    # ('Logistic Regression', LogisticRegression())
]

# Set of hyperparameters for each model_id
models_params = {
    'LogisticRegression_0'  : {},
    'LogisticRegression_1'  : {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
    'LogisticRegression_2'  : {'C': [100], 'max_iter': [1000], 'random_state': [k_random_state]},
    'Random Forest'         : {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]},
    'SVM'                   : {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'KNN'                   : {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'XGBoost'               : {'booster':['gbtree']},
    # https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
    "Gradient Boost Clf"    : {'learning_rate' : [0.1, 0.01], 'n_estimators': [100, 200], 'subsample' : [1.0, 0.8]},
}



In [397]:
results_lst=[]

for engineering_name in feature_engineering_strategies:
  
  # display(X_train.head(5))
  # print("X_train            : ", type(X_train))

  # Loop over feature engineering
  X_train_engineered = apply_feature_engineering(X_train, engineering_name, **engineering_params[engineering_name])
  X_test_engineered  = apply_feature_engineering(X_test,  engineering_name, **engineering_params[engineering_name])
  # DataFrame
  # display(X_train_engineered.head(5))
  print("X_train_engineered : ", type(X_train_engineered))

  # Loop over features selection
  for selection_id, selection_fn in feature_selection_strategies:
    X_train_selected, X_test_selected = apply_feature_selection(X_train_engineered, y_train, X_test_engineered, selection_fn, **selection_params_sets[selection_id])
    # DataFrame
    # display(X_train_selected.head(5))
    print("X_train_selected   : ", type(X_train_selected))
    
    # Loop over models
    for model_id, model_fn in models:
      print(f"{engineering_name}-{selection_id}-{model_id} : ")
      scores = evaluate_model_scores(model_fn, models_params[model_id], X_train_selected, y_train, X_test_selected, y_test)
      
      results_lst.append(
        {
            'Pre processing'      : bPrepocess_df,
            'Feature_Engineering' : engineering_name,
            'Feature_Selection'   : selection_id,
            'Model'               : model_id,
            'Accuracy'            : scores['accuracy'],
            'Precision'           : scores['precision'],
            'Recall'              : scores['recall'],
            'F1'                  : scores['f1']
        }
      )

results_df = pd.concat([pd.DataFrame([result]) for result in results_lst], ignore_index=True)
display(results_df.sort_values(by="F1", ascending=False))

trailer = datetime.now().strftime("%Y%m%d_%H%M%S")
out_file = "./assets/" + k_result_file + "-" + trailer + ".csv"
results_df.to_csv(out_file, encoding="utf8")


X_train_engineered :  <class 'pandas.core.frame.DataFrame'>
X_train_selected   :  <class 'pandas.core.frame.DataFrame'>
None-None-Gradient Boost Clf : 


Unnamed: 0,Pre processing,Feature_Engineering,Feature_Selection,Model,Accuracy,Precision,Recall,F1
0,True,,,Gradient Boost Clf,0.991373,0.878873,0.849673,0.864027


## Entrainement sur l'ensemble du jeu de données 
* sans le diviser en train et test
* L'idée c'est d'utiliser un max d'observations pour ajuster les paramètres du modèle

In [398]:
X = df.drop(columns = k_target)
y = df[k_target]

display(X.head(2))
print(X.shape)
print(type(X))


Unnamed: 0,country,age,new_user,source,total_pages_visited,poids
0,China,22,1,Direct,2,71
1,UK,21,1,Ads,3,44


(284580, 6)
<class 'pandas.core.frame.DataFrame'>


In [399]:
X = preprocessor.fit_transform(X)
print(X[0:5].round(3))

# ! IMPORTANT because we work with df NOT nd array most of the time
X = pd.DataFrame(X, columns=preprocessor.get_feature_names_out())
display(X_train.head())

print(X.shape)
print(type(X))


[[-1.036  0.677 -0.86  -0.222  0.     0.     0.     1.     0.   ]
 [-1.157  0.677 -0.561 -0.552  0.     1.     0.     0.     0.   ]
 [-1.278 -1.476  2.731 -1.017  1.     0.     0.     0.     1.   ]
 [-0.915  0.677 -0.561  2.002  0.     0.     1.     0.     1.   ]
 [-0.31   0.677 -0.561  0.755  0.     0.     1.     1.     0.   ]]


Unnamed: 0,num__age,num__new_user,num__total_pages_visited,num__poids,cat__country_Germany,cat__country_UK,cat__country_US,cat__source_Direct,cat__source_Seo
0,-1.276505,0.67613,-0.261847,0.229605,0.0,0.0,1.0,0.0,0.0
1,-0.188671,0.67613,-0.560909,0.26625,0.0,0.0,0.0,0.0,0.0
2,0.657423,-1.479005,-0.560909,0.36397,0.0,0.0,1.0,0.0,1.0
3,-0.913893,0.67613,0.9344,0.327325,0.0,0.0,1.0,0.0,1.0
4,1.261775,0.67613,-0.560909,-0.784242,0.0,1.0,0.0,0.0,0.0


(284580, 9)
<class 'pandas.core.frame.DataFrame'>


In [400]:
# Retrouver les composants du meilleur classifier
# fes       = feature engineering strategy
# fs        = feature selection
# model_id  = model id


id  = results_df['F1'].idxmax()

fes = results_df.at[id, "Feature_Engineering"]
fs  = results_df.at[id, "Feature_Selection"]

# fes = results_df.loc[0,"Feature_Engineering"]
# fs = results_df.loc[0,"Feature_Selection"]

for fs_current, fs_method_current in feature_selection_strategies:
    if  fs == fs_current:
        fs_method = fs_method_current
        break

# model_id = results_df.loc[0,"Model"]
model_id = results_df.at[id, "Model"]
for current_id, current_fn in models:
    if  model_id == current_id:
        model_fn = current_fn
        break

# On rappelle tout le monde dans le bon ordre
X_engineered  = apply_feature_engineering(X, fes, **engineering_params[fes])
print(type(X_engineered))

X_selected, _ = apply_feature_selection(X_engineered, y, X_engineered, fs_method, **selection_params_sets[fs])
print(type(X_selected))

# Replace the call to evaluate_model_scores beacause we need access to best_params_ from here
# scores = evaluate_model_scores(model, models_params[model_name], X_selected, y, X_selected, y)
grid_search = GridSearchCV(model_fn, models_params[model_id], cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
model_fn.set_params(**best_params)

model_fn.fit(X_selected, y) # TODO faire  un test. Sans doute à virer

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [401]:
y_pred = model_fn.predict(X)

print(f"f1 \t\t precision \t recall")
print(f"{f1_score(y,  y_pred):.6f} \t {precision_score(y,  y_pred):.6f} \t {recall_score(y,  y_pred):.6f}")

f1 		 precision 	 recall
0.861616 	 0.872293 	 0.851198


## Predictions sur le jeu sans label

In [402]:
df_no_labels = pd.read_csv('./assets/conversion_data_test.csv')
print(type(df_no_labels))
print(df_no_labels.shape)




<class 'pandas.core.frame.DataFrame'>
(31620, 5)


In [403]:
if(bPrepocess_df):
  print(f"shape : {df_no_labels.shape}")
  X_no_labels = pre_preprocessing(df_no_labels) 
  print(f"shape : {X_no_labels.shape}")
else:
  X_no_labels = df.copy()


shape : (31620, 5)
shape : (31620, 6)


In [404]:
X_no_labels = preprocessor.transform(X_no_labels)

print(X_no_labels.shape)
print(X_no_labels[0:5,:].round(3))

(31620, 9)
[[-0.31  -1.476  3.329 -1.078  0.     1.     0.     0.     1.   ]
 [-1.036  0.677  0.038 -1.053  0.     1.     0.     1.     0.   ]
 [ 0.174  0.677 -1.159 -0.956  0.     0.     0.     0.     1.   ]
 [ 0.174  0.677  0.337 -0.846  0.     0.     1.     0.     0.   ]
 [-0.673 -1.476 -0.561 -0.956  0.     0.     0.     0.     1.   ]]


In [405]:
data = {
  'converted': model_fn.predict(X_no_labels)
}

Y_predictions = pd.DataFrame(columns=['converted'], data=data)

trailer         = datetime.now().strftime("%Y%m%d_%H%M%S")
out_file = "./assets/" + k_header + k_author + "-" + trailer + ".csv"
Y_predictions.to_csv(out_file, index=False)



