In [399]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier 
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures   
from sklearn.feature_selection import SelectKBest, f_classif, chi2


k_result_file   = "models-benchmark"
k_target        = "converted"
k_header        = "conversion_data_test_predictions_"
k_author        = "PHILIPPE"
k_random_state  = 42            # I know you know ...
k_test_size     = 20/100        # see train_test_split
k_samples_ratio = 10/100        # percentage of observation to be taken into account. Pass 100/100 for final testing 
k_verbose       = True          # Enable/disable messages


### TODO & IDEAS

#### TODO
* ~~<span style="color:red"><b>BUG</b></span> : le dernier fichier de pred est vide ????~~
    * ~~warning at the very end~~
* see how to transform "preprocessing on df" as a pre processing step
* see how to chain multiple pre processing
* see if it make sense to create a pre processing phase
* ~~change organization in features engineering~~
* ~~verbosity~~
* fix the vocabulary : 
  * Phase, step... 
  * A phase concists of one or more steps
  * Phase and step are ordered
* review variables names

#### IDEAS


<!-- 

<span style="color:red"><b>BUG</b></span> : le dernier fichier de pred est vide ????

 -->

## Get the data

In [400]:
df = pd.read_csv('./assets/conversion_data_train.csv')
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


## Quick'n dirty EDA

In [401]:
print(f"Shape : {df.shape}")
print()

display(df.describe(include="all").T)


Shape : (284580, 6)



Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
country,284580.0,4.0,US,160124.0,,,,,,,
age,284580.0,,,,30.564203,8.266789,17.0,24.0,30.0,36.0,123.0
new_user,284580.0,,,,0.685452,0.464336,0.0,0.0,1.0,1.0,1.0
source,284580.0,3.0,Seo,139477.0,,,,,,,
total_pages_visited,284580.0,,,,4.873252,3.341995,1.0,2.0,4.0,7.0,29.0
converted,284580.0,,,,0.032258,0.176685,0.0,0.0,0.0,0.0,1.0


In [402]:
print(df.info(), "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284580 entries, 0 to 284579
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   country              284580 non-null  object
 1   age                  284580 non-null  int64 
 2   new_user             284580 non-null  int64 
 3   source               284580 non-null  object
 4   total_pages_visited  284580 non-null  int64 
 5   converted            284580 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 13.0+ MB
None 



In [403]:
print(f"Number of null values :")
print(100 * df.isnull().sum() / df.shape[0])
# print (df.isnull().any().any())

Number of null values :
country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64


In [404]:
print("Duplicates     : ", df.duplicated().sum())
print("Col duplicated : ", df.columns.duplicated() )

print()

print("Unique countries : ", df["country"].unique())
print("Unique sources   : ", df["source"].unique())


Duplicates     :  268769
Col duplicated :  [False False False False False False]

Unique countries :  ['China' 'UK' 'Germany' 'US']
Unique sources   :  ['Direct' 'Ads' 'Seo']


## Pre-preprocessing on df

In [405]:
bPrepocess_df = True            # this flab indicates to do pre-preprocessing or not

In [406]:

# TODO : see how to transform this code as a pre processing step
# TODO : find a way to link as in a receipt differents step in pre processing (what about seleciotb etc.)
def pre_preprocessing(df):
  df['weight'] = df.groupby(df.columns.tolist(), sort=False).transform('size')
  return df


In [407]:
if(bPrepocess_df):
  print(f"shape : {df.shape}") if k_verbose == True else None
  df = pre_preprocessing(df)  
  print(f"shape : {df.shape}") if k_verbose == True else None


shape : (284580, 6)
shape : (284580, 7)


In [408]:
X = df.drop(columns=k_target)
y = df[k_target]


In [409]:
if k_verbose : 
  print("X :")
  print(X.head())
  print(X.shape)
  print()

  print("y :")
  print(y.head())
  

X :
   country  age  new_user  source  total_pages_visited  weight
0    China   22         1  Direct                    2      71
1       UK   21         1     Ads                    3      44
2  Germany   20         0     Seo                   14       6
3       US   23         1     Seo                    3     253
4       US   28         1  Direct                    3     151
(284580, 6)

y :
0    0
1    0
2    1
3    0
4    0
Name: converted, dtype: int64


In [410]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=k_test_size, random_state=k_random_state, stratify = y)


In [411]:
dataframes_dict = {'Xtrain': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}

for df_name in dataframes_dict:
  df_actual = dataframes_dict.get(df_name)
  print(f"Shape of {df_name} : {df_actual.shape}") if k_verbose == True else None
  

Shape of Xtrain : (227664, 6)
Shape of X_test : (56916, 6)
Shape of y_train : (227664,)
Shape of y_test : (56916,)


In [412]:
numeric_features = X.select_dtypes(include="number").columns
print(numeric_features) if k_verbose == True else None

categorical_features = X.select_dtypes(exclude="number").columns
print(categorical_features) if k_verbose == True else None



Index(['age', 'new_user', 'total_pages_visited', 'weight'], dtype='object')
Index(['country', 'source'], dtype='object')


In [413]:

# I know, I know, imputer are not need here since there are no missing values BUT... 
# You never know
numeric_transformer = Pipeline(
  steps=[
    # ("imputer_num", SimpleImputer()),
    ("imputer_num", SimpleImputer(strategy="mean")),
    ("scaler_num", StandardScaler()),
  ]
)

categorical_transformer = Pipeline(
  steps=[
      # ("imputer_cat", SimpleImputer(fill_value="missing", strategy="constant")),  
      ("imputer_cat", SimpleImputer(strategy="most_frequent")),  
      ("encoder_cat", OneHotEncoder(drop="first")),                 
      # ("encoder_cat", OneHotEncoder(handle_unknown='ignore', sparse=False)),                 
    ]
  )

preprocessor = ColumnTransformer(
  transformers=[
      ("num", numeric_transformer,     numeric_features),
      ("cat", categorical_transformer, categorical_features),
    ]
  )


In [414]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
print(X_train[0:5].round(3))    if k_verbose == True else None

# ! IMPORTANT : because in this script we work with df NOT nd array 
X_train = pd.DataFrame(X_train, columns=preprocessor.get_feature_names_out())
display(X_train.head())         if k_verbose == True else None

X_test = pd.DataFrame(X_test, columns=preprocessor.get_feature_names_out())

[[-1.277  0.676 -0.262  0.23   0.     0.     1.     0.     0.   ]
 [-0.189  0.676 -0.561  0.266  0.     0.     0.     0.     0.   ]
 [ 0.657 -1.479 -0.561  0.364  0.     0.     1.     0.     1.   ]
 [-0.914  0.676  0.934  0.327  0.     0.     1.     0.     1.   ]
 [ 1.262  0.676 -0.561 -0.784  0.     1.     0.     0.     0.   ]]


Unnamed: 0,num__age,num__new_user,num__total_pages_visited,num__weight,cat__country_Germany,cat__country_UK,cat__country_US,cat__source_Direct,cat__source_Seo
0,-1.276505,0.67613,-0.261847,0.229605,0.0,0.0,1.0,0.0,0.0
1,-0.188671,0.67613,-0.560909,0.26625,0.0,0.0,0.0,0.0,0.0
2,0.657423,-1.479005,-0.560909,0.36397,0.0,0.0,1.0,0.0,1.0
3,-0.913893,0.67613,0.9344,0.327325,0.0,0.0,1.0,0.0,1.0
4,1.261775,0.67613,-0.560909,-0.784242,0.0,1.0,0.0,0.0,0.0


In [415]:
def apply_feature_engineering(data, strategy='None', **kwargs):

  """
  Applies a feature engineering strategy to the data.      

  Args: 
  - data (DataFrame)  : The DataFrame containing the initial data.     
  - strategy (str)    : The feature engineering strategy to apply.     
  - kwargs            : Parameters specific to the feature engineering strategy.      

  Returns: DataFrame  : The DataFrame containing the transformed data.     
  """


  match strategy:

    case 'None':
      transformed_df = data.copy()
     
    case 'polynomial_features':
      degree = kwargs.get('degree', 2)          # 2 by default
      poly = PolynomialFeatures(degree=degree)
      transformed_data = poly.fit_transform(data)

      original_feature_names = data.columns
      feature_combinations = poly.powers_

      # Generate names for the new features
      feature_names = [""]
      for feature_combination in feature_combinations[1:]:
          new_feature_name = "*".join([f"{orig_feature}^{power}" if power > 1 else orig_feature for orig_feature, power in zip(original_feature_names, feature_combination)])
          feature_names.append(new_feature_name)

      # new df - transformed features and their names
      transformed_df = pd.DataFrame(transformed_data, columns=feature_names)


      # columns = poly.get_feature_names_out(data.columns)
      # transformed_df = pd.DataFrame(transformed_data, columns=columns)

    case 'log_transform':
      features_to_transform = kwargs.get('features_to_transform', [])
      transformed_df = data.copy()
      transformed_df[features_to_transform] = np.log(data[features_to_transform] + 1) # log neperien

    case 'custom_feature_engineering':
      # Design your own pizza here
      # One can use kwargs
      transformed_df = data.copy()
      

    case _:
      raise ValueError("Feature engineering strategy not recognized.")

  return transformed_df

In [416]:
def apply_feature_selection(X_train, y_train, X_test, feature_selection_method='None', **kwargs):

  """
  Applies a feature selection strategy to the data.      

  Args: 
  - X_train, y_train, X_test (DataFrame)  : the dataframe
  - feature_selection_method (str)        : default None. The delection to be applied     
  - kwargs                                : Parameters specific to the feature selection strategy      

  Returns (DataFrame)                     : The DataFrame containing the selected features     
  """

  match feature_selection_method:
      case "None":
        X_train_selected_df = X_train
        X_test_selected_df = X_test
               
      case 'SelectKBest':
        k = kwargs.get('k', 10)                            # 10 by default
        if df.shape[1]<k:
          k = df.shape[1]
        selector = SelectKBest(score_func=f_classif, k=k)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)

        X_train_selected_df = pd.DataFrame(X_train_selected, columns=X_train.columns[selector.get_support()])
        X_test_selected_df = pd.DataFrame(X_test_selected, columns=X_train.columns[selector.get_support()])

      case 'chi2':
        selector = SelectKBest(score_func=chi2)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)

        X_train_selected_df = pd.DataFrame(X_train_selected, columns=X_train.columns[selector.get_support()])
        X_test_selected_df = pd.DataFrame(X_test_selected, columns=X_train.columns[selector.get_support()])

      case 'custom_feature_selection':
        # Design your own pizza here
        # One can use kwargs
        X_train_selected_df = X_train
        X_test_selected_df = X_test
      
      case _:
        raise ValueError("Feature selection method not recognized.")

  return X_train_selected_df, X_test_selected_df


In [417]:
def evaluate_model_scores(model, params, X_train, y_train, X_test, y_test):

  grid_search = GridSearchCV(model, params, cv=5, scoring='f1', n_jobs=-1)
  grid_search.fit(X_train, y_train)

  best_params = grid_search.best_params_

  model.set_params(**best_params)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  scores = {
    'accuracy'  : accuracy_score(y_test, y_pred),
    'precision' : precision_score(y_test, y_pred),
    'recall'    : recall_score(y_test, y_pred),
    'f1'        : f1_score(y_test, y_pred)
  }

  return scores


In [418]:
# A dataframe to store the results
results_df = pd.DataFrame(columns=["Pre_processing", 'Feature_Engineering', 'Feature_Selection', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1'])



# Features Engineering Strategies

In [419]:
# Strategies for feature engineering
# At least one strategy MUST be active
# fes stands for features engineering strategies
# One fes = fes id + fes function
# You can define as many selection id as you like
# Make sure to define a set of parameters (even if there is no parameters) for each selection id
feature_engineering_strategies = [
  ('None'           , "None"),
  # ("Poly Feat"      , "polynomial_features"),       # degree
  # ("Log Transform"  , "log_transform"),             # features_to_transform
]

# Define paramter for each strategy
# You can let them uncommented
engineering_params_sets = {
  'None'            : {},
  'Poly Feat'       : {'degree':2},                         
  'Log Transform'   : {'features_to_transform': [0, 1, 2]}, # ! NOT TESTED!!!!!!!!!!!!!! 
}


# Features Selection Strategies

In [420]:
# Strategies for feature selection
# At least one strategy MUST be active
# A selection = a selection id + selection function
# You can define as many selection id as you like
# Make sure to define a set of parameters (even if there is no parameters) for each selection id
feature_selection_strategies = [
  ('None', 'None'),
  # ('SelectKBest_2', "SelectKBest"), 
  # ("SelectKBest_1", 'SelectKBest') , 
  # 'chi2',                            # !!! PAS TESTE
]

# Define paraameters for each strategy
# You can let them uncommented
selection_params_sets = {
  'None'          : {},
  'SelectKBest_1' : {'k':1},                         
  'SelectKBest_2' : {'k':2},                         
  'chi2'          : {}, 
}


# Models & Hyperparameters

In [421]:
# A model = a model id and a model function
# You can define as many model id as you like
# Make sure to define a set of parameters (even if there is no parameters) for each model id

models = [
    ("LogisticRegression_0", LogisticRegression()),
    # ("LogisticRegression_1", LogisticRegression()),
    # ("LogisticRegression_2", LogisticRegression()),
    # ('Random Forest', RandomForestClassifier()),
    # ('XGBoost', XGBClassifier()),
    # ("Gradient Boost Clf", GradientBoostingClassifier())
    # ('SVM', SVC()),
    # ('KNN', KNeighborsClassifier()),
    # ('Logistic Regression', LogisticRegression())
]

# Set of hyperparameters for each model_id
models_params_sets = {
    'LogisticRegression_0'  : {},          # baseline model
    'LogisticRegression_1'  : {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
    'LogisticRegression_2'  : {'C': [100], 'max_iter': [1000], 'random_state': [k_random_state]},
    'Random Forest'         : {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]},
    'SVM'                   : {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'KNN'                   : {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'XGBoost'               : {'booster':['gbtree']},
    # https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
    "Gradient Boost Clf"    : {'learning_rate' : [0.1, 0.01], 'n_estimators': [100, 200], 'subsample' : [1.0, 0.8]},
}



In [422]:
results_lst=[]

# fes = Feature Engineering Strategy
for fes_id, fes_fn in feature_engineering_strategies:
  
  display(X_train.head(5))                        if k_verbose == True else None
  print("X_train            : ", type(X_train))   if k_verbose == True else None

  # Loop over feature engineering
  X_train_engineered = apply_feature_engineering(X_train, fes_fn, **engineering_params_sets[fes_id])
  X_test_engineered  = apply_feature_engineering(X_test,  fes_fn, **engineering_params_sets[fes_id])
  # DataFrame
  display(X_train_engineered.head(5))                       if k_verbose == True else None
  print("X_train_engineered : ", type(X_train_engineered))  if k_verbose == True else None

  # Loop over features selection
  for selection_id, selection_fn in feature_selection_strategies:
    X_train_selected, X_test_selected = apply_feature_selection(X_train_engineered, y_train, X_test_engineered, selection_fn, **selection_params_sets[selection_id])
    # DataFrame
    display(X_train_selected.head(5))                       if k_verbose == True else None
    print("X_train_selected   : ", type(X_train_selected))  if k_verbose == True else None
    
    # Loop over models
    for model_id, model_fn in models:
      print(f"{fes_id}-{selection_id}-{model_id} : ")
      scores = evaluate_model_scores(model_fn, models_params_sets[model_id], X_train_selected, y_train, X_test_selected, y_test)
      
      results_lst.append(
        {
          'Pre_processing'      : bPrepocess_df,
          'Feature_Engineering' : fes_id,
          'Feature_Selection'   : selection_id,
          'Model'               : model_id,
          'Accuracy'            : scores['accuracy'],
          'Precision'           : scores['precision'],
          'Recall'              : scores['recall'],
          'F1'                  : scores['f1']
        }
      )

results_df = pd.concat([pd.DataFrame([result]) for result in results_lst], ignore_index=True)
display(results_df.sort_values(by="F1", ascending=False))

trailer = datetime.now().strftime("%Y%m%d_%H%M%S")
out_file = "./assets/" + k_result_file + "-" + trailer + ".csv"
results_df.to_csv(out_file, encoding="utf8")


Unnamed: 0,num__age,num__new_user,num__total_pages_visited,num__weight,cat__country_Germany,cat__country_UK,cat__country_US,cat__source_Direct,cat__source_Seo
0,-1.276505,0.67613,-0.261847,0.229605,0.0,0.0,1.0,0.0,0.0
1,-0.188671,0.67613,-0.560909,0.26625,0.0,0.0,0.0,0.0,0.0
2,0.657423,-1.479005,-0.560909,0.36397,0.0,0.0,1.0,0.0,1.0
3,-0.913893,0.67613,0.9344,0.327325,0.0,0.0,1.0,0.0,1.0
4,1.261775,0.67613,-0.560909,-0.784242,0.0,1.0,0.0,0.0,0.0


X_train            :  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,num__age,num__new_user,num__total_pages_visited,num__weight,cat__country_Germany,cat__country_UK,cat__country_US,cat__source_Direct,cat__source_Seo
0,-1.276505,0.67613,-0.261847,0.229605,0.0,0.0,1.0,0.0,0.0
1,-0.188671,0.67613,-0.560909,0.26625,0.0,0.0,0.0,0.0,0.0
2,0.657423,-1.479005,-0.560909,0.36397,0.0,0.0,1.0,0.0,1.0
3,-0.913893,0.67613,0.9344,0.327325,0.0,0.0,1.0,0.0,1.0
4,1.261775,0.67613,-0.560909,-0.784242,0.0,1.0,0.0,0.0,0.0


X_train_engineered :  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,num__age,num__new_user,num__total_pages_visited,num__weight,cat__country_Germany,cat__country_UK,cat__country_US,cat__source_Direct,cat__source_Seo
0,-1.276505,0.67613,-0.261847,0.229605,0.0,0.0,1.0,0.0,0.0
1,-0.188671,0.67613,-0.560909,0.26625,0.0,0.0,0.0,0.0,0.0
2,0.657423,-1.479005,-0.560909,0.36397,0.0,0.0,1.0,0.0,1.0
3,-0.913893,0.67613,0.9344,0.327325,0.0,0.0,1.0,0.0,1.0
4,1.261775,0.67613,-0.560909,-0.784242,0.0,1.0,0.0,0.0,0.0


X_train_selected   :  <class 'pandas.core.frame.DataFrame'>
None-None-LogisticRegression_0 : 


Unnamed: 0,Pre_processing,Feature_Engineering,Feature_Selection,Model,Accuracy,Precision,Recall,F1
0,True,,,LogisticRegression_0,0.987455,0.812709,0.794118,0.803306


## Training on the whole dataset 
* No division between train and test set
* The idea is to leverage the maximum of observations to adjest model's parameters

In [423]:
X = df.drop(columns = k_target)
y = df[k_target]

display(X.head(2))  if k_verbose == True else None
print(X.shape)      if k_verbose == True else None
print(type(X))      if k_verbose == True else None


Unnamed: 0,country,age,new_user,source,total_pages_visited,weight
0,China,22,1,Direct,2,71
1,UK,21,1,Ads,3,44


(284580, 6)
<class 'pandas.core.frame.DataFrame'>


In [424]:
X = preprocessor.fit_transform(X)
print(X[0:5].round(3))  if k_verbose == True else None

# ! IMPORTANT because is this code we "play" with df NOT nd arrays
X = pd.DataFrame(X, columns=preprocessor.get_feature_names_out())
display(X_train.head()) if k_verbose == True else None

print(X.shape)          if k_verbose == True else None
print(type(X))          if k_verbose == True else None


[[-1.036  0.677 -0.86  -0.222  0.     0.     0.     1.     0.   ]
 [-1.157  0.677 -0.561 -0.552  0.     1.     0.     0.     0.   ]
 [-1.278 -1.476  2.731 -1.017  1.     0.     0.     0.     1.   ]
 [-0.915  0.677 -0.561  2.002  0.     0.     1.     0.     1.   ]
 [-0.31   0.677 -0.561  0.755  0.     0.     1.     1.     0.   ]]


Unnamed: 0,num__age,num__new_user,num__total_pages_visited,num__weight,cat__country_Germany,cat__country_UK,cat__country_US,cat__source_Direct,cat__source_Seo
0,-1.276505,0.67613,-0.261847,0.229605,0.0,0.0,1.0,0.0,0.0
1,-0.188671,0.67613,-0.560909,0.26625,0.0,0.0,0.0,0.0,0.0
2,0.657423,-1.479005,-0.560909,0.36397,0.0,0.0,1.0,0.0,1.0
3,-0.913893,0.67613,0.9344,0.327325,0.0,0.0,1.0,0.0,1.0
4,1.261775,0.67613,-0.560909,-0.784242,0.0,1.0,0.0,0.0,0.0


(284580, 9)
<class 'pandas.core.frame.DataFrame'>


In [425]:
# Retreive the settings of the best classifier
# fes       = feature engineering strategy
# fs        = feature selection
# model_id  = model id

id  = results_df['F1'].idxmax()

fes_id = results_df.at[id, "Feature_Engineering"]
for fes_current_id, fes_current_fn in feature_selection_strategies:
    if  fes_id == fes_current_id:
        fes_fn = fes_current_fn
        break

fs_id  = results_df.at[id, "Feature_Selection"]
for fs_current_id, fs_current_fn in feature_selection_strategies:
    if  fs_id == fs_current_id:
        fs_fn = fs_current_fn
        break

model_id = results_df.at[id, "Model"]
for model_current_id, model_current_fn in models:
    if  model_id == model_current_id:
        model_fn = model_current_fn
        break

# Now we got the parameters. We go through the whole process again
X_engineered  = apply_feature_engineering(X, fes_fn, **engineering_params_sets[fes_id])
print(type(X_engineered))   if k_verbose == True else None

X_selected, _ = apply_feature_selection(X_engineered, y, X_engineered, fs_fn, **selection_params_sets[fs_id])
print(type(X_selected))     if k_verbose == True else None

# We call explictily GridSearchCV from here because we need to access to best_params_ 
grid_search = GridSearchCV(model_fn, models_params_sets[model_id], cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_selected, y)


# TODO make a test. We should be able to remove those line
# TODO I suspect the model keep the best params

best_params = grid_search.best_params_
model_fn.set_params(**best_params)

print(X_selected.columns)   if k_verbose == True else None

model_fn.fit(X_selected, y) 

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Index(['num__age', 'num__new_user', 'num__total_pages_visited', 'num__weight',
       'cat__country_Germany', 'cat__country_UK', 'cat__country_US',
       'cat__source_Direct', 'cat__source_Seo'],
      dtype='object')


In [426]:
y_pred = model_fn.predict(X)
print(y_pred)                 if k_verbose == True else None


print(f"f1 \t\t precision \t recall")
print(f"{f1_score(y,  y_pred):.6f} \t {precision_score(y,  y_pred):.6f} \t {recall_score(y,  y_pred):.6f}")

[0 0 1 ... 0 0 0]
f1 		 precision 	 recall
0.799912 	 0.806241 	 0.793682


## Predictions on the dataset without label

In [427]:
df_no_labels = pd.read_csv('./assets/conversion_data_test.csv')
print(type(df_no_labels))   if k_verbose == True else None
print(df_no_labels.shape)   if k_verbose == True else None


<class 'pandas.core.frame.DataFrame'>
(31620, 5)


In [428]:
if(bPrepocess_df):
  print(f"shape : {df_no_labels.shape}")          if k_verbose == True else None
  X_no_labels = pre_preprocessing(df_no_labels) 
  print(f"shape : {X_no_labels.shape}")           if k_verbose == True else None
else:
  X_no_labels = df_no_labels.copy()

print(type(X_no_labels))                          if k_verbose == True else None

shape : (31620, 5)
shape : (31620, 6)
<class 'pandas.core.frame.DataFrame'>


In [429]:
X_no_labels = preprocessor.transform(X_no_labels)
print(X_no_labels.shape)            if k_verbose == True else None

# ! IMPORTANT : because in this script we work with df NOT nd array 
X_no_labels = pd.DataFrame(X_no_labels, columns=preprocessor.get_feature_names_out())

print(type(X_no_labels))            if k_verbose == True else None
print(X_no_labels.shape)            if k_verbose == True else None
# print(X_no_labels[0:5,:].round(3))  if k_verbose == True else None

(31620, 9)
<class 'pandas.core.frame.DataFrame'>
(31620, 9)


In [430]:
print(X_no_labels.columns)   if k_verbose == True else None
y_no_labels = model_fn.predict(X_no_labels)

display(y_no_labels)
print(y_no_labels.shape)


Index(['num__age', 'num__new_user', 'num__total_pages_visited', 'num__weight',
       'cat__country_Germany', 'cat__country_UK', 'cat__country_US',
       'cat__source_Direct', 'cat__source_Seo'],
      dtype='object')


array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

(31620,)


In [431]:
# https://stackoverflow.com/questions/73345752/linear-regression-predict-error-userwarning-x-does-not-have-valid-feature-na


data = {
  'converted' : model_fn.predict(X_no_labels)
}

y_predictions = pd.DataFrame(columns=['converted'], data = data)

trailer         = datetime.now().strftime("%Y%m%d_%H%M%S")
out_file = "./assets/" + k_header + k_author + "-" + trailer + ".csv"
y_predictions.to_csv(out_file, index=False)

