In [224]:
import pandas as pd
import numpy as np
from datetime import datetime


from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures   
from sklearn.feature_selection import SelectKBest, f_classif, chi2

k_result_file   = "results"
k_target        = "converted"
k_samples_ratio = 10/100  # percentage of observation to be taken into account. Pass 100/100 for final testing 
k_test_size     = 20/100  # see train_test_split
k_random_state  = 42      # you know why...


In [225]:
df = pd.read_csv('./assets/conversion_data_train.csv')
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [226]:
print(f"Shape :")
print(df.shape)
print()

print(f"Numnber of null val :")
print(100 * df.isnull().sum() / df.shape[0])

print(f"\nNumber of unique category :")
print("Unique countries : ", df["country"].unique())
print("Unique sources   : ", df["source"].unique())

# display(df.head())
# print(df.describe(include="all").T)
# print(df.duplicated().sum())
# print (df.isnull().any().any())
# print(f"Info :\n", df.info(), "\n")
# print(df["col_name"].value_counts())
# print(df.isnull().sum().sort_values(ascending=False).head(11))
# df[k_target].value_counts()


Shape :
(284580, 6)

Numnber of null val :
country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64

Number of unique category :
Unique countries :  ['China' 'UK' 'Germany' 'US']
Unique sources   :  ['Direct' 'Ads' 'Seo']


In [227]:
# df = df.sample(int(k_samples_ratio*len(df)))
df_nouveau = df.iloc[:int(k_samples_ratio*len(df))]


X = df.drop(k_target, axis=1)
X = df.drop(columns=k_target)

y = df[k_target]


In [228]:
print("X :")
print(X.head())
print(X.shape)
print()

print("y :")
print(y.head())

X :
   country  age  new_user  source  total_pages_visited
0    China   22         1  Direct                    2
1       UK   21         1     Ads                    3
2  Germany   20         0     Seo                   14
3       US   23         1     Seo                    3
4       US   28         1  Direct                    3
(284580, 5)

y :
0    0
1    0
2    1
3    0
4    0
Name: converted, dtype: int64


In [229]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=k_test_size, random_state=k_random_state, stratify = y)


In [230]:

# TODO Faire une fonction
print(f"Shape of X_train : {X_train.shape}")
print()
print(f"Shape of X_train : {X_test.shape}")
print()
print(f"Shape of X_train : {y_train.shape}")
print()
print(f"Shape of X_train : {y_test.shape}")
print()


Shape of X_train : (227664, 5)

Shape of X_train : (56916, 5)

Shape of X_train : (227664,)

Shape of X_train : (56916,)



In [231]:
numeric_features = X.select_dtypes(include="number").columns
print(numeric_features)

categorical_features = X.select_dtypes(exclude="number").columns
print(categorical_features)



Index(['age', 'new_user', 'total_pages_visited'], dtype='object')
Index(['country', 'source'], dtype='object')


In [232]:

numeric_transformer = Pipeline(
  steps=[
    ("imputer_num", SimpleImputer()),
    ("scaler_num", StandardScaler()),
  ]
)

categorical_transformer = Pipeline(
  steps=[
      ("imputer_cat", SimpleImputer(fill_value="missing", strategy="constant")),  
      # ("encoder_cat", OneHotEncoder(drop="first")),                 
      ("encoder_cat", OneHotEncoder(handle_unknown='ignore', sparse=False)),                 
    ]
  )

preprocessor = ColumnTransformer(
  transformers=[
      ("num", numeric_transformer,     numeric_features),
      ("cat", categorical_transformer, categorical_features),
    ]
  )


In [233]:
X_train = preprocessor.fit_transform(X_train)
print(X_train[0:5])
X_train = pd.DataFrame(X_train, columns=preprocessor.get_feature_names_out())
display(X_train.head())

X_test = preprocessor.transform(X_test)
X_test = pd.DataFrame(X_test, columns=preprocessor.get_feature_names_out())

[[-1.27650481  0.6761303  -0.2618471   0.          0.          0.
   1.          1.          0.          0.        ]
 [-0.18867057  0.6761303  -0.56090876  1.          0.          0.
   0.          1.          0.          0.        ]
 [ 0.65742272 -1.47900486 -0.56090876  0.          0.          0.
   1.          0.          0.          1.        ]
 [-0.9138934   0.6761303   0.93439955  0.          0.          0.
   1.          0.          0.          1.        ]
 [ 1.26177508  0.6761303  -0.56090876  0.          0.          1.
   0.          1.          0.          0.        ]]




Unnamed: 0,num__age,num__new_user,num__total_pages_visited,cat__country_China,cat__country_Germany,cat__country_UK,cat__country_US,cat__source_Ads,cat__source_Direct,cat__source_Seo
0,-1.276505,0.67613,-0.261847,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-0.188671,0.67613,-0.560909,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.657423,-1.479005,-0.560909,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-0.913893,0.67613,0.9344,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.261775,0.67613,-0.560909,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [234]:
# print(X_train[5:,].round(3))

# print()
# print(X_test[5:,].round(3))


X_train.head()

Unnamed: 0,num__age,num__new_user,num__total_pages_visited,cat__country_China,cat__country_Germany,cat__country_UK,cat__country_US,cat__source_Ads,cat__source_Direct,cat__source_Seo
0,-1.276505,0.67613,-0.261847,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-0.188671,0.67613,-0.560909,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.657423,-1.479005,-0.560909,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-0.913893,0.67613,0.9344,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.261775,0.67613,-0.560909,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [235]:
def apply_feature_engineering(data, strategy='None', **kwargs):

  """
  Applies a feature engineering strategy to the data.      

  Args: 
  - data (DataFrame)  : The DataFrame containing the initial data.     
  - strategy (str)    : The feature engineering strategy to apply.     
  - kwargs            : Parameters specific to the feature engineering strategy.      

  Returns: DataFrame  : The DataFrame containing the transformed data.     
  """


  match strategy:

    case 'None':
      transformed_df = data.copy()
     
    case 'polynomial_features':
      degree = kwargs.get('degree', 2)          # 2 by default
      poly = PolynomialFeatures(degree=degree)
      transformed_data = poly.fit_transform(data)

      original_feature_names = data.columns
      feature_combinations = poly.powers_

      # Generate names for the new features
      feature_names = [""]
      for feature_combination in feature_combinations[1:]:
          new_feature_name = "*".join([f"{orig_feature}^{power}" if power > 1 else orig_feature for orig_feature, power in zip(original_feature_names, feature_combination)])
          feature_names.append(new_feature_name)

      # new df - transformed features and their names
      transformed_df = pd.DataFrame(transformed_data, columns=feature_names)


      # columns = poly.get_feature_names_out(data.columns)
      # transformed_df = pd.DataFrame(transformed_data, columns=columns)

    case 'log_transform':
      features_to_transform = kwargs.get('features_to_transform', [])
      transformed_df = data.copy()
      transformed_df[features_to_transform] = np.log(data[features_to_transform] + 1) # log neperien

    case 'custom_feature_engineering':
      # Design your own pizza
      # One can use kwargs
      transformed_df = data.copy()
      # Ajouter ici votre logique de feature engineering personnalisée

    case _:
      raise ValueError("Feature engineering strategy not recognized.")

  return transformed_df

In [236]:
def apply_feature_selection(X_train, y_train, X_test, feature_selection_method='None', **kwargs):

  match feature_selection_method:
      case "None":
        X_train_selected_df = X_train
        X_test_selected_df = X_test
               
      case 'SelectKBest':
        k = kwargs.get('k', 10)                            # 10 by default
        if df.shape[1]<k:
          k = df.shape[1]
        selector = SelectKBest(score_func=f_classif, k=k)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)

        X_train_selected_df = pd.DataFrame(X_train_selected, columns=X_train.columns[selector.get_support()])
        X_test_selected_df = pd.DataFrame(X_test_selected, columns=X_train.columns[selector.get_support()])



      case 'chi2':
        selector = SelectKBest(score_func=chi2)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)

        X_train_selected_df = pd.DataFrame(X_train_selected, columns=X_train.columns[selector.get_support()])
        X_test_selected_df = pd.DataFrame(X_test_selected, columns=X_train.columns[selector.get_support()])

      case 'custom_feature_selection':
        # Design your own pizza
        # One can use kwargs
        X_train_selected_df = X_train
        X_test_selected_df = X_test
      
      case _:
        raise ValueError("Feature selection method not recognized.")

  return X_train_selected_df, X_test_selected_df


In [237]:
def evaluate_model_scores(model, params, X_train, y_train, X_test, y_test):

  grid_search = GridSearchCV(model, params, cv=5, scoring='f1', n_jobs=-1)
  grid_search.fit(X_train, y_train)

  best_params = grid_search.best_params_

  model.set_params(**best_params)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  scores = {
    'accuracy'  : accuracy_score(y_test, y_pred),
    'precision' : precision_score(y_test, y_pred),
    'recall'    : recall_score(y_test, y_pred),
    'f1'        : f1_score(y_test, y_pred)
  }

  return scores


In [238]:
# DataFrame to store results
results_df = pd.DataFrame(columns=['Feature_Engineering', 'Feature_Selection', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1'])



# Features Engineering strategies

In [239]:
# Strategies for feature engineering
feature_engineering_strategies = [
  'None',
  'polynomial_features',         # degree
  # 'log_transform',             # features_to_transform
]

engineering_params = {
  'None'                : {},
  'polynomial_features' : {'degree':2},                         
  'log_transform'       : {'features_to_transform': [0, 1, 2]}, # ! PAS TESTE !!!!!!!!!!!!!!!!!!!! 
}


# Features Selection Strategies

In [240]:
# Strategies for feature selection
feature_selection_strategies = [
  ('None', 'None'),
  ("SelectKBest", 'SelectKBest_2'), 
  ('SelectKBest', "SelectKBest_1") , 
  # 'chi2',                            # !!! PAS TESTE
]

selection_params_sets = {
  'None'          : {},
  'SelectKBest_1' : {'k':1},                         
  'SelectKBest_2' : {'k':2},                         
  'chi2'          : {}, 
}


# Models & Hyperparameters

In [241]:
# Models
models = [
    ("LogisticRegression_0", LogisticRegression()),
    ("LogisticRegression_1", LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    # ('SVM', SVC()),
    # ('KNN', KNeighborsClassifier()),
    # ('Logistic Regression', LogisticRegression())
]

# Hyperparamters to test with each model
models_params = {
    'LogisticRegression_0'  : {},
    'LogisticRegression_1'  : {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
    'Random Forest'         : {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]},
    'SVM'                   : {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'KNN'                   : {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
}


In [242]:
results_lst=[]
for engineering_name in feature_engineering_strategies:

  # display(X_train.head(5))
  # print("X_train            : ", type(X_train))

  # Loop over feature engineering
  X_train_engineered = apply_feature_engineering(X_train, engineering_name, **engineering_params[engineering_name])
  X_test_engineered  = apply_feature_engineering(X_test,  engineering_name, **engineering_params[engineering_name])
  # display(X_train_engineered.head(5))
  # print("X_train_engineered : ", type(X_train_engineered))

  # Loop over features selection
  for selection_method, selection_params_id in feature_selection_strategies:
    X_train_selected, X_test_selected = apply_feature_selection(X_train_engineered, y_train, X_test_engineered, selection_method, **selection_params_sets[selection_params_id])
    # display(X_train_selected.head(5))
    # print("X_train_selected   : ", type(X_train_selected))
    
    # Loop over models
    for model_name, model in models:
      scores = evaluate_model_scores(model, models_params[model_name], X_train_selected, y_train, X_test_selected, y_test)
      
      results_lst.append(
        {
            'Feature_Engineering': engineering_name,
            'Feature_Selection': selection_params_id,
            'Model': model_name,
            'Accuracy': scores['accuracy'],
            'Precision': scores['precision'],
            'Recall': scores['recall'],
            'F1': scores['f1']
        }
      )

results_df = pd.concat([pd.DataFrame([result]) for result in results_lst], ignore_index=True)
display(results_df.sort_values(by="F1", ascending=False))

out_file = k_result_file + "_" + datetime.now().strftime("%Y%m%d_%H%M%S") + ".csv"
results_df.to_csv(out_file, encoding="utf8")


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\phili\anaconda3\envs\jedha\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\phili\anaconda3\envs\jedha\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\phili\anaconda3\envs\jedha\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^

Unnamed: 0,Feature_Engineering,Feature_Selection,Model,Accuracy,Precision,Recall,F1
0,,,LogisticRegression_0,0.986559,0.865529,0.690632,0.768252
1,,,LogisticRegression_1,0.986559,0.865529,0.690632,0.768252
9,polynomial_features,,LogisticRegression_0,0.986278,0.857627,0.688998,0.76412
10,polynomial_features,,LogisticRegression_1,0.986278,0.858113,0.688453,0.763977
11,polynomial_features,,Random Forest,0.986243,0.858407,0.686819,0.763086
2,,,Random Forest,0.986032,0.85529,0.682462,0.759164
3,,SelectKBest_2,LogisticRegression_0,0.984872,0.846975,0.648148,0.734341
4,,SelectKBest_2,LogisticRegression_1,0.984872,0.846975,0.648148,0.734341
5,,SelectKBest_2,Random Forest,0.984872,0.846975,0.648148,0.734341
6,,SelectKBest_1,LogisticRegression_0,0.983555,0.83532,0.610566,0.705475
