In [9]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score # confusion_matrix, 

from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures   
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression #, Perceptron

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# ---------------------------------------------------------------------------
# Notebook-wide settings
# ---------------------------------------------------------------------------
# Name of the label column in the training data.
k_target = "converted"
# Fraction of the observations to take into account; keep at 1.0 for final testing.
k_samples_ratio = 1.0
# Share of the rows held out as the test split (test_size of train_test_split).
k_test_size = 0.2
# Seed passed as random_state so the split is reproducible.
k_random_state = 42
# Leading part of the exported predictions file name.
header = "conversion_data_test_predictions_"
# Author tag that follows the header in the exported file name.
author = "PHILIPPE"


In [10]:
# Load the labelled training data.
df = pd.read_csv('./assets/conversion_data_train.csv')

# Apply k_samples_ratio, which is declared in the configuration cell as the
# share of observations to take into account. Below 1 the notebook works on a
# reproducible random subset (seeded with k_random_state); at 1 (the setting
# for final runs) every row is kept and the frame is left untouched.
if k_samples_ratio < 1:
    df = df.sample(frac=k_samples_ratio, random_state=k_random_state)

## Entraînement classique sur train et test set

In [11]:
# Separate the label column (named by k_target) from the predictors.
y = df[k_target]
X = df.drop(columns=k_target)

# Hold out k_test_size of the rows. The split is seeded with k_random_state
# and stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=k_test_size,
    random_state=k_random_state,
    stratify=y,
)

In [12]:
# Column groups, decided by dtype: numeric columns on one side, everything else on the other.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[("scaler_num", StandardScaler())])

# Non-numeric columns are one-hot encoded; drop="first" removes the first
# level of each feature from the encoded output.
categorical_transformer = Pipeline(steps=[("encoder_cat", OneHotEncoder(drop="first"))])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [13]:
# The preprocessor is fitted on the training split only and then reused, as
# fitted, on the held-out split (tuple items are evaluated left to right, so
# the fit happens first).
# NOTE: both names are rebound to the transformed data, so this cell is meant
# to be run once right after the split cell.
X_train, X_test = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

In [14]:
# Model: polynomial feature expansion followed by a logistic regression.
pipe = Pipeline(steps=[
    ("poly", PolynomialFeatures()),
    ("logit", LogisticRegression())
])

# Hyper-parameter grid explored by the cross-validated search.
# The unpenalised model is requested with None: the legacy string "none" was
# deprecated in scikit-learn 1.2 and is rejected from 1.4 onwards, where every
# candidate using it would fail to fit.
params = {
    "poly__degree"      : [1],
    "logit__C"          : [3, 0.1],
    "logit__penalty"    : [None, "l2"],
}

# 3-fold cross-validation, candidates compared on F1, all CPU cores used.
gridsearch = GridSearchCV(pipe, param_grid = params, cv = 3, scoring="f1", n_jobs = -1)
gridsearch.fit(X_train, y_train)

# Score the refitted best candidate on the held-out split.
best_estimator = gridsearch.best_estimator_
y_pred = best_estimator.predict(X_test)

print("f1 \t\t precision \t recall")
print(f"{f1_score(y_test,  y_pred):.6f} \t {precision_score(y_test,  y_pred):.6f} \t {recall_score(y_test,  y_pred):.6f}")




f1 		 precision 	 recall
0.768485 	 0.866120 	 0.690632


## Entraînement sur l'ensemble du jeu de données

In [15]:
# Rebuild the target and the feature matrix from every labelled row.
# The preprocessor is re-fitted here on the full data set, replacing the fit
# previously obtained from the training split.
y = df[k_target]
X = preprocessor.fit_transform(df.drop(columns=k_target))

In [16]:
# Run the same grid search again, this time on every labelled row, and keep
# the refitted best candidate.
gridsearch.fit(X, y)
best_estimator = gridsearch.best_estimator_

# NOTE: the predictions below are made on the rows the model was just fitted
# on, so these scores are in-sample figures rather than a hold-out estimate.
y_pred = best_estimator.predict(X)

scores = tuple(metric(y, y_pred) for metric in (f1_score, precision_score, recall_score))
print(f"f1 \t\t precision \t recall")
print(f"{scores[0]:.6f} \t {scores[1]:.6f} \t {scores[2]:.6f}")



f1 		 precision 	 recall
0.763880 	 0.855176 	 0.690196


## Prédictions sur le jeu sans label

In [17]:
# Read the unlabelled rows and push them through the already-fitted
# preprocessor (transform only, no re-fit).
X_no_labels = preprocessor.transform(
    pd.read_csv('./assets/conversion_data_test.csv')
)

In [18]:
# Predict a label for every unlabelled row and store it in a single
# 'converted' column.
data = {'converted': best_estimator.predict(X_no_labels)}
y_predictions = pd.DataFrame(data=data, columns=['converted'])

# File name = header + author + "-" + timestamp (YYYYmmdd_HHMMSS), written
# under ./assets/ without the index column.
trailer = datetime.now().strftime("%Y%m%d_%H%M%S")
out_file = f"./assets/{header}{author}-{trailer}.csv"
y_predictions.to_csv(out_file, index=False)