In [1]:
import pandas as pd
from datetime import datetime


from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score # confusion_matrix, 

from sklearn.preprocessing import OneHotEncoder, StandardScaler     #, LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression #, Perceptron

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


k_target        = "converted"
k_samples_ratio = 100/100   # percentage of observation to be taken into account. Pass 100/100 for final testing 
k_test_size     = 20/100    # see train_test_split
k_random_state  = 42        # you know why...
header          = "conversion_data_test_predictions_"
author          = "PHILIPPE"
trailer         = datetime.now().strftime("%Y%m%d_%H%M%S")


In [2]:
df = pd.read_csv('./assets/conversion_data_train.csv')
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [3]:
print(f"shape : {df.shape}\n")
print()

display(df.describe(include="all").T)

shape : (284580, 6)




Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
country,284580.0,4.0,US,160124.0,,,,,,,
age,284580.0,,,,30.564203,8.266789,17.0,24.0,30.0,36.0,123.0
new_user,284580.0,,,,0.685452,0.464336,0.0,0.0,1.0,1.0,1.0
source,284580.0,3.0,Seo,139477.0,,,,,,,
total_pages_visited,284580.0,,,,4.873252,3.341995,1.0,2.0,4.0,7.0,29.0
converted,284580.0,,,,0.032258,0.176685,0.0,0.0,0.0,0.0,1.0


In [4]:
print(df.info(), "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284580 entries, 0 to 284579
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   country              284580 non-null  object
 1   age                  284580 non-null  int64 
 2   new_user             284580 non-null  int64 
 3   source               284580 non-null  object
 4   total_pages_visited  284580 non-null  int64 
 5   converted            284580 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 13.0+ MB
None 



In [5]:
print(f"Number of null val :")
print(100 * df.isnull().sum() / df.shape[0])
# print (df.isnull().any().any())

Number of null val :
country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64


In [6]:
print("Duplicates     : ", df.duplicated().sum())
print("Col duplicated : ", df.columns.duplicated() )

print()
print("Unique countries : ", df["country"].unique())
print("Unique sources   : ", df["source"].unique())

# print(df["col_name"].value_counts())
# print(df.isnull().sum().sort_values(ascending=False).head(11))
# df[k_target].value_counts()


Duplicates     :  268769
Col duplicated :  [False False False False False False]

Unique countries :  ['China' 'UK' 'Germany' 'US']
Unique sources   :  ['Direct' 'Ads' 'Seo']


## Preprocessing on df

In [7]:
def add_weight_col(df):
  
  # print(f"shape : {df.shape}\n")
  # df.drop_duplicates(inplace=True)
  # print(f"shape : {df.shape}\n")

  # print(f"shape : {df.shape}")

  # Créer une colonne avec l poids des doublons
  # Supprimer les doublons
  df_no_duplicates = df.drop_duplicates()

  # Compter le nombre d'occurrences de chaque ligne dans le DataFrame d'origine
  counts = df.groupby(df.columns.tolist()).size().reset_index(name='poids')

  # Fusionner la colonne occurences avec le df sans doublons
  df = pd.merge(df_no_duplicates, counts, on=df.columns.tolist(), how='left')

  # print(f"shape : {df.shape}")
  # df.to_csv("./assets/pourbench_colab.csv")
  return df  






In [8]:
print(f"shape : {df.shape}")

df = add_weight_col(df)  

print(f"shape : {df.shape}")
 

# On peut ici limiter la taille de df pour aller plus vite par exemple  
# df = df.sample(int(k_samples_ratio*len(df)))
# df = df.iloc[:int(k_samples_ratio*len(df))]


shape : (284580, 6)
shape : (15811, 7)


In [9]:

# Split entre X et y
X = df.drop(columns = k_target)
y = df[k_target]

print("X :")
print(X.head())
print(X.shape)
print()

print("y :")
print(y.head())

# Des dataframes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=k_test_size, random_state=k_random_state, stratify = y)

print(X_train.shape)
print(type(X_train))


X :
   country  age  new_user  source  total_pages_visited  poids
0    China   22         1  Direct                    2     71
1       UK   21         1     Ads                    3     44
2  Germany   20         0     Seo                   14      6
3       US   23         1     Seo                    3    253
4       US   28         1  Direct                    3    151
(15811, 6)

y :
0    0
1    0
2    1
3    0
4    0
Name: converted, dtype: int64
(12648, 6)
<class 'pandas.core.frame.DataFrame'>


In [10]:
numeric_features = X.select_dtypes(include="number").columns
print(numeric_features)

categorical_features = X.select_dtypes(exclude="number").columns
print(categorical_features)

numeric_transformer = Pipeline(
  steps=[
    ("imputer_num", SimpleImputer(strategy="mean")),
    ("scaler_num", StandardScaler()),
  ]
)

categorical_transformer = Pipeline(
  steps=[
      ("imputer_cat", SimpleImputer(strategy="most_frequent")),  
      # ("imputer_cat", SimpleImputer(fill_value="missing", strategy="constant")),  
      ("encoder_cat", OneHotEncoder(drop="first")),                 
      # ("encoder_cat", OneHotEncoder(handle_unknown='ignore', sparse=False)),                 
    ]
  )

preprocessor = ColumnTransformer(
  transformers=[
      ("num", numeric_transformer,     numeric_features),
      ("cat", categorical_transformer, categorical_features),
    ]
  )



Index(['age', 'new_user', 'total_pages_visited', 'poids'], dtype='object')
Index(['country', 'source'], dtype='object')


In [11]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# des nd array
print(X_train[0:5].round(3))
print(X_train.shape)
print(type(X_train))


[[ 1.96   1.001 -0.719 -0.453  0.     0.     0.     0.     0.   ]
 [-1.39   1.001 -0.349  0.671  0.     0.     0.     0.     1.   ]
 [ 0.373  1.001  0.576 -0.453  0.     0.     0.     0.     1.   ]
 [-1.39  -0.999 -1.274  0.138  0.     0.     0.     1.     0.   ]
 [ 0.02   1.001  0.576 -0.425  0.     0.     1.     0.     1.   ]]
(12648, 9)
<class 'numpy.ndarray'>


In [12]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# clf = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=123, solver='lbfgs',tol=.0001, verbose=0, warm_start=False) # 
clf = LogisticRegression(C=100, max_iter=1000, random_state=k_random_state) 

clf.fit(X_train, y_train)



y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [13]:
print(f"f1 \t\t precision \t recall")
print(f"{f1_score(y_test,  y_test_pred):.6f} \t {precision_score(y_test,  y_test_pred):.6f} \t {recall_score(y_test,  y_test_pred):.6f}")

f1 		 precision 	 recall
0.791617 	 0.771295 	 0.813038


## Entrainement sur l'ensemble du jeu de données 
* sans le diviser en train et test
* L'idée c'est d'utiliser un max d'observations pour ajuster les paramètres du modèle

In [14]:
X = df.drop(columns = k_target)
y = df[k_target]

print(X.shape)
print(type(X))


(15811, 6)
<class 'pandas.core.frame.DataFrame'>


In [15]:
X = preprocessor.fit_transform(X)

print(X[0:5].round(3))
print(X.shape)
print(type(X))


[[-1.035  0.999 -1.282  1.481  0.     0.     0.     1.     0.   ]
 [-1.122  0.999 -1.096  0.726  0.     1.     0.     0.     0.   ]
 [-1.21  -1.001  0.945 -0.335  1.     0.     0.     0.     1.   ]
 [-0.947  0.999 -1.096  6.565  0.     0.     1.     0.     1.   ]
 [-0.508  0.999 -1.096  3.715  0.     0.     1.     1.     0.   ]]
(15811, 9)
<class 'numpy.ndarray'>


In [16]:
clf = LogisticRegression(C=100, max_iter=1000, random_state=k_random_state) 


print(X.columns)
clf.fit(X, y)


In [17]:
y_pred = clf.predict(X)

print(f"f1 \t\t precision \t recall")
print(f"{f1_score(y,  y_pred):.6f} \t {precision_score(y,  y_pred):.6f} \t {recall_score(y,  y_pred):.6f}")

f1 		 precision 	 recall
0.799226 	 0.785731 	 0.813192


## Predictions sur le jeu sans label

In [18]:
df_no_labels = pd.read_csv('./assets/conversion_data_test.csv')
print(type(df_no_labels))
print(df_no_labels.shape)

X_no_labels = add_weight_col(df_no_labels)  
print(type(X_no_labels))
print(X_no_labels.shape)



<class 'pandas.core.frame.DataFrame'>
(31620, 5)
<class 'pandas.core.frame.DataFrame'>
(7480, 6)


In [19]:
X_no_labels = preprocessor.transform(X_no_labels)

print(type(X_no_labels))
print(X_no_labels.shape)
print(X_no_labels[0:5,:].round(3))

<class 'numpy.ndarray'>
(7480, 9)
[[-0.508 -1.001  1.316 -0.475  0.     1.     0.     0.     1.   ]
 [-1.035  0.999 -0.725 -0.419  0.     1.     0.     1.     0.   ]
 [-0.156  0.999 -1.468 -0.196  0.     0.     0.     0.     1.   ]
 [-0.156  0.999 -0.54   0.056  0.     0.     1.     0.     0.   ]
 [-0.771 -1.001 -1.096 -0.196  0.     0.     0.     0.     1.   ]]


In [20]:
data = {
  'converted': clf.predict(X_no_labels)
}

Y_predictions = pd.DataFrame(columns=['converted'], data=data)

out_file = "./assets/" + header + author + "-" + trailer + ".csv"
Y_predictions.to_csv(out_file, index=False)

