In [264]:
import pandas as pd
from datetime import datetime


from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score # confusion_matrix, 

from sklearn.preprocessing import OneHotEncoder, StandardScaler     #, LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression #, Perceptron

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


k_target        = "converted"
k_samples_ratio = 100/100   # percentage of observation to be taken into account. Pass 100/100 for final testing 
k_test_size     = 20/100    # see train_test_split
k_random_state  = 42        # you know why...
header          = "conversion_data_test_predictions_"
author          = "PHILIPPE"
trailer         = datetime.now().strftime("%Y%m%d_%H%M%S")


In [265]:
df = pd.read_csv('./assets/conversion_data_train.csv')
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [266]:

print(f"shape : {df.shape}\n")

print(df.info(), "\n")

print(f"# of null val :")
print(100 * df.isnull().sum() / df.shape[0])

# display(df.head())
# print(df.describe(include="all").T)
# print(df.duplicated().sum())
# print (df.isnull().any().any())

print()
print("Unique countries : ", df["country"].unique())
print("Unique sources   : ", df["source"].unique())
# print(df["col_name"].value_counts())
# print(df.isnull().sum().sort_values(ascending=False).head(11))
# df[k_target].value_counts()


shape : (284580, 6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284580 entries, 0 to 284579
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   country              284580 non-null  object
 1   age                  284580 non-null  int64 
 2   new_user             284580 non-null  int64 
 3   source               284580 non-null  object
 4   total_pages_visited  284580 non-null  int64 
 5   converted            284580 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 13.0+ MB
None 

# of null val :
country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64

Unique countries :  ['China' 'UK' 'Germany' 'US']
Unique sources   :  ['Direct' 'Ads' 'Seo']


In [267]:
# df = df.sample(int(k_samples_ratio*len(df)))
# df_nouveau = df.iloc[:int(k_samples_ratio*len(df))]

X = df.loc[:, df.columns != k_target]
y = df[k_target]

print("X :")
print(X.head())
print(X.shape)
print()

print("y :")
print(y.head())




X :
   country  age  new_user  source  total_pages_visited
0    China   22         1  Direct                    2
1       UK   21         1     Ads                    3
2  Germany   20         0     Seo                   14
3       US   23         1     Seo                    3
4       US   28         1  Direct                    3
(284580, 5)

y :
0    0
1    0
2    1
3    0
4    0
Name: converted, dtype: int64


In [268]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=k_test_size, random_state=k_random_state, stratify = y)

In [269]:
numeric_features = X.select_dtypes(include="number").columns
print(numeric_features)

categorical_features = X.select_dtypes(exclude="number").columns
print(categorical_features)

numeric_transformer = Pipeline(
  steps=[
    ("imputer_num", SimpleImputer(strategy="mean")),
    ("scaler_num", StandardScaler()),
  ]
)

categorical_transformer = Pipeline(
  steps=[
      ("imputer_cat", SimpleImputer(strategy="most_frequent")),  
      # ("imputer_cat", SimpleImputer(fill_value="missing", strategy="constant")),  
      ("encoder_cat", OneHotEncoder(drop="first")),                 
      # ("encoder_cat", OneHotEncoder(handle_unknown='ignore', sparse=False)),                 
    ]
  )

preprocessor = ColumnTransformer(
  transformers=[
      ("num", numeric_transformer,     numeric_features),
      ("cat", categorical_transformer, categorical_features),
    ]
  )

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
X_train[0:5]


Index(['age', 'new_user', 'total_pages_visited'], dtype='object')
Index(['country', 'source'], dtype='object')


array([[-1.27650481,  0.6761303 , -0.2618471 ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.18867057,  0.6761303 , -0.56090876,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.65742272, -1.47900486, -0.56090876,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ],
       [-0.9138934 ,  0.6761303 ,  0.93439955,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ],
       [ 1.26177508,  0.6761303 , -0.56090876,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ]])

In [270]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# clf = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=123, solver='lbfgs',tol=.0001, verbose=0, warm_start=False) # 
clf = LogisticRegression(C=100, max_iter=1000, random_state=k_random_state) 

clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [271]:
print(f"f1 \t\t precision \t recall")
print(f"{f1_score(y_test,  y_test_pred):.6f} \t {precision_score(y_test,  y_test_pred):.6f} \t {recall_score(y_test,  y_test_pred):.6f}")

f1 		 precision 	 recall
0.768485 	 0.866120 	 0.690632


On refait un entrainement mais sur tout le jeu sans le diviser en train et test

In [272]:
X = df.loc[:, df.columns != k_target]
y = df[k_target]

print(X.shape)
print(type(X))


(284580, 5)
<class 'pandas.core.frame.DataFrame'>


In [273]:
X = preprocessor.fit_transform(X)
X_train[0:5]


array([[-1.27650481,  0.6761303 , -0.2618471 ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.18867057,  0.6761303 , -0.56090876,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.65742272, -1.47900486, -0.56090876,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ],
       [-0.9138934 ,  0.6761303 ,  0.93439955,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ],
       [ 1.26177508,  0.6761303 , -0.56090876,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ]])

In [274]:
clf = LogisticRegression(C=100, max_iter=1000, random_state=k_random_state) 
clf.fit(X, y)


In [275]:
df_no_labels = pd.read_csv('./assets/conversion_data_test.csv')
print(df_no_labels.shape)


X_no_labels = df.drop(columns=k_target)

# Convert pandas DataFrames to numpy arrays before using scikit-learn
# X_no_labels = X_no_labels.values
# print(X_no_labels[0:5,:])
print(X_no_labels.head())

(31620, 5)
   country  age  new_user  source  total_pages_visited
0    China   22         1  Direct                    2
1       UK   21         1     Ads                    3
2  Germany   20         0     Seo                   14
3       US   23         1     Seo                    3
4       US   28         1  Direct                    3


In [276]:
X_no_labels = preprocessor.transform(X_no_labels)

print(df_no_labels.shape)
print(X_no_labels[0:5,:])

(31620, 5)
[[-1.03597878  0.67741482 -0.85974314  0.          0.          0.
   1.          0.        ]
 [-1.15694494  0.67741482 -0.5605201   0.          1.          0.
   0.          0.        ]
 [-1.2779111  -1.47620036  2.73093333  1.          0.          0.
   0.          1.        ]
 [-0.91501263  0.67741482 -0.5605201   0.          0.          1.
   0.          1.        ]
 [-0.31018183  0.67741482 -0.5605201   0.          0.          1.
   1.          0.        ]]


In [277]:



data = {
  'converted': clf.predict(X_no_labels)
}

Y_predictions = pd.DataFrame(columns=['converted'], data=data)

out_file = "./assets/" + header + author + "-" + trailer + ".csv"
Y_predictions.to_csv(out_file, index=False)

