# 1. Preliminary

## 1.1 Context

blabla

## 1.2 Requirements

* pandas
* numpy
* scikit-learn
* matplotlib

## 1.3 Imports

In [None]:
# builtin 
import os


# data
import pandas as pd
import numpy as np


# preprocessing
from sklearn.preprocessing import LabelEncoder
# metrics
from sklearn.metrics import roc_curve, auc, confusion_matrix
# estimators
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# metrics
from sklearn.metrics import roc_curve, auc

# visualisation
import matplotlib.pyplot as plt


## 1.4 Data

In [None]:
# Remember the current working directory; the relative paths used below are
# resolved against it.
# NOTE(review): `pwd` is not read anywhere in the visible part of the notebook.
pwd = os.getcwd()

In [None]:
# Show what sits next to the notebook (the current directory).
os.listdir(".")

In [None]:
# List the contents of the "img" folder (relative to the working directory).
os.listdir("img")

<img src="img/tree.jpg"
     alt="img/tree.jpg" />

In [None]:
# Two variables that locate the input file.

path = "./data/source/"           # folder that contains the data file
filename = "TP_2_datset_mushrooms.csv"   # name of the .csv file

In [None]:
# Load the dataset. os.path.join inserts the separator only when needed, so the
# cell keeps working if `path` is edited to drop its trailing slash.
df = pd.read_csv(os.path.join(path, filename))

In [None]:
# Sanity check: the file was read and the first rows look as expected.

df.head(n=5)

# 2. First Tour

## 2.1 Display

In [None]:
# First five rows.
df.head(n=5)

In [None]:
# Last five rows.
df.tail(n=5)

In [None]:
# Ten rows picked at random; the fixed seed makes the displayed sample
# identical after a kernel restart.
df.sample(10, random_state=42)

## 2.2 Structure

In [None]:
# Column names, dtypes, non-null counts and memory footprint.
df.info()

In [None]:
# Number of columns per dtype.
df.dtypes.value_counts()

In [None]:
# Count of distinct values in each column.
df.nunique(axis=0)

## 2.3 NaN & Duplicates

In [None]:
# Share of missing values per column (0.0 means no missing value).
df.isnull().mean()

In [None]:
# Number of rows that repeat an earlier row exactly.
df.duplicated(keep="first").sum()

## 2.4 Data inspection

In [None]:
# Summary statistics of the raw columns.
df.describe()

In [None]:
# Absolute frequency of each value in the first column.
df[df.columns[0]].value_counts()

In [None]:
# Relative frequency of each value in the first column, to two decimals.
df[df.columns[0]].value_counts(normalize=True).round(2)

In [None]:
# Same distribution of the first column, drawn as a pie chart.
df.iloc[:, 0].value_counts().plot.pie()

# 3. Cleaning

## 3.1 Label encoding 

In [None]:
# Replace every column of `df`, in place, by integer codes.
# The single encoder is refit on each column, so after the loop `labelencoder`
# only holds the classes of the last column processed.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; on
# feature columns the integer codes impose an arbitrary order on the
# categories — consider one-hot encoding the features instead (to confirm).
labelencoder = LabelEncoder()
for col in df.columns:
    df[col] = labelencoder.fit_transform(df[col])

In [None]:
# First rows after encoding: every column should now hold integers.
df.head(n=5)

In [None]:
# Check the dtypes after encoding.
df.info()

In [None]:
# Summary statistics of the encoded columns.
df.describe()

## 3.2 Rename "class" to "target"

In [None]:
# The label column "class" becomes "target"; all other columns keep their name.
df = df.rename({"class": "target"}, axis="columns")
df

# 4. Modelling

## 4.1 Data preparation

In [None]:
# Features: every column except the label...
X = df.drop("target", axis=1)

# ...and the label column on its own.
y = df["target"]

In [None]:
# Quick look at the feature matrix.
X.head(n=5)

In [None]:
# Quick look at the labels.
y.head(n=5)

In [None]:
# Train / test split: 30% of the rows are held out for evaluation.
# A fixed random_state makes the split, and every score computed from it,
# reproducible after a kernel restart.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Check the shapes produced by the split.

print(f"Nous avons X_train de forme {X_train.shape} et y_train de forme  {y_train.shape}  ")
print(f"Nous avons X_test de forme {X_test.shape} et y_test de forme  {y_test.shape}  ")

## 4.2 Dummy classifier

In [None]:
# Baseline model: its scores are the floor the real models must beat.
# NOTE(review): the name `estimator` is rebound to other models in later
# sections, so the cells below depend on execution order.
estimator = DummyClassifier()
estimator.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline on the training set.
# Built-in round() works whether score() returns a NumPy scalar or a plain
# Python float (which has no .round method).
round(estimator.score(X_train, y_train), 2)

In [None]:
# Accuracy of the baseline on the test set.
# Built-in round() works whether score() returns a NumPy scalar or a plain
# Python float (which has no .round method).
round(estimator.score(X_test, y_test), 2)

In [None]:
# Class predicted by the baseline for every test row; kept in `y_pred` for the
# metric cells that follow.
y_pred = estimator.predict(X_test)
y_pred

In [None]:
# Actual class distribution in the test set.
y_test.value_counts()

In [None]:
# Distribution of the predicted classes, to compare with the actual one.
pd.Series(y_pred).value_counts()

In [None]:
# roc_curve expects a continuous score per sample, not hard class labels:
# use the predicted probability of the positive class (column 1 of
# predict_proba; assumes the binary target is encoded 0/1).
y_score = estimator.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

In [None]:
# ROC curve of the current model (red) against the chance diagonal (dashed).
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Receiver Operating Characteristic')
ax.plot(
    false_positive_rate,
    true_positive_rate,
    color='red',
    label='AUC = %0.2f' % roc_auc,
)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], linestyle='--')
ax.axis('tight')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
# Confusion matrix: rows are the actual classes, columns the predicted ones.
mat = confusion_matrix(y_test, y_pred)
mat

In [None]:
# Wrap the matrix in a DataFrame so it displays as a table.
mat = pd.DataFrame(mat)
mat

In [None]:
# Labelled confusion matrix. It is rebuilt from the predictions so the cell
# can be re-run safely: relabelling the existing `mat` a second time would
# produce names such as "pred_pred_0".
mat = pd.DataFrame(confusion_matrix(y_test, y_pred))
mat.columns = [f"pred_{i}" for i in mat.columns]
mat.index = [f"test_{i}" for i in mat.index]
mat

## 4.2 Logistic regression

### 4.2.1  Without GridSearch

In [None]:
# Train the logistic regression.

lr = LogisticRegression(solver="liblinear")
lr.fit(X_train, y_train)

In [None]:
# Predicted class label for every test row.
y_pred = lr.predict(X_test)
y_pred

In [None]:
# The per-class probabilities are available through predict_proba
# (rounded to two decimals here).

y_prob = lr.predict_proba(X_test).round(2)
y_prob

In [None]:
# Training accuracy. Built-in round() works whether score() returns a NumPy
# scalar or a plain Python float (which has no .round method).
round(lr.score(X_train, y_train), 2)

In [None]:
# Test accuracy. Built-in round() works whether score() returns a NumPy
# scalar or a plain Python float (which has no .round method).
round(lr.score(X_test, y_test), 2)

In [None]:
# roc_curve expects a continuous score per sample. With hard 0/1 predictions
# the "curve" collapses to a single operating point and the AUC is
# under-estimated, so use the probability of the positive class (column 1 of
# predict_proba; assumes the binary target is encoded 0/1).
y_score = lr.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
# Built-in round() also works if auc() returns a plain Python float.
roc_auc = round(auc(false_positive_rate, true_positive_rate), 2)
print(roc_auc)

In [None]:

# ROC curve of the logistic regression (red) against the chance diagonal.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Receiver Operating Characteristic')
auc_label = 'AUC = %0.2f' % roc_auc
ax.plot(false_positive_rate, true_positive_rate, color='red', label=auc_label)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], linestyle='--')
ax.axis('tight')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
# Labelled confusion matrix: rows "test_<class>" are the actual classes,
# columns "pred_<class>" the predicted ones.
mat = (
    pd.DataFrame(confusion_matrix(y_test, y_pred))
    .add_prefix("pred_")
    .rename(index="test_{}".format)
)
mat

### 4.2.2 Using Grid Search

In [None]:
# Model to tune, and the grid to explore: seven values of C spaced
# logarithmically from 1e-3 to 1e3, crossed with both penalty types.
estimator = LogisticRegression(solver="liblinear")
params = dict(
    C=np.logspace(-3, 3, 7),
    penalty=["l1", "l2"],
)

In [None]:
# 5-fold cross-validated search over `params`, using every available core;
# training scores are kept so over-fitting can be inspected afterwards.
grid = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

grid.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
print(grid.best_params_)

In [None]:
# Raw cross-validation results, as returned by the search.
res = grid.cv_results_
res

In [None]:
# Same results as a DataFrame, easier to read and to sort.
res = pd.DataFrame(res)
res

In [None]:
# Keep the aggregated columns only: drop the per-fold "split..." ones.
cols = [name for name in res.columns if "split" not in name]
cols

In [None]:
# Aggregated results, best-ranked parameter combination first.
res = res.loc[:, cols].sort_values(by="rank_test_score")
res

In [None]:
# Refit with the parameters the search actually selected, rather than values
# copied by hand from one particular run (the winner can change with the split).
estimator = LogisticRegression(solver='liblinear', **grid.best_params_)
estimator.fit(X_train, y_train)

In [None]:
# Train / test accuracy of the tuned model. Built-in round() works whether
# score() returns a NumPy scalar or a plain Python float (no .round method).
tr_score = round(estimator.score(X_train, y_train), 2)
te_score = round(estimator.score(X_test, y_test), 2)

print(f"score train : {tr_score} score test : {te_score} ")

In [None]:
# NOTE(review): leftover scratch arithmetic (evaluates to 0.4); the result is
# not bound to any name — candidate for deletion.
2/5

In [None]:
# Score the tuned model itself: `y_pred` still holds the predictions of the
# earlier, untuned `lr` model at this point. roc_curve also expects continuous
# scores, so use the probability of the positive class (column 1 of
# predict_proba; assumes the binary target is encoded 0/1).
y_score = estimator.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
# Built-in round() also works if auc() returns a plain Python float.
roc_auc = round(auc(false_positive_rate, true_positive_rate), 2)
print(roc_auc)

In [None]:
# ROC curve (red) with the chance diagonal (dashed) as reference.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Receiver Operating Characteristic')
ax.plot(
    false_positive_rate,
    true_positive_rate,
    color='red',
    label='AUC = %0.2f' % roc_auc,
)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], linestyle='--')
ax.axis('tight')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
# Predict with the tuned model first: without this, `y_pred` would still hold
# the predictions of the earlier, untuned `lr` model.
y_pred = estimator.predict(X_test)

# Labelled confusion matrix: rows are actual classes, columns predicted ones.
mat = confusion_matrix(y_test, y_pred)
mat = pd.DataFrame(mat)
mat.columns = [f"pred_{i}" for i in mat.columns]
mat.index = [f"test_{i}" for i in mat.index]
mat

## x.x SVM

### x.x.x with Gridsearch

In [None]:
# Linear SVM to tune; C takes seven values spaced logarithmically
# from 1e-3 to 1e3.
estimator = LinearSVC()

params = dict(C=np.logspace(-3, 3, 7))

In [None]:
# 5-fold cross-validated search over `params` on all available cores,
# keeping the training scores for later inspection.
grid = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

grid.fit(X_train, y_train)

In [None]:
# Value of C with the best mean cross-validation score.
print(grid.best_params_)

In [None]:
# Cross-validation results without the per-fold "split..." columns,
# displayed with the best-ranked value of C first.
res = pd.DataFrame(grid.cv_results_)
cols = [name for name in res.columns if "split" not in name]
res = res.loc[:, cols]
res.sort_values(by="rank_test_score")

In [None]:
# Refit with the C the search actually selected, rather than a value copied
# by hand from one particular run (the winner can change with the split).
estimator = LinearSVC(**grid.best_params_)
estimator.fit(X_train, y_train)

In [None]:
# Train / test accuracy of the tuned SVM. Built-in round() works whether
# score() returns a NumPy scalar or a plain Python float (no .round method).
tr_score = round(estimator.score(X_train, y_train), 2)
te_score = round(estimator.score(X_test, y_test), 2)

print(f"score train : {tr_score} score test : {te_score} ")

In [None]:
# Class predicted by the tuned SVM for every test row; kept in `y_pred` for
# the metric cells that follow.
y_pred = estimator.predict(X_test)
y_pred

In [None]:
# roc_curve expects a continuous score per sample, not hard class labels.
# LinearSVC has no predict_proba, so use the signed distance to the
# separating hyperplane given by decision_function.
y_score = estimator.decision_function(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
# Built-in round() also works if auc() returns a plain Python float.
roc_auc = round(auc(false_positive_rate, true_positive_rate), 2)
print(roc_auc)

In [None]:
# ROC curve of the SVM (red) against the chance diagonal (dashed).
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Receiver Operating Characteristic')
auc_label = 'AUC = %0.2f' % roc_auc
ax.plot(false_positive_rate, true_positive_rate, color='red', label=auc_label)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], linestyle='--')
ax.axis('tight')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
# Labelled confusion matrix: rows "test_<class>" are the actual classes,
# columns "pred_<class>" the predicted ones.
mat = (
    pd.DataFrame(confusion_matrix(y_test, y_pred))
    .add_prefix("pred_")
    .rename(index="test_{}".format)
)
mat