<a href="https://colab.research.google.com/github/BYRic-F/Data_practice/blob/main/4_3_Fb_ML_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [7]:
df = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')
df.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [13]:
df = pd.get_dummies(df, columns=['Sex'], drop_first=True)
df.head(5)

Unnamed: 0,Survived,Pclass,Name,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Sex_male
0,0,3,Mr. Owen Harris Braund,22.0,1,0,7.25,True
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,38.0,1,0,71.2833,False
2,1,3,Miss. Laina Heikkinen,26.0,0,0,7.925,False
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,35.0,1,0,53.1,False
4,0,3,Mr. William Henry Allen,35.0,0,0,8.05,True


In [18]:
# Tu vas effectuer une classification supervisée sur la colonne "survived", en utilisant le classifieur decision tree. Tu dois faire apparaitre clairement l'accuracy score du jeu de test.
numerical_columns = ['Pclass', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare','Sex_male']

X = df[numerical_columns]
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y , stratify = y)
model = DecisionTreeClassifier()

model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)

print(f"Accuracy score test = {accuracy_score(y_test, y_pred_test):.4f}")



Accuracy score test = 0.7342


In [28]:
# Avec ce même classifieur, tu vas lancer une GridSearch avec les valeurs :
# tous les entiers de 1 à 50 du paramètre max_depth
# tous les entiers entre 1 et 15 du paramètre min_samples_leaf
# (2, 5, 7, 10, 15, 30) du paramètre min_samples_split

hyperparameters = {'max_depth' : range(1,51),
                   'min_samples_leaf' : (1,16),
                   'min_samples_split' : [2, 5, 7, 10, 15, 30]}

grid = GridSearchCV(DecisionTreeClassifier(), scoring = 'accuracy' , param_grid = hyperparameters) #Scoring='accuracy pour maximiser l'accuracy
grid.fit(X_train,y_train)

In [29]:
# avec cette recherche via GridSearch, quelles sont le meilleures valeurs des hyperparamètres si on cherche à maximiser l'accuracy score ?

print(f"Best score : {grid.best_score_}\n"
        f"Best parameters : {grid.best_params_}" )

Best score : 0.8105263157894737
Best parameters : {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [33]:
# Avec ce même classifieur, tu vas lancer un RandomSearch sur les mêmes valeurs des paramètres en limitant à 200 itérations.

random = RandomizedSearchCV(DecisionTreeClassifier(), scoring = 'accuracy' , param_distributions = hyperparameters,
                           n_iter=200)

random.fit(X_train, y_train)

In [34]:
#Avec cette recherche via RandomSearch, quelles sont le meilleures valeurs des hyperparamètres si on cherche à maximiser l'accuracy score ?
print(f"Best score : {random.best_score_}\n"
        f"Best parameters : {random.best_params_}" )

Best score : 0.8105263157894737
Best parameters : {'min_samples_split': 7, 'min_samples_leaf': 1, 'max_depth': 5}


In [None]:
# Les résultats sont exactement identique

In [53]:
# Entrainement supplementaires Pipeline complet avec grid search
df2 = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

X = df2[['Pclass', 'Sex', 'Age', 'Fare']]
y = df2['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

# Prétraitement

numeric_features =['Age', 'Fare']
numeric_transformers = Pipeline(steps = [
                                ('imputer', SimpleImputer(strategy = 'median')),
                                  ('scaler', StandardScaler())
                                  ])

categorical_features = ['Sex','Pclass']
categorical_transformers=Pipeline(steps= [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('encoder', OneHotEncoder(drop = 'first', handle_unknown='ignore'))
])

# ColumnsTransformers

preprocessor = ColumnTransformer(transformers = [
    ('num',numeric_transformers, numeric_features),
    ('cat',categorical_transformers, categorical_features)
])



#Pipeline final


pipeline_final = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])


# Hyperparams
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [1, 5, 10],            #Double tiret obligatoire dans les pipelines
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(pipeline_final, param_grid = param_grid, scoring= 'accuracy')

grid_search.fit(X_train, y_train)

print(f"Meilleurs paramètres : {grid_search.best_params_}")
print(f"Meilleur score : {grid_search.best_score_:.4f}")

Meilleurs paramètres : {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Meilleur score : 0.8406
