# WORK-FLOW ULTIME

## EDA

In [None]:
import pandas as pd
import warnings
# Silence every warning category (Warning is the common base class) so that
# the notebook output stays uncluttered.
warnings.filterwarnings("ignore", category=Warning)

In [None]:
# Short links to the Titanic training and test CSV files.
train = "https://bit.ly/titanic-train-set"
test = "https://bit.ly/titanic-test-set"

# Load both files, using the PassengerId column as the row index.
df = pd.read_csv(train, index_col="PassengerId")
df_test = pd.read_csv(test, index_col="PassengerId")

# Last expression of the cell: shows the first three training rows.
df.head(n=3)

In [None]:
# Split the training frame into the target vector Y (the "Survived" column)
# and the feature matrix X (every other column).
Y = df["Survived"]
X = df.drop(columns="Survived")

In [None]:
# Sort the dataset's columns into lists, one list per kind of handling.
target = ["Survived"]
drop = ["Ticket"]
passthrough = ["Pclass", "SibSp", "Parch"]
text = ["Name", "Cabin"]
num_manquantes = ["Age", "Fare"]  # "manquantes" = missing (French)
cat_manquantes = ["Embarked"]
cat = ["Sex"]

In [None]:
# Every column group gathered into a single list of lists.
all_cols = [target, drop, passthrough, text, num_manquantes, cat_manquantes, cat]

In [None]:
# Sanity check of the hand-written column lists (misspellings, forgotten
# columns, duplicates).
def check_work(liste_all_listes, colonnes_originales=None):
    """Compare hand-listed column names with the real ones and print a report.

    A warning line is printed every time a column name is met again after its
    first occurrence. Two summary lines follow: the reference columns that
    appear in no list, and the listed names that are not reference columns.

    Args:
        liste_all_listes: Iterable of iterables of column names.
        colonnes_originales: Optional iterable of reference column names.
            When omitted, the columns of the module-level ``df`` are used,
            which is what the function did before this parameter existed.

    Returns:
        None. The report is only printed.
    """
    if colonnes_originales is None:
        # Backward-compatible default: check against the global DataFrame.
        colonnes_originales = df.columns

    set_nos_cols = set()
    for liste in liste_all_listes:
        for col in liste:
            if col in set_nos_cols:
                print(f"Warning : La colonne '{col}' est déja présente !")
            set_nos_cols.add(col)

    set_colonnes_originales = set(colonnes_originales)
    # Listed but unknown to the reference: most likely a spelling mistake.
    mal_ecrites = set_nos_cols - set_colonnes_originales
    # Present in the reference but never listed: a forgotten column.
    col_manquantes = set_colonnes_originales - set_nos_cols
    print(f"Des colonnes sont manquantes :  {col_manquantes}")
    print(f"Des colonnes sont mal écrites : {mal_ecrites}")

In [None]:
# Run the sanity check on the column lists; the report is printed.
check_work(all_cols)

## PREPROCESSING

In [None]:
# Importer des outils pour faire le preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Importer les utilitaires sklearn pour faire ça proprement
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [None]:
# Categorical columns that contain missing values: fill each gap with the
# most frequent category, then one-hot encode into a dense array.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current spelling
# first and fall back to the legacy one on older releases, where the unknown
# keyword raises TypeError at construction time.
try:
    _encodeur_dense = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    _encodeur_dense = OneHotEncoder(sparse=False, handle_unknown="ignore")

cat_manquantes_preprocessing = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    _encodeur_dense,
)

In [None]:
def extraire_la_premiere_lettre(serie):
    """Return a one-column DataFrame with the first character of each entry.

    Args:
        serie: pandas Series of strings (the ``.str`` accessor is used).

    Returns:
        A DataFrame with the same index as ``serie``; entries that have no
        first character (for example missing values) stay missing.
    """
    premieres_lettres = serie.str.get(0)
    return pd.DataFrame(premieres_lettres)

In [None]:
# Cabin preprocessing: keep only the first letter, replace missing entries
# with the "MANQUANTE" placeholder, then one-hot encode the result.
preprocess_cabin = make_pipeline(
    FunctionTransformer(func=extraire_la_premiere_lettre),
    SimpleImputer(fill_value="MANQUANTE", strategy="constant"),
    OneHotEncoder(handle_unknown="ignore"),
)

In [None]:
# Complete preprocessing: one (transformer, columns) pair per column group.
# "Name" and "Cabin" are passed as plain strings rather than lists.
# NOTE(review): presumably so that each transformer receives a 1-D column
# (CountVectorizer works on an iterable of strings) — confirm against the
# ColumnTransformer documentation.
preprocessing = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), cat),
    (cat_manquantes_preprocessing, cat_manquantes),
    (SimpleImputer(strategy="median"), num_manquantes),
    (CountVectorizer(), "Name"),
    (preprocess_cabin, "Cabin"),
    ("passthrough", passthrough),
    ("drop", drop),
)

## PipeLine Complète

In [None]:
# Créer une Pipeline
from sklearn.pipeline import Pipeline

In [None]:
# Les Algorithmes de Machine Learning
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.linear_model import RidgeClassifier 
from sklearn.neighbors import KNeighborsClassifier as KNN 

In [None]:
# KNN model: the preprocessing step followed by a k-nearest-neighbours
# classifier with its default settings.
# NOTE(review): the `preprocessing` object itself (not a copy) is placed in
# the pipeline.
knn_pipeline = Pipeline(
    steps=[
        ("Preprocessing", preprocessing),
        ("Knn", KNN()),
    ]
)

In [None]:
# Ridge model: the preprocessing step followed by a ridge classifier with its
# default settings.
# NOTE(review): the `preprocessing` object itself (not a copy) is placed in
# the pipeline.
ridge_pipeline = Pipeline(
    steps=[
        ("Preprocessing", preprocessing),
        ("Ridge", RidgeClassifier()),
    ]
)

In [None]:
# Random-forest model: the preprocessing step followed by a random-forest
# classifier with its default settings.
# NOTE(review): the `preprocessing` object itself (not a copy) is placed in
# the pipeline.
forest_pipeline = Pipeline(
    steps=[
        ("Preprocessing", preprocessing),
        ("RandomForest", RandomForest()),
    ]
)

In [None]:
# Fit the random-forest pipeline on (X, Y), then predict on the very same X
# (fit() returns the fitted pipeline, so the two calls can be chained).
predictions_forest = forest_pipeline.fit(X, Y).predict(X)

In [None]:
def accuracy(predict, vérité):
    """Return the fraction of predictions that equal the expected values.

    Args:
        predict: Array-like of predicted values; its element-wise comparison
            with ``vérité`` must yield an object exposing ``.mean()``.
        vérité: Array-like of expected values, aligned with ``predict``.

    Returns:
        The mean of the element-wise equality, i.e. a value between 0 and 1.
    """
    bonnes_reponses = predict == vérité
    return bonnes_reponses.mean()
# accuracy() returns a fraction between 0 and 1, so scale it by 100 to make
# the number agree with the "%" unit printed next to it.
# Note: predictions_forest was computed on the same X the forest was fitted
# on, so this is a training-set score.
print(f"Accuracy Random Forest : {100 * accuracy(predictions_forest, Y):.2f} %")

### GRID SEARCH : Recherche de la Meilleure Pipeline

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier as KNN 

#### CrossValidation

In [None]:
# 5-fold cross-validation splitter; rows are shuffled before splitting and the
# fixed random_state keeps the folds identical from one run to the next.
cross_validation_folds = KFold(n_splits=5, shuffle=True, random_state=777)

#### Dictionnaire Hyper-Paramètres Modèle

In [None]:
# Everything related to the KNN grid search lives in one dictionary:
#   "pipeline"        -> preprocessing + KNN classifier
#   "hyperparamètres" -> grid of values to try (keys follow the
#                        "<step name>__<parameter>" convention)
#   "gridsearch"      -> the GridSearchCV object built from the two above
knn = {
    "pipeline": Pipeline(
        steps=[
            ("preprocessing", preprocessing),
            ("knn", KNN()),
        ]
    ),
    "hyperparamètres": {
        "knn__n_neighbors": [1, 3, 5, 7, 9, 13, 17, 21, 27, 29],
        "knn__weights": ["uniform", "distance"],
    },
}

# Grid search over the KNN pipeline, scored with balanced accuracy on the
# folds produced by cross_validation_folds.
knn["gridsearch"] = GridSearchCV(
    estimator=knn["pipeline"],
    param_grid=knn["hyperparamètres"],
    scoring="balanced_accuracy",
    cv=cross_validation_folds,
)

In [None]:
# Run the grid search on (X, Y); the trailing ";" keeps the notebook from
# echoing the returned object.
knn["gridsearch"].fit(X, Y);

In [None]:
# Best hyper-parameter combination found by the grid search.
# NOTE(review): the trailing ";" suppresses the notebook display, so this
# cell shows nothing — confirm that this is intended.
knn["gridsearch"].best_params_;

In [None]:
# Cross-validated score of the best combination (scoring is
# 'balanced_accuracy').
# NOTE(review): the trailing ";" suppresses the notebook display, so this
# cell shows nothing — confirm that this is intended.
knn["gridsearch"].best_score_;

## PipeLine Finale

In [None]:
# Final pipeline: the preprocessing step followed by a KNN classifier with
# fixed hyper-parameters.
# NOTE(review): n_neighbors=3 and weights="distance" are hard-coded,
# presumably copied from a grid-search run — confirm against
# knn["gridsearch"].best_params_.
pipeline_final = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("knn", KNN(weights="distance", n_neighbors=3)),
    ]
)

In [None]:
# Train on all the training data (X, Y); the trailing ";" hides the cell
# output.
pipeline_final.fit(X, Y);

In [None]:
# Final predictions of the pipeline on the test set (df_test).
predict = pipeline_final.predict(df_test)

In [None]:
# Last expression of the cell: displays the predictions.
predict