# Phase 1 - Récupérer les données

In [2]:
import pandas as pd

# Short links to the Titanic train / test sets (only the training set is loaded here).
train_url = 'http://bit.ly/titanic-train-set'
test_url  = 'http://bit.ly/titanic-test-set'

# Read the training CSV, using the passenger identifier column as the row index.
df = pd.read_csv(train_url, index_col="PassengerId")

# Name of the label column used for the feature / target split.
colonne_cible = "Survived"

# Target vector first, then the feature frame (every column except the target).
Y = df[colonne_cible]
X = df.drop(columns=[colonne_cible])

# Data Cleaning à faire par nos soins

In [9]:
# Columns that receive a dedicated preprocessing step, grouped by treatment.
colonnes_catégoriques = [
    'Sex',
]
colonnes_numériques = list()
colonnes_catégoriques_avec_valeurs_manquantes = [
    'Embarked',
]
colonnes_numériques_avec_valeurs_manquantes = [
    'Age',
    'Fare',
]
# Columns set aside for feature engineering (CountVectorizer / cabin handling, etc.).
colonnes_feature_engineering = [
    'Name',
    'Cabin',
]

# Everything else.
drop_colonnes = ['Ticket']  # columns to discard
passthrough_colonnes = ['Pclass', 'SibSp', 'Parch']  # columns forwarded unchanged

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer

In [11]:
# Reusable categorical preprocessing: replace missing entries with the
# placeholder category 'manquante', then one-hot encode the result with
# handle_unknown='ignore'.
# The step names are the ones make_pipeline would generate automatically.
fill_missing_then_one_hot_encoder = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='constant', fill_value='manquante')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
])

In [12]:
def extraire_la_première_lettre(serie):
    """Return a one-column DataFrame holding the first character of each entry.

    Parameters
    ----------
    serie : pandas.Series
        Series of strings; accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.DataFrame
        Single-column frame built from the first character of every entry.
        Missing entries remain missing.
    """
    initials = serie.str.get(0)
    return pd.DataFrame(initials)

# Cabin preprocessing: keep only the first character of each value, then apply
# the shared "fill missing, then one-hot encode" pipeline.
# The step names are the ones make_pipeline would generate automatically.
extraire_lettre_cabine = Pipeline(steps=[
    ('functiontransformer', FunctionTransformer(extraire_la_première_lettre)),
    ('pipeline', fill_missing_then_one_hot_encoder),
])

In [13]:
# (transformer, column selection) pairs, listed in the order they are passed on.
# 'Name' and 'Cabin' are selected with a bare string rather than a list, so the
# transformer receives a single 1-D column instead of a 2-D frame.
_data_cleaning_steps = [
    (OneHotEncoder(), colonnes_catégoriques),
    (fill_missing_then_one_hot_encoder, colonnes_catégoriques_avec_valeurs_manquantes),
    (SimpleImputer(strategy='mean'), colonnes_numériques_avec_valeurs_manquantes),
    (CountVectorizer(), 'Name'),
    (extraire_lettre_cabine, 'Cabin'),
    ('drop', drop_colonnes),
    ('passthrough', passthrough_colonnes),
]

# Full preprocessing applied to the raw feature frame.
data_cleaning = make_column_transformer(*_data_cleaning_steps)

# On aura besoin de comparer avec nos autres pipelines

In [14]:
from sklearn.model_selection import KFold

# 5-fold shuffled split; the fixed random_state keeps the fold assignment
# reproducible from one run to the next.
cross_validation_design = KFold(n_splits=5, shuffle=True, random_state=77)

# Display the splitter configuration as the cell output.
cross_validation_design

KFold(n_splits=5, random_state=77, shuffle=True)

# AUTOML 

In [15]:
!pip install tpot

Collecting tpot
[?25l  Downloading https://files.pythonhosted.org/packages/18/19/4e61af9cd13340167c7865bd55b29c2605058acb4c0aca438c45db75aa29/TPOT-0.11.6.post1-py3-none-any.whl (86kB)
[K     |███▉                            | 10kB 18.1MB/s eta 0:00:01[K     |███████▋                        | 20kB 6.3MB/s eta 0:00:01[K     |███████████▍                    | 30kB 7.7MB/s eta 0:00:01[K     |███████████████▏                | 40kB 8.3MB/s eta 0:00:01[K     |███████████████████             | 51kB 6.5MB/s eta 0:00:01[K     |██████████████████████▉         | 61kB 7.2MB/s eta 0:00:01[K     |██████████████████████████▋     | 71kB 7.7MB/s eta 0:00:01[K     |██████████████████████████████▍ | 81kB 8.1MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 4.6MB/s 
[?25hCollecting update-checker>=0.16
  Downloading https://files.pythonhosted.org/packages/0c/ba/8dd7fa5f0b1c6a8ac62f8f57f7e794160c1f86f31c6d0fb00f582372a3e4/update_checker-0.18.0-py3-none-any.whl
Collecting sto

In [17]:
from tpot import TPOTClassifier

# AutoML search: 2 generations of 50 candidate pipelines, scored by accuracy on
# the shared cross-validation splitter. The 'TPOT sparse' configuration is used
# because the cleaned feature matrix is expected to be sparse
# (one-hot + CountVectorizer output) - TODO confirm.
model = TPOTClassifier(
    generations=2,
    population_size=50,
    cv=cross_validation_design,
    scoring='accuracy',
    config_dict='TPOT sparse',
    verbosity=2,
    random_state=777,
    n_jobs=-1,
)



In [19]:
# X has to be cleaned first: fit the preprocessing on X and transform it.
# NOTE(review): data_cleaning is fitted on the whole of X before the model's
# internal cross-validation runs, so the imputation statistics and the Name
# vocabulary are learned from rows that later serve as validation folds.
# The reported CV scores may therefore be slightly optimistic - confirm this
# is acceptable.

X_clean = data_cleaning.fit_transform(X)

# Run the TPOT search on the cleaned features; as the last expression of the
# cell, the fitted model is also displayed.
model.fit(X_clean, Y)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=150.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.8316678174628084

Generation 2 - Current best internal CV score: 0.8350197727700708

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.8500000000000001, min_samples_leaf=4, min_samples_split=10, n_estimators=100)


TPOTClassifier(config_dict='TPOT sparse', crossover_rate=0.1,
               cv=KFold(n_splits=5, random_state=77, shuffle=True),
               disable_update_check=False, early_stop=None, generations=2,
               log_file=None, max_eval_time_mins=5, max_time_mins=None,
               memory=None, mutation_rate=0.9, n_jobs=-1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=50,
               random_state=777, scoring='accuracy', subsample=1.0,
               template=None, use_dask=False, verbosity=2, warm_start=False)

In [20]:
# And there we go: export the pipeline selected by TPOT to a Python file.
chemin_export = 'tpot_titanic_meilleure_pipeline.py'
model.export(chemin_export)

In [21]:

# Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.8500000000000001, min_samples_leaf=4, min_samples_split=10, n_estimators=100)