# Bibliothèques

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Charger les données

In [78]:
# Load the datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
df = pd.DataFrame(train_data)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Préparation des données

## Gérer les valeurs problématiques

## Données manquantes

A defaut de pouvoir gérer les valeurs manquantes plus finement, on veut, a minima, supprimer les valeurs manquantes, car la plupart des algorithmes de machine learning ne seront pas calculable avec des valeurs manquantes

In [5]:
df = df.dropna(axis=1, how='any')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
dtypes: float64(1), int64(5), object(3)
memory usage: 62.8+ KB


<strong>Pour cette première itération nous adoptons une stratégie très conservatrice qui nous fait perdre beaucoup de données. Pour les prochaines itérations nous adopterons de nouvelles stratégies pour affiner nos résulats.</strong>

## Données aberrantes

Ici il s'agit de détecter et gérer les potentielles valeurs considérées comme des *outliers* (valeurs extrêmes potentiellement erronnées) qui pourraient impacter négativement la performance de notre modèle. Même s'**il existe certaines méthodes automatiques** de détection des outliers, la plupart du temps, il faut les **détecter visuellement par une exploration des données**

Pour cette première itération **nous supposons que nous n'avons pas d'outliers**

## Gérer le compromis biais-variance

Pour simplifier les choses, nous n'allons garder dans cet exemple que les features numériques. Il faudrait encoder les autres features catégorielles pour pouvoir les traiter.

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
dtypes: float64(1), int64(5), object(3)
memory usage: 62.8+ KB


In [8]:
titanic_features = ['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Fare']
X = df[titanic_features]
y = df.Survived

In [9]:
X.head(3)

Unnamed: 0,PassengerId,Pclass,SibSp,Parch,Fare
0,1,3,1,0,7.25
1,2,1,1,0,71.2833
2,3,3,0,0,7.925


In [10]:
X.shape

(891, 5)

In [11]:
y.head(3)

0    0
1    1
2    1
Name: Survived, dtype: int64

In [12]:
y.shape

(891,)

## [La méthode *hold-out*](https://slides.com/dr_rochet/cours_tour-horizon_fondamentaux_ml/live?context=editing#/31/0/3)

In [13]:
## Validation 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8) ## Possibilité de split le dataset différemment selon un paramètre


In [14]:
X.shape

(891, 5)

In [15]:
X_train.shape

(712, 5)

In [16]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
# instanciation du modèle
titanic_model = DecisionTreeRegressor(random_state=1)

# entraînement (fit)
titanic_model.fit(X_train, y_train)

#### Maintenant que mon modèle est entraîné il peut renvoyer des prédictions:

In [49]:
# prediction 
titanic_model.predict(X_train.head(1))

array([0.])

In [51]:
y.iloc[0]

0

In [None]:
# Pour la première ligne ma prédiction est juste.

In [54]:
titanic_model.score(X_test, y_test)

-0.4464646464646467

In [19]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05


.. ou un score

In [20]:
# score
titanic_model.score(X_train, y_train)

1.0

**ATTENTION !**
Le score obtenu sur le jeu d'apprentissage est forcément optimiste car il est obtenu dans une situation favorisant le sur apprentissage! Il est presque toujours souhaitable d'étudier son score sur le jeu de test, pour **estimer la capacité de votre modèle à généraliser ses performances à de nouvelles données**

In [21]:
# obtenir un score global (par défaut il s'agit de l'accuracy)
titanic_model.score(X_test, y_test)

-0.4464646464646467

In [22]:
# prediction 
titanic_model.predict(X_test.head()).mean()

0.4

In [23]:
# Make predictions
predictions = titanic_model.predict(X_test)

## Submission_Test_Data

In [64]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [65]:
submission_test_data = test_data[['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Fare']]

In [66]:
submission_test_data.head()

Unnamed: 0,PassengerId,Pclass,SibSp,Parch,Fare
0,892,3,0,0,7.8292
1,893,3,1,0,7.0
2,894,2,0,0,9.6875
3,895,3,0,0,8.6625
4,896,3,1,1,12.2875


In [60]:
submission_test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket',
       'Embarked'],
      dtype='object')

## Valeurs manquantes

In [69]:
submission_test_data['Fare'] = submission_test_data['Fare'].fillna(value=submission_test_data['Fare'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_test_data['Fare']= submission_test_data['Fare'].fillna(value=submission_test_data['Fare'].mean())


In [71]:
predictions = titanic_model.predict(submission_test_data)

In [79]:
# Prepare submission dataframe
submission = pd.DataFrame({
    'PassengerId': submission_test_data['PassengerId'],
    'Survived': predictions.astype('int64')
})

In [80]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [81]:
submission.shape

(418, 2)

In [77]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 6.7 KB


In [82]:
# Save the submission to a CSV file
submission.to_csv('submissionV5.csv', index=False)

## [La validation croisée](https://slides.com/dr_rochet/cours_tour-horizon_fondamentaux_ml/live?context=editing#/32)

Il existe [plusieurs méthodes de validation croisée](https://scikit-learn.org/stable/modules/cross_validation.html?highlight=cross_validation#cross-validation-evaluating-estimator-performance), nous allons utiliser la plus simple, le [K-fold](https://scikit-learn.org/stable/modules/cross_validation.html?highlight=cross_validation#k-fold) :

In [28]:
from sklearn.model_selection import cross_val_score, cross_val_predict
cross_val_score(titanic_model, X_train, y_train, cv=10)

array([-1.09454545, -0.28571429, -0.47425249, -0.57389163, -0.12043189,
       -0.48173913, -0.67398374, -0.97222222, -0.38475177, -0.75809524])

In [29]:
cross_val_predict(titanic_model, X_test.head(), y_test.head())

array([1., 1., 1., 0., 1.])

## Optionnel: Automatisation des étapes de traitement

In [30]:
from sklearn.pipeline import Pipeline
my_pipeline = Pipeline(steps=[('model', DecisionTreeRegressor(random_state=1))
                             ])
cross_val_score(my_pipeline, X_train, y_train, cv = 10)

array([-1.09454545, -0.28571429, -0.47425249, -0.57389163, -0.12043189,
       -0.48173913, -0.67398374, -0.97222222, -0.38475177, -0.75809524])

## En pratique : on souhaite faire un grid search avec une validation croisée simultanée !

In [31]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    "max_depth": [2,5,10,15],
    "min_samples_split": [0.001,0.005,0.01,0.1],
    "min_samples_leaf": [0.001,0.005,0.01,0.1]
}
gs = GridSearchCV(titanic_model, param_grid, cv=10, refit=True)

In [32]:
gs.fit(X_train,y_train)

In [33]:
gs.best_params_

{'max_depth': 2, 'min_samples_leaf': 0.1, 'min_samples_split': 0.001}

In [34]:
gs.best_score_

0.08805230018253839

In [35]:
gs.best_estimator_

In [36]:
# instanciation v2 du modèle
titanic_model_v2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=0.001, min_samples_split=0.01, random_state=1)

# entraînement (fit)
titanic_model_v2.fit(X_train, y_train)

In [37]:
# prediction 
titanic_model_v2.predict(X_train.head()).mean()

0.44423076923076915

In [38]:
# Make predictions
predictions = titanic_model_v2.predict(X_test)

In [39]:
# Prepare submission dataframe
submission = pd.DataFrame({
    'PassengerId': X_test['PassengerId'],
    'Survived': predictions
})

In [40]:
submission.head()

Unnamed: 0,PassengerId,Survived
278,279,0.310345
783,784,0.0
635,636,0.285714
440,441,0.764706
630,631,0.459016


In [41]:
submission.shape

(179, 2)

In [42]:
# Transform the 'Survived' column
submission['Survived'] = submission['Survived'].apply(lambda x: 0 if x < 0.24 else 1)

In [43]:
# Save the submission to a CSV file
submission.to_csv('submissionV3.csv', index=False)

# Exemple of submission file

In [44]:
# Replace 'GDPGrowth.csv' with the appropriate path if it's not in the same directory
file_path = 'gender_submission.csv'

# Read the CSV file into a DataFrame
submission_data = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
submission_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [45]:
submission_data.shape

(418, 2)