In [1]:
import pandas as pd

import os

# Dataset and submission locations. The project root defaults to the original
# local checkout and can be overridden with the TITANIC_PROJECT_DIR environment
# variable, so the notebook can run on another machine without editing this cell.
project_dir = os.environ.get(
    'TITANIC_PROJECT_DIR',
    '/home/charlemagne/workspace/kaggle_challenge_titanic',
)
train_data_path = os.path.join(project_dir, 'data', 'train.csv')
test_data_path = os.path.join(project_dir, 'data', 'test.csv')
submission_file_path = os.path.join(project_dir, 'submission', 'dumb_test_complet.csv')

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

# Keep the first rows of each dataset in named variables for display
train_df_head = train_df.head()
test_df_head = test_df.head()

train_df_head, test_df_head

(   PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500

In [8]:
# import the required libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [9]:
# Columns used as model inputs; the "Survived" column is the prediction target
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Training inputs and labels come from train_df; test inputs come from test_df
X_train = train_df.loc[:, features]
y_train = train_df.loc[:, "Survived"]
X_test = test_df.loc[:, features]

In [10]:
# Column groups routed to each preprocessing branch
numeric_columns = ["Age", "SibSp", "Parch", "Fare"]
categorical_columns = ["Pclass", "Sex", "Embarked"]

# Numeric branch: fill missing values using the "median" strategy
numerical_transformer = SimpleImputer(strategy="median")

# Categorical branch: fill missing values using the "most_frequent" strategy,
# then one-hot encode (handle_unknown='ignore' is set on the encoder)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Bundle both branches so each one is applied to its own columns
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

In [11]:
# Model definition: random forest with 100 trees and a fixed random_state
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Chain preprocessing and the model into one pipeline, then fit it on the training data
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
clf = Pipeline(steps=pipeline_steps)
clf.fit(X_train, y_train)

# The pipeline applies its preprocessing step to X_test before predicting
predictions = clf.predict(X_test)

In [12]:
# Build the output table: one row per test passenger with its predicted label
passenger_ids = test_df['PassengerId']
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': predictions,
})

# Show the first rows of the output table
submission_head = submission.head()
submission_head

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [13]:
import os

# Save the 'submission' DataFrame to a CSV file. to_csv does not create
# missing parent directories, so make sure the destination folder exists first.
submission_dir = os.path.dirname(submission_file_path)
if submission_dir:  # empty when the path is a bare filename in the working directory
    os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(submission_file_path, index=False)