In [195]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from random import randrange

In [196]:
from sklearn import preprocessing as prep

In [197]:
from sklearn import svm

In [198]:
# Support-vector classifier created with scikit-learn's default hyperparameters.
model = svm.SVC()

In [211]:
def clean(df : pd.DataFrame, encoder: prep.OneHotEncoder):
    """
    Une fonction pour préparer les données au modèle de prédiction.
    Parameters:
    df : le dataframe
    encoder : le modèle qui encodera le dataframe en OneHotVector
    """
    without_NaN = df.dropna()

    #je récupère les cabines
    tmp = list(set(without_NaN['Cabin'].to_list()))

    #j'attribue aléatoirement des cabines 
    df["Cabin"] = df["Cabin"].apply(lambda cabin : (tmp[randrange(len(tmp))] if pd.isna(cabin) else cabin))

    #j'ai jugé que la column embarked n'avait pas d'importance, pareil pour id et le nom
    df = df.drop("Embarked", axis=1)
    df = df.drop("Name", axis=1)
    df = df.drop("Ticket", axis=1)
    df = df.drop("PassengerId", axis=1)

    #interpolate les autres valeurs NaN
    df = df.interpolate(method='pad')

    #je normalize les valeurs numeriques
    df["Age"] = df["Age"]/df["Age"].max()
    df["Fare"] = df["Fare"]/df["Fare"].max()

    #j'encode en OneHotVector
    df_num = df.select_dtypes(exclude='object').drop('Pclass', axis=1)
    df_cat = df.select_dtypes(include='object').join(df['Pclass'])

    print(df_cat.columns, Xcat.columns)

    cat_columns = [f"{col}_{cat}" for i, col in enumerate(df_cat.columns) for cat in encoder.categories_[i]]

    onehot_features = pd.DataFrame(encoder.transform(df_cat), columns=cat_columns)

    df = df_num.join(onehot_features)

    return df


In [212]:
# Load the training set and separate the target column from the features.
train = pd.read_csv("train.csv")
Y = train['Survived']
train = train.drop(columns='Survived')

# Fit the one-hot encoder on the categorical columns: the object-dtype columns
# without Embarked/Name/Ticket, followed by Pclass.
onehot = prep.OneHotEncoder(sparse_output=False, handle_unknown='ignore')
Xnum = train.select_dtypes(exclude='object').drop(columns='Pclass')
Xcat = (
    train.select_dtypes(include='object')
    .drop(columns=['Embarked', 'Name', 'Ticket'])
    .join(train['Pclass'])
)
onehot.fit(Xcat)

# Turn the raw features into the model-ready matrix.
X = clean(train, onehot)

# Train the classifier.
model.fit(X, Y)

Index(['Sex', 'Cabin', 'Pclass'], dtype='object') Index(['Sex', 'Cabin', 'Pclass'], dtype='object')


  df = df.interpolate(method='pad')


In [213]:
# Load the test set and keep its passenger ids before they are dropped by the cleaning step.
test = pd.read_csv("test.csv")
test_id = test["PassengerId"].tolist()

In [206]:
# Replace the raw test frame with its model-ready features.
# NOTE(review): this rebinds `test`, so the cell cannot be re-run on its own —
# clean() reads raw columns such as "Cabin" and "PassengerId" that the cleaned
# frame no longer has. Re-run the cell that loads test.csv first.
test = clean(test, onehot)

Index(['Sex', 'Cabin', 'Pclass'], dtype='object') Index(['Sex', 'Cabin', 'Pclass'], dtype='object')


  df = df.interpolate(method='pad')


In [208]:
# Predict a label for every row of the cleaned test frame and display the predictions.
test_Y = model.predict(test)
test_Y

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [209]:
# Pair each passenger id with its predicted label. Building the frame from a
# dict keeps every column's own dtype instead of coercing both through one
# shared ndarray.
result = pd.DataFrame({"PassengerId": test_id, "Survived": test_Y})
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [210]:
# Write the predictions to predictions.csv without the index column.
result.to_csv("predictions.csv", index=False)