## Competition Kaggle SpaceShip Titanic
#### Commencons par les imports 

In [196]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Train set shape:", data.shape)
print("Test set shape:", test.shape)
data.describe()

Train set shape: (8693, 14)
Test set shape: (4277, 13)


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


#### Réalisons rapidement la préparation préliminaire de la data

In [182]:
Y = data.Transported
X = data.drop(['Transported'], axis=1)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, Y, train_size=0.8, test_size=0.2, random_state=0)
missing_values = X.isnull().sum()
print(missing_values)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
dtype: int64


# Roadmap

A la vue de toutes les missing values, il va falloir soit drop des colonnes, et dans tous les cas impute toutes les colonnes restantes. Autre problème, il y a énormément de colonnes catégoricals. Il va donc falloir soit en abandonner, soit les encoder.

Missing Values, je propose d'abandonner les colonnes :
- Name
ET d'impute le reste

Pour les colonnes catégoricals, on va éliminer celles avec trop d'entrées :
- Cabin
et OneHot encode les autres

Enfin on va essayer de faire un travail sur Passenger ID, qui est catégoricals mais qu'on ne peut pas OneHot ENcode puisqu'elle à trop d'entrées. Je propose donc de créer une nouvelle colonne, famille, qui sépare l'ID en deux. En fait j'aimerais que notre modèle trouve un lien sur le fait que plusieurs membres d'une meme famille ou jsp quoi ...


In [183]:
cat_cols = [col for col in X_train_full.columns if X_train_full[col].dtype == "object"]
n_cat = list(map(lambda col: X_train_full[col].nunique(), cat_cols))
d = dict(zip(cat_cols, n_cat))
sorted(d.items(), key=lambda x: x[1])

[('CryoSleep', 2),
 ('VIP', 2),
 ('HomePlanet', 3),
 ('Destination', 3),
 ('Cabin', 5449),
 ('Name', 6787),
 ('PassengerId', 6954)]

In [184]:
X_train_partial = X_train_full.drop(['PassengerId','Name','Cabin'], axis=1)
X_valid_partial = X_valid_full.drop(['PassengerId','Name','Cabin'], axis=1)

X_train_partial.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
4278,Europa,False,55 Cancri e,54.0,False,0.0,559.0,0.0,15238.0,2799.0
5971,Earth,False,TRAPPIST-1e,20.0,False,0.0,20.0,1.0,696.0,0.0
464,Mars,False,TRAPPIST-1e,43.0,False,1821.0,0.0,47.0,29.0,0.0
4475,Earth,False,TRAPPIST-1e,24.0,False,185.0,0.0,476.0,1810.0,53.0
8469,Europa,True,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0


In [185]:
X_train = X_train_partial.copy()
X_train['Sum'] = X_train.iloc[:,-5:].sum(axis=1)
X_train.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], axis=1, inplace=True)
X_train.head(10)


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Sum
4278,Europa,False,55 Cancri e,54.0,False,18596.0
5971,Earth,False,TRAPPIST-1e,20.0,False,717.0
464,Mars,False,TRAPPIST-1e,43.0,False,1897.0
4475,Earth,False,TRAPPIST-1e,24.0,False,2524.0
8469,Europa,True,55 Cancri e,25.0,False,0.0
557,Mars,True,TRAPPIST-1e,48.0,False,0.0
3586,Europa,True,55 Cancri e,34.0,False,0.0
470,Europa,True,55 Cancri e,18.0,False,0.0
4757,Mars,False,TRAPPIST-1e,35.0,False,5916.0
874,Earth,False,TRAPPIST-1e,24.0,False,804.0


In [186]:
cat_cols_partial = [col for col in X_train_partial.columns if X_train_partial[col].dtype == "object"]
num_cols_partial = [col for col in X_train_partial.columns if X_train_partial[col].dtype in ['int64', 'float64']]

In [187]:
cat_cols_ord = ['CryoSleep', 'VIP']
cat_cols_OneHot = ['HomePlanet', 'Destination']

desormais on va passer à l'impute, parce qu'il manque beaucoup de valeurs ...

In [188]:
missing_values2 = X_train_partial.isna().sum()
print(missing_values2)

HomePlanet      156
CryoSleep       170
Destination     149
Age             146
VIP             176
RoomService     151
FoodCourt       148
ShoppingMall    172
Spa             152
VRDeck          146
dtype: int64


In [189]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

cat_transform_ord=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinal',OrdinalEncoder(handle_unknown='error'))
])
cat_transform_onehot=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])
num_imputer = SimpleImputer(strategy='median')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_imputer,num_cols_partial),
        ('ord_cat', cat_transform_ord,['CryoSleep','VIP']),
        ('onehot_cat', cat_transform_onehot,['HomePlanet', 'Destination'])
    ]
)

note pour moi meme : ca m'énerve de faire un pipeline ... il est probable que je ne comprenne pas son interet. Mais j'aime pas balancer ma data dedans et ne pas pouvoir observer les différentes étapes de modification. Je crois que j'aimerais bien pouvoir print à chaque étape pour vérifier que c'est bien que je souhaite faire.

solution : j'ai l'impression qu'a part faire des fit_transform moi meme il n'y a pas vraiment de solution

In [190]:
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

model = XGBRegressor(n_estimators=10000, learning_rate=0.001)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

pipeline.fit(X_train_partial, y_train)

In [191]:

predictions = pipeline.predict(X_valid_partial)
print('MAE:', f'{mean_absolute_error(y_valid, predictions)*100:.3f}','%')
print('accuracy:', f'{accuracy_score(y_valid, predictions.round())*100:.3f}','%')

MAE: 28.550 %
accuracy: 79.183 %


In [192]:
rounded_predictions = predictions.round()
rounded_predictions = np.where(rounded_predictions == 0, False, True)
print('accuracy:', f'{accuracy_score(y_valid, rounded_predictions)*100:.3f}','%')
print(rounded_predictions[:10])
y_valid.head()

accuracy: 79.183 %
[False  True False False  True False False  True  True False]


3601     True
6057     True
2797    False
7110    False
8579     True
Name: Transported, dtype: bool

In [193]:
# on va ajouer une petite chose qu'on avait prévu, 
# sommer les 5 dernières colonnes en 1 seule.
# En fait c'est nul, parce que je dois le faire APRES l'inputing 
# sinon je perds des infos, or je réalise cette etape DANS mon pipeline. 
# Il faudrait donc que je créer une fonction, que je peux inclure DANS 
# le pipeline.

### Test DATA

On va désormais faire la prédiction pour la submit sur Kaggle !

In [194]:
test_partial = test.drop(['PassengerId','Name','Cabin'], axis=1)

final_preds = pipeline.predict(test_partial)
final_preds = final_preds.round().astype(bool)
print(final_preds[:10])


[ True False  True  True  True  True  True  True  True False]


In [195]:
output = pd.DataFrame({
    'PassengerId': submission_data.PassengerId, 
    'Transported': final_preds
    })
output.to_csv('submission.csv', index=False)