In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the prepared training and submission datasets.
# The first CSV column is used as the DataFrame index in both files.
train, test = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in ("final_train.csv", "final_test.csv")
)

In [3]:
# Candidate predictor columns taken from the training frame. Not all of them
# are fed to the model: the final subset is chosen in `selected_features`.
X = train[[
    "PassengerId", "Solo", "FamilySize", "HomePlanet", "CryoSleep",
    "CabinDeck", "CabinNum", "CabinSide", "Destination", "Age",
    "AgeRange", "VIP", "RoomService", "FoodCourt", "ShoppingMall",
    "Spa", "VRDeck", "Expenses", "Spending", "Name",
]]

# Target column to predict.
y = train["Transported"]

# Hold out 20% of the rows for validation; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [4]:
# Build a lookup from every distinct "Name" value (across the candidate
# features X and the submission frame) to an integer, numbered in order of
# first appearance. Only "Name" is encoded here; "PassengerId" is left as is.
names = pd.concat([X, test], axis=0)['Name'].unique()
name_to_int = dict(zip(names, range(len(names))))

# Replace the "Name" column with its integer code in each frame.
# NOTE(review): this overwrites the column in place, so running the cell a
# second time maps the already-encoded integers through the lookup again.
for frame in (X_train, X_test, test):
    frame['Name'] = frame['Name'].map(name_to_int)

In [5]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per categorical column. The individual names are kept so
# the fitted encoders can still be reached by name (e.g. for inverse_transform).
le_homeplanet = LabelEncoder()
le_cabindeck = LabelEncoder()
le_cabinside = LabelEncoder()
le_destination = LabelEncoder()

categorical_encoders = {
    'HomePlanet': le_homeplanet,
    'CabinDeck': le_cabindeck,
    'CabinSide': le_cabinside,
    'Destination': le_destination,
}

for column, encoder in categorical_encoders.items():
    # Fit on the labels of all three frames. An encoder fitted on X_train alone
    # raises ValueError in transform() as soon as X_test or test contains a
    # category the training split never saw. LabelEncoder sorts its classes, so
    # when no such category exists the integer codes are the same as when
    # fitting on X_train only.
    # Values are cast to str first, so missing values are encoded as the
    # literal label 'nan'.
    all_labels = pd.concat(
        [X_train[column], X_test[column], test[column]], axis=0
    ).astype(str)
    encoder.fit(all_labels)

    # Encode the training split, the validation split and the submission data
    # with the same fitted encoder.
    X_train[column] = encoder.transform(X_train[column].astype(str))
    X_test[column] = encoder.transform(X_test[column].astype(str))
    test[column] = encoder.transform(test[column].astype(str))

In [6]:
# Stack the encoded training and validation splits back into a single
# feature frame / target series (training rows first, original index kept).
# This replaces the X and y that were defined before the split.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [7]:
# Columns passed to the model, in the order they are fed to it.
selected_features = [
    "Solo",
    "HomePlanet",
    "CryoSleep",
    "CabinDeck",
    "CabinNum",
    "CabinSide",
    "Destination",
    "Age",
    "VIP",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "Spending",
    "Name",
]

# Columns of X that are not selected: PassengerId, FamilySize, AgeRange, Expenses

In [9]:
import xgboost as xgb

# XGBoost classifier with a fixed hyperparameter set.
# Changes from the previous parameter list:
#   * `silent=True` replaced by `verbosity=0`: `silent` is the obsolete
#     spelling of the logging switch, `verbosity` is its replacement.
#   * `nthread=1` dropped: it duplicated `n_jobs=1`.
# NOTE(review): `reg_alpha=0` and `alpha=0.001` are both passed. `alpha` appears
# to be an alias of `reg_alpha` in XGBoost, so only one of the two values can
# take effect - confirm which one is intended and remove the other.
# NOTE(review): `max_depth=300` is unusually large - confirm it is intended.
optimized_model = xgb.XGBClassifier(
    base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1,
    colsample_bytree=0.8, enable_categorical=False, gamma=5, importance_type=None,
    interaction_constraints='', learning_rate=0.02, max_delta_step=0, max_depth=300,
    min_child_weight=7, n_estimators=803, n_jobs=1, num_parallel_tree=1,
    random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, verbosity=0,
    tree_method='exact', validate_parameters=1, alpha=0.001)

# Train the model on the training split, using only the selected columns.
optimized_model.fit(X_train[selected_features], y_train)

# Predict on the held-out validation split.
y_pred = optimized_model.predict(X_test[selected_features])

# Share of validation rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

# Report the validation accuracy.
print("Accuracy XGBoost :", accuracy)

Accuracy XGBoost : 0.8085106382978723


In [None]:
# Predict on the submission data with the trained model.
model_pred = optimized_model.predict(test[selected_features])

# Pair each passenger id with its prediction.
result_model = pd.DataFrame({'PassengerId': test['PassengerId'], 'Transported': model_pred})

# Write the predictions as text: values are cast to str, and a '0'/'1'
# encoding is spelled out as 'False'/'True'. Any other string is left as is.
result_model['Transported'] = (
    result_model['Transported']
    .astype(str)
    .replace({'0': 'False', '1': 'True'})
)

# Save the two columns to a CSV file, without the DataFrame index.
result_model.to_csv('submissions.csv', index=False, columns=['PassengerId', 'Transported'])