In [21]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from scipy.stats import randint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from scipy.stats import randint, uniform

# chargement des données
train_data_path = '/home/charlemagne/workspace/kaggle_challenge_titanic/data/train.csv'
test_data_path = '/home/charlemagne/workspace/kaggle_challenge_titanic/data/test.csv'
submission_file_path = '/home/charlemagne/workspace/kaggle_challenge_titanic/submission/dumb_test_complet.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

def feature_engineering(df):
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Ms', 'Mlle'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = df['FamilySize'].apply(lambda x: 1 if x == 1 else 0)
    df['Embarked'] = df['Embarked'].fillna('S')
    return df

train_data = feature_engineering(train_data)
test_data = feature_engineering(test_data)

X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

num_features = X.select_dtypes(include=['int64', 'float64']).columns.drop('PassengerId')
cat_features = X.select_dtypes(include=['object']).columns

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(eval_metric='logloss'))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Precision:", precision_score(y_test, predictions))
print("Recall:", recall_score(y_test, predictions))
print("F1 Score:", f1_score(y_test, predictions))
print("ROC AUC Score:", roc_auc_score(y_test, predictions))

param_dist = {
    'classifier__n_estimators': randint(100, 1000),
    'classifier__learning_rate': uniform(0.01, 0.2),
    'classifier__max_depth': randint(3, 10),
    'classifier__min_child_weight': randint(1, 10),
    'classifier__subsample': uniform(0.7, 0.3),
    'classifier__colsample_bytree': uniform(0.7, 0.3)
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv=StratifiedKFold(5), scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

print("Meilleurs paramètres:", random_search.best_params_)

if 'PassengerId' in test_data.columns:
    passenger_ids = test_data['PassengerId'].copy()
    final_predictions = random_search.predict(test_data.drop('PassengerId', axis=1))
    submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': final_predictions})
    submission.to_csv(submission_file_path, index=False)
else:
    print("Erreur: La colonne 'PassengerId' est introuvable dans test_data.")


Accuracy: 0.8044692737430168
Precision: 0.76
Recall: 0.7702702702702703
F1 Score: 0.7651006711409396
ROC AUC Score: 0.7994208494208493
Meilleurs paramètres: {'classifier__colsample_bytree': 0.9370526621593617, 'classifier__learning_rate': 0.1311919949562023, 'classifier__max_depth': 4, 'classifier__min_child_weight': 7, 'classifier__n_estimators': 140, 'classifier__subsample': 0.9744879026631341}
