In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
import warnings
# Install a blanket "ignore" filter so no warnings are displayed.
warnings.filterwarnings("ignore")

# Read the Titanic train and test CSV files from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)

# Show a caption followed by the first rows of the training frame.
print("Training Data Head:", train_df.head(), sep="\n")

Training Data Head:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500

In [78]:
# Structural summary of the training frame (columns, non-null counts, dtypes).
print("--- Initial Data Info ---")
train_df.info()

# Number of missing entries in each column.
null_counts = train_df.isnull().sum()
print("\n--- Missing values count ---", null_counts, sep="\n")

--- Initial Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

--- Missing values count ---
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             

In [79]:
# --- STEP 1: keep the test PassengerId values for the submission file ---
# This must run before test_df is reduced to the feature columns further down,
# because that selection drops 'PassengerId'.
# NOTE: this cell rebinds train_df/test_df to their engineered versions, so it
# cannot be re-run on its own; re-execute the data-loading cell first.
test_passenger_ids = test_df['PassengerId'].copy()

# --- STEP 2: impute missing values ---
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: the chained in-place form does
# not update the parent frame under pandas Copy-on-Write, which would silently
# leave the NaNs in place.
for df in [train_df, test_df]:
    # Median age of each (Pclass, Sex) group, computed within the same frame.
    age_by_group = df.groupby(['Pclass', 'Sex'])['Age'].transform('median')
    # Fall back to the frame-wide median if a whole group has no known age.
    df['Age'] = df['Age'].fillna(age_by_group).fillna(df['Age'].median())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# --- STEP 3: feature engineering (same transformations on both frames) ---
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
for df in [train_df, test_df]:
    # 1. Title: the word that precedes a period in the name. A raw string is
    #    used so that '\.' is not an invalid string escape sequence.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # 2. Family size (siblings/spouses + parents/children + the passenger)
    #    and a flag for passengers travelling alone.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # 3. Cabin deck: first character of the cabin code, 'Missing' when absent.
    df['Cabin_Deck'] = df['Cabin'].str[0].fillna('Missing')

    # 4. Binary encoding of sex.
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# --- STEP 4: one-hot encoding with levels fixed from the training data ---
# Encoding each frame independently with drop_first=True makes the dropped
# baseline and the set of dummy columns depend on which levels happen to occur
# in that frame. Casting both frames to one shared categorical dtype (sorted
# training levels) makes both frames produce the same dummy columns with the
# same dropped baseline. Levels that only occur in the test data become NaN
# and therefore encode as all-zero rows.
DUMMY_COLS = ['Embarked', 'Title', 'Cabin_Deck']
for col in DUMMY_COLS:
    shared_dtype = pd.CategoricalDtype(sorted(train_df[col].dropna().unique()))
    train_df[col] = train_df[col].astype(shared_dtype)
    test_df[col] = test_df[col].astype(shared_dtype)

train_df = pd.get_dummies(train_df, columns=DUMMY_COLS, drop_first=True)
test_df = pd.get_dummies(test_df, columns=DUMMY_COLS, drop_first=True)

# --- STEP 5: select the feature columns ---
EXCLUDE_COLS = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
X_train_cols = [col for col in train_df.columns if col not in EXCLUDE_COLS]

# Align the test columns with the training columns. With the shared
# categorical levels above this is expected to be a no-op; it is kept as a
# defensive fallback.
missing_cols = set(X_train_cols) - set(test_df.columns)
for c in missing_cols:
    test_df[c] = 0
test_df = test_df[X_train_cols]

# Training / test matrices and target used by the modelling cells.
X_train = train_df[X_train_cols]
y_train = train_df['Survived']
X_test = test_df

# Fail fast here rather than inside the estimators if any value is still missing.
assert not X_train.isnull().any().any(), "X_train still contains missing values"
assert not X_test.isnull().any().any(), "X_test still contains missing values"

print("Feature preparation complete with advanced engineering.")
print("Vérification des types de données dans X_train:")
print(X_train.dtypes.value_counts())
print(X_train.head())

Feature preparation complete with advanced engineering.
Vérification des types de données dans X_train:
bool       14
int64       6
float64     2
Name: count, dtype: int64
   Pclass  Sex   Age  SibSp  Parch     Fare  FamilySize  IsAlone  Embarked_Q  \
0       3    0  22.0      1      0   7.2500           2        0       False   
1       1    1  38.0      1      0  71.2833           2        0       False   
2       3    1  26.0      0      0   7.9250           1        1       False   
3       1    1  35.0      1      0  53.1000           2        0       False   
4       3    0  35.0      0      0   8.0500           1        1       False   

   Embarked_S  ...  Title_Mrs  Title_Rare  Cabin_Deck_B  Cabin_Deck_C  \
0        True  ...      False       False         False         False   
1       False  ...       True       False         False          True   
2        True  ...      False       False         False         False   
3        True  ...       True       False         False

In [83]:
# --- Base estimators ---
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    min_samples_split=8,
    random_state=42,
)
gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.02,
    max_depth=3,
    random_state=42,
)
# The SVM sits behind a StandardScaler inside its own pipeline.
svc = make_pipeline(
    StandardScaler(),
    SVC(probability=True, kernel='rbf', C=1.5, gamma='auto', random_state=42),
)

# --- Final estimator ---
lr = LogisticRegression(max_iter=2000, C=0.5, solver='liblinear')

# --- Stacking ensemble combining the three base estimators ---
estimators = [('rf', rf), ('gb', gb), ('svc', svc)]
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=lr,
    passthrough=True,
    cv=5,
)

# Shuffled 5-fold cross-validation of the ensemble on the training matrices
# prepared by the feature-engineering cell.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = cross_val_score(stack, X_train, y_train, cv=kf, scoring='accuracy')
cv_score = fold_accuracies.mean()
print(f"5-fold CV accuracy (with advanced features and tuning): {cv_score:.4f}")

# Fit the ensemble on the full training set, then the standalone gradient
# boosting model so that it is also available in fitted form.
for model in (stack, gb):
    model.fit(X_train, y_train)

print("Model training and evaluation complete.")

5-fold CV accuracy (with advanced features and tuning): 0.8373
Model training and evaluation complete.


In [84]:
# Predict with the stacking model fitted in the training cell and cast the
# labels to int.
test_pred = stack.predict(X_test).astype(int)

# Build the submission frame from the PassengerId values saved before the
# test frame was reduced to feature columns, then write it to CSV.
submission_columns = {
    'PassengerId': test_passenger_ids,
    'Survived': test_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission_stack.csv', index=False)
print("Submission file 'submission_stack.csv' created using StackingClassifier.")

Submission file 'submission_stack.csv' created using StackingClassifier.
