using XGBoost, RandomForest and Logistic Regression

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import warnings

warnings.filterwarnings('ignore')

# Global variables for family mapping
train_family_survival = {}

# Feature Engineering
def feature_engineering(df, is_train=True):
    # Extract Title from Name
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'].replace(['Mlle', 'Ms'], 'Miss', inplace=True)
    df['Title'].replace(['Mme'], 'Mrs', inplace=True)
    df['Title'].replace(
        ['Dr', 'Rev', 'Col', 'Major', 'Capt', 'Sir', 'Don', 'Dona', 'Jonkheer', 'Lady', 'Countess'],
        'Rare', inplace=True
    )
    title_map = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 0}
    df['TitleRank'] = df['Title'].map(title_map)

    # Family features
    df['FamilyName'] = df['Name'].str.split(',').str[0].astype(str)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Ensure FamilyName mapping is robust
    if not is_train:
        df['FamilyName'] = df['FamilyName'].apply(lambda x: x if x in train_family_survival else 'Unknown')
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Age and Fare bands
    df['AgeBand'] = pd.cut(df['Age'], 5, labels=False)
    df['FareBand'] = pd.qcut(df['Fare'], 4, labels=False)

    # Deck extraction
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].fillna('U')

    # Child + First class flag
    df['IsFirstClass'] = (df['Pclass'] == 1).astype(int)
    df['IsChild'] = (df['Age'] < 18).astype(int)

    # Ticket prefix
    df['TicketPrefix'] = df['Ticket'].str.extract(r'([A-Za-z./]+)', expand=False).fillna('None')

    # Family survival rate (for test data, map from training)
    if is_train:
        global train_family_survival
        family_surv = df.groupby('FamilyName')['Survived'].mean()
        train_family_survival = family_surv.to_dict()
        df['FamilySurvivalRate'] = df['FamilyName'].map(train_family_survival)
    else:
        df['FamilySurvivalRate'] = df['FamilyName'].map(train_family_survival)
        # fallback for unseen families
        df['FamilySurvivalRate'].fillna(0.5, inplace=True)

    # Drop unused
    df.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

    # Encode categoricals
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title', 'Deck', 'TicketPrefix'], drop_first=True)

    return df

# Preprocessing
def preprocess(df, is_train=True):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Fare'].fillna(df['Fare'].mean(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    return feature_engineering(df, is_train=is_train)

# Main
def main():
    # Load and preprocess training data
    train_df = pd.read_csv('../data/train.csv')
    train_df = preprocess(train_df, is_train=True)

    y = train_df['Survived']
    X = train_df.drop(columns=['Survived'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Base models
    estimators = [
        ('xgb', XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)),
        ('rf', RandomForestClassifier(random_state=42))
    ]

    # Stacking Classifier
    stacked = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        passthrough=False
    )

    param_grid = {
        'final_estimator__C': [0.1, 1, 10],
        'xgb__max_depth': [3, 5, 7],
        'rf__n_estimators': [100, 200, 300]
    }

    grid = GridSearchCV(stacked, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_

    # Evaluation
    y_pred = best_model.predict(X_test)
    print("Best Parameters:", grid.best_params_)
    print("\nValidation Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Load test data
    test_df = pd.read_csv('../data/test.csv')
    test_df['Survived'] = 0  # Placeholder for compatibility
    test_df = preprocess(test_df, is_train=False)
    test_df = test_df.reindex(columns=X.columns, fill_value=0)

    # Predict on test
    predictions = best_model.predict(test_df)
    submission = pd.DataFrame({'PassengerId': pd.read_csv('../data/test.csv')['PassengerId'], 'Survived': predictions})
    submission.to_csv('../data/submission.csv', index=False)
    print("Submission saved to ../data/submission.csv.")

if __name__ == "__main__":
    main()


ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/data.py", line 407, in pandas_feature_info
    new_feature_types.append(_pandas_dtype_mapper[dtype.name])
KeyError: 'object'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/sklearn/ensemble/_stacking.py", line 658, in fit
    return super().fit(X, y_encoded, sample_weight)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/sklearn/ensemble/_stacking.py", line 206, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 65, in __call__
    return super().__call__(iterable_with_config)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 127, in __call__
    return self.function(*args, **kwargs)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/sklearn/ensemble/_base.py", line 36, in _fit_single_estimator
    estimator.fit(X, y)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/sklearn.py", line 1663, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/sklearn.py", line 628, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/sklearn.py", line 1137, in _create_dmatrix
    return QuantileDMatrix(
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/core.py", line 1614, in __init__
    self._init(
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/core.py", line 1678, in _init
    it.reraise()
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/core.py", line 572, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/core.py", line 553, in _handle_exception
    return fn()
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/core.py", line 640, in <lambda>
    return self._handle_exception(lambda: int(self.next(input_data)), 0)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/data.py", line 1654, in next
    input_data(**self.kwargs)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/core.py", line 620, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/data.py", line 1707, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/data.py", line 640, in _transform_pandas_df
    feature_names, feature_types = pandas_feature_info(
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/data.py", line 409, in pandas_feature_info
    _invalid_dataframe_dtype(data)
  File "/home/ahmed9402/.local/lib/python3.10/site-packages/xgboost/data.py", line 372, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:FamilyName: object


keras

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
import warnings
warnings.filterwarnings('ignore')

# Feature engineering
def feature_engineering(df):
    df['Name'].fillna('', inplace=True)
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'].replace(['Mlle', 'Ms'], 'Miss', inplace=True)
    df['Title'].replace(['Mme'], 'Mrs', inplace=True)
    df['Title'].replace(['Dr', 'Rev', 'Col', 'Major', 'Capt', 'Sir', 'Don', 'Dona', 'Jonkheer', 'Lady', 'Countess'], 'Rare', inplace=True)
    df['Woman_Child'] = ((df['Sex'] == 'female') | (df['Age'] < 12)).astype(int)
    df['Mother'] = ((df['Sex'] == 'female') & (df['Parch'] > 0) & (df['Age'] > 18)).astype(int)
    df['IsRich'] = ((df['Pclass'] == 1) & (df['Fare'] > df['Fare'].median())).astype(int)

    title_map = {
    'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 0
    }
    df['TitleRank'] = df['Title'].map(title_map)

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['IsChild'] = (df['Age'] < 12).astype(int)
    df['HasFamily'] = ((df['SibSp'] + df['Parch']) > 0).astype(int)

    df['Is1stClass'] = (df['Pclass'] == 1).astype(int)
    df['Is3rdClass'] = (df['Pclass'] == 3).astype(int)

    df['Female_1stClass'] = ((df['Sex'] == 'female') & (df['Pclass'] == 1)).astype(int)
    df['Child_3rdClass'] = ((df['Age'] < 12) & (df['Pclass'] == 3)).astype(int)

    df['FareBand'] = pd.qcut(df['Fare'], 4, labels=False)
    df['AgeBand'] = pd.cut(df['Age'], 5, labels=False)

    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].fillna('U')
    df['CabinCount'] = df['Cabin'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)
    df.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title', 'Deck'], drop_first=True)
    return df


# Preprocessing
def preprocess(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Fare'].fillna(df['Fare'].mean(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    return feature_engineering(df)

def main():
    # Load and preprocess training data
    train_df = pd.read_csv('../data/train.csv')
    train_df = preprocess(train_df)

    X_train = pd.get_dummies(train_df.drop(columns=['Survived']))
    y_train = train_df['Survived']

    # Split for validation
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=43)

    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    # Build more accurate Keras model
    model = Sequential([
        Dense(8, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping to avoid overfitting
    early_stop = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
    model.summary()
    model.fit(
        X_train_scaled, y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=600,
        batch_size=32,
        callbacks=[early_stop],
        verbose=1
    )
    # Evaluate the model
    loss, accuracy = model.evaluate(X_val_scaled, y_val)
    y_pred = (model.predict(X_val_scaled) > 0.5).astype(int).reshape(-1)

    print("Classification Report:\n", classification_report(y_val, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
    # Load and preprocess test data
    test_df = pd.read_csv('../data/test.csv')
    test_df = preprocess(test_df)
    test_X = pd.get_dummies(test_df)
    test_X = test_X.reindex(columns=X_train.columns, fill_value=0)
    test_X_scaled = scaler.transform(test_X)

    # Predict
    predictions = (model.predict(test_X_scaled) > 0.5).astype(int).reshape(-1)
    submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
    submission.to_csv('../data/submission.csv', index=False)
    print("Submission saved to ../data/submission.csv")

if __name__ == "__main__":
    main()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 8)                 296       
                                                                 
 batch_normalization_8 (Bat  (None, 8)                 32        
 chNormalization)                                                
                                                                 
 dropout_8 (Dropout)         (None, 8)                 0         
                                                                 
 dense_11 (Dense)            (None, 16)                144       
                                                                 
 batch_normalization_9 (Bat  (None, 16)                64        
 chNormalization)                                                
                                                                 
 dropout_9 (Dropout)         (None, 16)               

using KerasTuner

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
import warnings

warnings.filterwarnings('ignore')

# Advanced Feature Engineering
def feature_engineering(df):
    # Extract Titles from Names
    df['Name'].fillna('', inplace=True)
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'].replace(['Mlle', 'Ms'], 'Miss', inplace=True)
    df['Title'].replace(['Mme'], 'Mrs', inplace=True)
    df['Title'].replace(
        ['Dr', 'Rev', 'Col', 'Major', 'Capt', 'Sir', 'Don', 'Dona', 'Jonkheer', 'Lady', 'Countess'],
        'Rare', inplace=True
    )
    title_map = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 0}
    df['TitleRank'] = df['Title'].map(title_map)

    # Family Size
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Age and Fare Bands
    df['AgeBand'] = pd.cut(df['Age'], 5, labels=False)
    df['FareBand'] = pd.qcut(df['Fare'], 4, labels=False)

    # Deck from Cabin
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'].fillna('U', inplace=True)

    # Ticket Prefix
    df['TicketPrefix'] = df['Ticket'].str.extract(r'([A-Za-z./]+)', expand=False).fillna('None')

    # Drop Unnecessary Columns
    df.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

    # One-Hot Encoding
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title', 'Deck', 'TicketPrefix'], drop_first=True)

    return df

# Preprocessing
def preprocess(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Fare'].fillna(df['Fare'].mean(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    return feature_engineering(df)

# Model Building Function for Keras Tuner
def build_model(hp):
    model = Sequential()
    model.add(Dense(
        units=hp.Int('units_input', min_value=8, max_value=128, step=8),
        activation='relu',
        input_shape=(X_train_scaled.shape[1],)
    ))
    model.add(Dropout(hp.Float('dropout_input', 0.1, 0.5, step=0.1)))
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(
            units=hp.Int(f'units_{i}', min_value=8, max_value=128, step=8),
            activation='relu'
        ))
        model.add(Dropout(hp.Float(f'dropout_{i}', 0.1, 0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        optimizer=Adam(learning_rate=hp.Choice('learning_rate', [0.01, 0.001, 0.0001])),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

def main():
    # Load and preprocess training data
    train_df = pd.read_csv('../data/train.csv')
    train_df = preprocess(train_df)

    X = train_df.drop(columns=['Survived'])
    y = train_df['Survived']

    # Split for validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=43)

    # Standardize features
    scaler = StandardScaler()
    global X_train_scaled
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Hyperparameter Tuning with Keras Tuner
    tuner = kt.Hyperband(
        build_model,
        objective='val_accuracy',
        max_epochs=50,
        factor=3,
        directory='my_dir',
        project_name='titanic_tuning'
    )

    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    tuner.search(
        X_train_scaled, y_train,
        epochs=50,
        validation_data=(X_val_scaled, y_val),
        callbacks=[early_stop],
        verbose=1
    )

    # Retrieve the best model
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    model = tuner.hypermodel.build(best_hps)

    # Train the best model
    history = model.fit(
        X_train_scaled, y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=50,
        batch_size=32,
        callbacks=[early_stop],
        verbose=1
    )

    # Evaluate the model
    loss, accuracy = model.evaluate(X_val_scaled, y_val)
    y_pred = (model.predict(X_val_scaled) > 0.5).astype(int).reshape(-1)

    print("Classification Report:\n", classification_report(y_val, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

    # Load and preprocess test data
    test_df = pd.read_csv('../data/test.csv')
    test_df = preprocess(test_df)
    test_X = test_df.reindex(columns=X_train.columns, fill_value=0)
    test_X_scaled = scaler.transform(test_X)

    # Predict
    predictions = (model.predict(test_X_scaled) > 0.5).astype(int).reshape(-1)
    submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
    submission.to_csv('../data/submission.csv', index=False)
    print("Submission saved to ../data/submission.csv")

if __name__ == "__main__":
    main()


Trial 90 Complete [00h 00m 02s]
val_accuracy: 0.7888888716697693

Best val_accuracy So Far: 0.8222222328186035
Total elapsed time: 00h 02m 24s
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.88      0.84        52
           1       0.81      0.68      0.74        38

    accuracy                           0.80        90
   macro avg       0.80      0.78      0.79        90
weighted avg       0.80      0.80      0.80        90

Confusion Matrix:
 [[46  6]
 [12 26]]
Submission saved to ../data/submission.csv


kaggle score 78.9%

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

# Feature engineering
def feature_engineering(df):
    df['Name'].fillna('', inplace=True)
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'].replace(['Mlle', 'Ms'], 'Miss', inplace=True)
    df['Title'].replace(['Mme'], 'Mrs', inplace=True)
    df['Title'].replace(['Dr', 'Rev', 'Col', 'Major', 'Capt', 'Sir', 'Don', 'Dona', 'Jonkheer', 'Lady', 'Countess'], 'Rare', inplace=True)
    df['Woman_Child'] = ((df['Sex'] == 'female') | (df['Age'] < 12)).astype(int)
    df['Mother'] = ((df['Sex'] == 'female') & (df['Parch'] > 0) & (df['Age'] > 18)).astype(int)
    df['IsRich'] = ((df['Pclass'] == 1) & (df['Fare'] > df['Fare'].median())).astype(int)

    title_map = {
    'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 0
    }
    df['TitleRank'] = df['Title'].map(title_map)

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['IsChild'] = (df['Age'] < 12).astype(int)
    df['HasFamily'] = ((df['SibSp'] + df['Parch']) > 0).astype(int)

    df['Is1stClass'] = (df['Pclass'] == 1).astype(int)
    df['Is3rdClass'] = (df['Pclass'] == 3).astype(int)

    df['Female_1stClass'] = ((df['Sex'] == 'female') & (df['Pclass'] == 1)).astype(int)
    df['Child_1stClass'] = ((df['Age'] < 12) & (df['Pclass'] == 1)).astype(int)

    df['FareBand'] = pd.qcut(df['Fare'], 4, labels=False)
    df['AgeBand'] = pd.cut(df['Age'], 5, labels=False)

    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].fillna('U')
    df['CabinCount'] = df['Cabin'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)
    df.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title', 'Deck'], drop_first=True)
    return df


# Preprocessing
def preprocess(df):
    # number of missing values in each column
    print("Missing values in each column:\n", df.isnull().sum())
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    count = df[df['Age'].isnull()].count()
    print("Count of missing Age values:", count)
    # if Age is missing and Pclass is 1 and Parch == 2, set Age to 12
    count = df.loc[(df['Age'].isnull()) & (df['Parch'] > 0) & (df['Parch']< 3)].count()
    print("Count of missing Age values with Pclass 1 and Parch 2:", count)
    count = df.loc[(df['Age'].isnull()) & (df['Parch'] == 0)].count()
    print("Count of missing Age values with Pclass 3 and Parch 0:", count)
    df.loc[(df['Age'].isnull()) & (df['Parch'] > 0) & (df['Parch']< 3), 'Age'] = 10
    # if Age is missing and Pclass is 3 and Parch == 0, set Age to 20
    df.loc[(df['Age'].isnull()) & (df['Pclass'] == 3) & (df['Parch'] == 0), 'Age'] = 20
    count = df[df['Age'].isnull()].count()
    print("Count of missing Age values:", count)
    df['Age'].fillna(df['Age'].median(), inplace=True)
    return feature_engineering(df)

def main():
    # Load and preprocess training data
    train_df = pd.read_csv('../data/train.csv')
    train_df = preprocess(train_df)
    # plot a graph of pclass and survived == 1
    plt.figure(figsize=(10, 5))
    survived_counts = train_df[train_df['Survived'] == 1]['Pclass'].value_counts()
    survived_counts.plot(kind='bar')
    plt.title('Pclass vs Survived')
    plt.xlabel('Pclass')
    plt.ylabel('Count')
    plt.show()
    # plot a graph of pclass and survived == 1
    plt.figure(figsize=(10, 5))
    
    median = train_df['Age'].median()
    mean = train_df['Age'].mean()
    print("Median Age:", median)
    print("Mean Age:", mean)
    X_train = pd.get_dummies(train_df.drop(columns=['Survived']))
    y_train = train_df['Survived']
    print("Training data shape:", X_train.shape)
    # Split for validation
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=43,)

    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    # Build more accurate Keras model
    model = Sequential([
        Dense(36, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(36, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(18, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(9, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping to avoid overfitting
    early_stop = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
    model.summary()
    model.fit(
        X_train_scaled, y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=600,
        batch_size=32,
        callbacks=[early_stop],
        verbose=1
    )
    # Evaluate the model
    loss, accuracy = model.evaluate(X_val_scaled, y_val)
    y_pred = (model.predict(X_val_scaled) > 0.5).astype(int).reshape(-1)

    print("Classification Report:\n", classification_report(y_val, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
    # Load and preprocess test data
    test_df = pd.read_csv('../data/test.csv')
    test_df = preprocess(test_df)
    test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
    test_X = pd.get_dummies(test_df)
    test_X = test_X.reindex(columns=X_train.columns, fill_value=0)
    test_X_scaled = scaler.transform(test_X)

    # Predict
    predictions = (model.predict(test_X_scaled) > 0.5).astype(int).reshape(-1)
    submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
    submission.to_csv('../data/submission.csv', index=False)
    print("Submission saved to ../data/submission.csv")

if __name__ == "__main__":
    main()
