## Import libraries

In [54]:
#libraries for all models
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


#libraries for random forest
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

#libraries for neural network
!pip install scikeras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier

#libraries for SVM
!pip install scikit-learn
from sklearn.svm import SVC



## Load data and preprocess it

In [55]:
#
# Load the dataset
train_data = pd.read_csv('train.csv')

# Feature engineering
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger themselves).
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
# IsAlone is 1 when FamilySize is exactly 1, otherwise 0.
train_data['IsAlone'] = np.where(train_data['FamilySize'] > 1, 0, 1)
# Title is the run of letters that follows a space and precedes a literal "." in Name.
# A raw string is required so that "\." reaches the regex engine unchanged instead of
# being parsed as an (invalid) Python string escape.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Preprocessing steps
# Numeric columns: fill gaps with the column median, then standardise.
numeric_features = ['Age', 'Fare', 'FamilySize']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit encode as all zeros
# instead of raising at predict time.
categorical_features = ['Embarked', 'Sex', 'Pclass', 'Title', 'IsAlone']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Only the columns listed above are passed on; ColumnTransformer drops the rest
# by default (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Prepare the data: X keeps the raw feature columns, y is the target.
X = train_data.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
y = train_data['Survived']


# Random forest

## Create model

In [56]:
# Chain the shared preprocessor with a seeded random forest so that both are
# fitted together whenever the pipeline is fitted.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)


## Hyperparameter tuning

In [57]:
# Search space for the random forest; the "classifier__" prefix routes each
# entry to the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_features=['sqrt'],  # previously ['auto', 'sqrt']; only 'sqrt' is kept
    classifier__max_depth=[5, 10],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)

# Exhaustive grid search scored by accuracy with 5-fold cross-validation.
cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)
cv.fit(X, y)

print("Best parameters found: ", cv.best_params_)
print("Best accuracy found: ", cv.best_score_)


Best parameters found:  {'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best accuracy found:  0.8350260498399347


## Cross validate model

In [58]:
# Tuned random-forest pipeline selected by the grid search.
best_model = cv.best_estimator_

# Stratified K-Fold cross-validation (shuffled, seeded for reproducibility).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for fold_no, (train_idx, test_idx) in enumerate(kfold.split(X, y), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Fit an unfitted copy with the best hyperparameters. Refitting best_model
    # itself would overwrite cv.best_estimator_ with a model trained on only
    # this fold's training subset.
    fold_model = clone(best_model)
    fold_model.fit(X_train, y_train)

    # predict() already returns class labels, so no thresholding is needed.
    y_pred = fold_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cv_scores.append(accuracy)

    print(f'Fold {fold_no}')
    print(f'Accuracy: {accuracy}')
    # Print classification report for each fold
    print(f'Fold Classification Report:\n{classification_report(y_test, y_pred)}')
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)

print('Cross-validated accuracy scores:', cv_scores)
print('Mean CV Accuracy:', np.mean(cv_scores))


Fold 1
Accuracy: 0.8435754189944135
Fold Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88       110
           1       0.84      0.74      0.78        69

    accuracy                           0.84       179
   macro avg       0.84      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179

Confusion Matrix:
[[100  10]
 [ 18  51]]
------------------------------
Fold 2
Accuracy: 0.8258426966292135
Fold Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       110
           1       0.79      0.74      0.76        68

    accuracy                           0.83       178
   macro avg       0.82      0.81      0.81       178
weighted avg       0.82      0.83      0.82       178

Confusion Matrix:
[[97 13]
 [18 50]]
------------------------------
Fold 3
Accuracy: 0.8146067415730337
Fold Classification Report:
              

# Neural Network


## Create a neural network model & tune hyperparameters

In [59]:
# Preprocess once up front so the network's input width is known.
# NOTE(review): the preprocessor is fitted on every row here, so imputation,
# scaling and encoding statistics from each validation fold are visible during
# the grid search below. The random-forest and SVM cells avoid this by fitting
# the preprocessor inside a Pipeline per fold; consider doing the same here.
X_nn = preprocessor.fit_transform(X)


def neural_network(learning_rate=0.001, dropout_rate=0.2, optimizer='adam'):
    """Build and compile the binary classifier used by KerasClassifier.

    Architecture: Dense(64, relu) -> Dropout -> Dense(32, relu) -> Dropout ->
    Dense(1, sigmoid), compiled with binary cross-entropy and accuracy.

    Args:
        learning_rate: Accepted for interface compatibility but currently not
            applied; the optimizer is created from its name with its defaults.
        dropout_rate: Drop fraction used by both Dropout layers.
        optimizer: Optimizer identifier passed straight to ``compile``.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential([
        Dense(64, activation='relu', input_dim=X_nn.shape[1]),
        Dropout(dropout_rate),
        Dense(32, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


# Wrap the model so it can be used by scikit-learn.
# `model=` replaces the deprecated `build_fn=` keyword, which makes scikeras
# emit a UserWarning on every fit.
model = KerasClassifier(model=neural_network, verbose=0)

# Define hyperparameter search space
param_grid = {'batch_size': [10, 20, 30], 'epochs': [10, 20], 'optimizer': ['adam', 'rmsprop']}

# Hyperparameter tuning using Grid Search with Cross-Validation.
# scoring='accuracy' is stated explicitly to match the other model searches.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', n_jobs=-1, cv=5)
grid_result = grid.fit(X_nn, y)

print("Best parameters found: ", grid.best_params_)
print("Best accuracy found: ", grid.best_score_)


  X, y = self._initialize(X, y)


Best parameters found:  {'batch_size': 20, 'epochs': 20, 'optimizer': 'rmsprop'}
Best accuracy found:  0.8372795179210344


## Cross validate

In [60]:
# Tuned network wrapper selected by the grid search.
best_model = grid_result.best_estimator_

# Cross-validation (stratified, shuffled, seeded split).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for fold_no, (train_idx, test_idx) in enumerate(kfold.split(X_nn, y), start=1):
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train an unfitted copy with the best hyperparameters. Refitting best_model
    # itself would overwrite grid_result.best_estimator_ with a network trained
    # on only this fold's training subset.
    fold_model = clone(best_model)
    fold_model.fit(X_nn[train_idx], y_train)

    # Predict and evaluate on the held-out fold. predict() returns class labels,
    # so the same y_pred is used for every metric without thresholding.
    y_pred = fold_model.predict(X_nn[test_idx])
    accuracy = accuracy_score(y_test, y_pred)
    cv_scores.append(accuracy)
    print(f'Fold {fold_no}')
    print(f'Accuracy: {accuracy}')

    # Print classification report for each fold
    print(f'Fold Classification Report:\n{classification_report(y_test, y_pred)}')
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)

print('Cross-validated accuracy scores:', cv_scores)
print('Mean CV Accuracy:', np.mean(cv_scores))



  X, y = self._initialize(X, y)


Fold 1
Accuracy: 0.8268156424581006
Fold Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       110
           1       0.82      0.71      0.76        69

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.83      0.83      0.82       179

Confusion Matrix:
[[99 11]
 [20 49]]
------------------------------


  X, y = self._initialize(X, y)


Fold 2
Accuracy: 0.8370786516853933
Fold Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       110
           1       0.84      0.71      0.77        68

    accuracy                           0.84       178
   macro avg       0.84      0.81      0.82       178
weighted avg       0.84      0.84      0.83       178

Confusion Matrix:
[[101   9]
 [ 20  48]]
------------------------------


  X, y = self._initialize(X, y)


Fold 3
Accuracy: 0.8258426966292135
Fold Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       110
           1       0.84      0.68      0.75        68

    accuracy                           0.83       178
   macro avg       0.83      0.80      0.81       178
weighted avg       0.83      0.83      0.82       178

Confusion Matrix:
[[101   9]
 [ 22  46]]
------------------------------


  X, y = self._initialize(X, y)


Fold 4
Accuracy: 0.848314606741573
Fold Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.94      0.88       110
           1       0.87      0.71      0.78        68

    accuracy                           0.85       178
   macro avg       0.86      0.82      0.83       178
weighted avg       0.85      0.85      0.84       178

Confusion Matrix:
[[103   7]
 [ 20  48]]
------------------------------


  X, y = self._initialize(X, y)


Fold 5
Accuracy: 0.8539325842696629
Fold Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.94      0.89       109
           1       0.88      0.72      0.79        69

    accuracy                           0.85       178
   macro avg       0.86      0.83      0.84       178
weighted avg       0.86      0.85      0.85       178

Confusion Matrix:
[[102   7]
 [ 19  50]]
------------------------------
Cross-validated accuracy scores: [0.8268156424581006, 0.8370786516853933, 0.8258426966292135, 0.848314606741573, 0.8539325842696629]
Mean CV Accuracy: 0.8383968363567889


# Support Vector Machine

## Create SVM model and tune hyperparameters

In [61]:
# Create a preprocessing and modelling pipeline
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC()),
    ]
)

# Search space for the SVC; the "classifier__" prefix routes each entry to the
# pipeline step named 'classifier'.
param_grid = dict(
    classifier__C=[0.1, 1, 10],
    classifier__gamma=[0.001, 0.01, 0.1, 1],
    classifier__kernel=['rbf', 'poly', 'sigmoid'],
)

# Exhaustive grid search scored by accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X, y)

# Report the winning configuration and its mean cross-validated score.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))


Best parameters found:  {'classifier__C': 10, 'classifier__gamma': 0.01, 'classifier__kernel': 'rbf'}
Best cross-validation score: 0.84


## Cross validate SVM

In [62]:
# Stratified K-Fold cross-validation for detailed evaluation
# Tuned SVM pipeline selected by the grid search.
best_model = grid_search.best_estimator_

# Stratified K-Fold cross-validation for detailed evaluation (shuffled, seeded).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for fold_no, (train, test) in enumerate(kfold.split(X, y), start=1):
    y_test = y.iloc[test]

    # Fit an unfitted copy with the best hyperparameters. Refitting best_model
    # itself would overwrite grid_search.best_estimator_ with a model trained
    # on only this fold's training subset.
    fold_model = clone(best_model)
    fold_model.fit(X.iloc[train], y.iloc[train])

    # predict() returns class labels, so the same y_pred feeds every metric.
    y_pred = fold_model.predict(X.iloc[test])

    print(f'Fold {fold_no}')
    accuracy = accuracy_score(y_test, y_pred)
    cv_scores.append(accuracy)
    print(f'Accuracy: {accuracy}')

    # Print classification report for each fold
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))

    print('-' * 30)

print('Cross-validated accuracy scores:', cv_scores)
print('Mean CV Accuracy:', np.mean(cv_scores))

Fold 1
Accuracy: 0.8435754189944135
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.88       110
           1       0.82      0.77      0.79        69

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179

Confusion Matrix:
[[98 12]
 [16 53]]
------------------------------
Fold 2
Accuracy: 0.8258426966292135
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       110
           1       0.79      0.74      0.76        68

    accuracy                           0.83       178
   macro avg       0.82      0.81      0.81       178
weighted avg       0.82      0.83      0.82       178

Confusion Matrix:
[[97 13]
 [18 50]]
------------------------------
Fold 3
Accuracy: 0.8314606741573034
Classification Report:
              precision    recall