<a href="https://colab.research.google.com/github/Felizlin94/ml-titanic-analysis/blob/main/%E9%90%B5%E9%81%94%E5%B0%BC%E8%99%9F%E5%AD%98%E6%B4%BB%E9%A0%90%E6%B8%AC%E5%B0%88%E9%A1%8C%E5%AF%A6%E4%BD%9C_4_%E6%A8%A1%E5%9E%8B%E5%84%AA%E5%8C%96%E8%88%87%E6%AF%94%E8%BC%83.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
import xgboost as xgb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from torch.utils.data import DataLoader, TensorDataset

# Load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv')

# Feature Processing
# Deck letter is the first character of the cabin code (NaN when Cabin is missing).
df['Cabin_Category'] = df['Cabin'].str[0]
# Family size counts siblings/spouses + parents/children + the passenger.
df['Family_Size'] = df['SibSp'] + df['Parch'] + 1
df = df.drop(columns=['Name', 'Ticket', 'PassengerId', 'Cabin', 'SibSp', 'Parch'])

# Fill missing values by assigning the result back to the column.
# `df[col].fillna(..., inplace=True)` operates on a temporary Series: it warns
# on pandas 2.x and leaves `df` untouched when Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Cabin_Category'] = df['Cabin_Category'].fillna('Unknown')

# One-hot encode the categorical columns as 0/1 integers.
df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Cabin_Category'], dtype=int)

# Define models
# Candidate scikit-learn classifiers keyed by the display name used in the
# printed report; the same keys index `param_grids`.
# LogisticRegression, SVC and RandomForestClassifier are built with random_state=0.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=5000, random_state=0)),
    ('SVM', SVC(random_state=0)),
    ('Random Forest', RandomForestClassifier(random_state=0)),
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('LDA', LinearDiscriminantAnalysis()),
])

# Training matrices: every column except the target is a feature.
df_train = df
# Keep the DataFrame's own column order. Building the list from a set of
# strings yields an order that can change from one interpreter run to the
# next (string hashing is randomised), which makes the feature layout, and
# therefore the fitted models, non-reproducible.
columns_X = [column for column in df.columns if column != 'Survived']
columns_y = ['Survived']
train_X = df_train[columns_X]
# Kept as a one-column DataFrame; callers flatten it with `.values.ravel()` where needed.
train_y = df_train[columns_y]

# Define parameter grids
# One hyper-parameter grid per entry of `models`, under the same key.
# Each grid maps an estimator argument name to the list of values to try.
param_grids = {}
param_grids['Logistic Regression'] = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear', 'saga'],
)
param_grids['SVM'] = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)
param_grids['Random Forest'] = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 10, 20, 30],
)
param_grids['KNN'] = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
)
# Ten log-spaced smoothing values from 1e-9 up to 1.
param_grids['Naive Bayes'] = dict(var_smoothing=np.logspace(-9, 0, 10))
param_grids['LDA'] = dict(solver=['svd', 'lsqr', 'eigen'])

# Apply GridSearchCV and evaluate
# For every candidate in `models`, run a 5-fold accuracy grid search over the
# matching entry of `param_grids` and record the winning score and parameters.
results = {}
for model_name in models:
    model = models[model_name]
    print(f"\nRunning GridSearchCV for {model_name}")
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='accuracy',
        cv=5,
    )
    # The target is stored as a one-column frame; flatten it to 1-D for fit().
    grid.fit(train_X, train_y.to_numpy().ravel())
    best_params = grid.best_params_
    best_score = grid.best_score_
    best_model = grid.best_estimator_
    results[model_name] = dict(best_score=best_score, best_params=best_params)
    print(f"{model_name} Best Score: {best_score:.4f}")
    print(f"Best Parameters: {best_params}")

# TensorFlow Model
# The reported score is the mean accuracy over the same kind of 5-fold
# stratified split that GridSearchCV uses for the classifiers above, so the
# numbers in the summary table are comparable. (Scoring the network on the
# rows it was trained on would overstate its accuracy.)
def build_tf_model(n_features):
    """Return a compiled one-hidden-layer binary classifier.

    Args:
        n_features: number of input columns.

    Returns:
        A `tf.keras.Sequential` with a 100-unit ReLU layer and a sigmoid
        output, compiled with Adam, binary cross-entropy and accuracy.
    """
    network = tf.keras.Sequential([
        tf.keras.layers.Dense(100, activation='relu', input_shape=(n_features,)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network


# Seed Python, NumPy and TensorFlow so weight init and shuffling repeat across runs.
tf.keras.utils.set_random_seed(0)

tf_X = train_X.to_numpy(dtype='float32')
tf_y = train_y.to_numpy(dtype='float32')

tf_fold_scores = []
for fit_idx, val_idx in StratifiedKFold(n_splits=5).split(tf_X, train_y.values.ravel()):
    fold_model = build_tf_model(tf_X.shape[1])
    fold_model.fit(tf_X[fit_idx], tf_y[fit_idx], epochs=100, batch_size=32, verbose=0)
    # evaluate() returns [loss, accuracy]; keep the held-out accuracy.
    tf_fold_scores.append(fold_model.evaluate(tf_X[val_idx], tf_y[val_idx], verbose=0)[1])
tf_best_score = float(np.mean(tf_fold_scores))

# Final model fitted on every row, kept under the original name for later use.
tf_model = build_tf_model(tf_X.shape[1])
tf_model.fit(tf_X, tf_y, epochs=100, batch_size=32, verbose=0)

results['TensorFlow'] = {'best_score': tf_best_score}
print(f"TensorFlow Best Score: {tf_best_score:.4f}")

# PyTorch Model
class SimpleNN(nn.Module):
    """One-hidden-layer binary classifier that outputs a probability.

    Args:
        input_dim: number of input features. When omitted, the width of the
            notebook-level `train_X` frame is used, so `SimpleNN()` behaves
            as before.
        hidden_dim: number of units in the hidden layer (default 100).
    """

    def __init__(self, input_dim=None, hidden_dim=100):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the input from the training frame.
            input_dim = train_X.shape[1]
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a (batch, input_dim) float tensor to (batch, 1) sigmoid outputs."""
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

# Train and score the PyTorch network.
# The reported score is the mean held-out accuracy over a 5-fold stratified
# split, so it is comparable with the cross-validated scores of the other
# models. (Scoring on the training rows would overstate accuracy.)
torch.manual_seed(0)  # repeatable weight initialisation and batch shuffling

X_tensor = torch.tensor(train_X.values, dtype=torch.float32)
y_tensor = torch.tensor(train_y.values, dtype=torch.float32)
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
criterion = nn.BCELoss()


def train_pytorch_model(model, loader, optimizer, epochs=100):
    """Run `epochs` passes of mini-batch training and leave `model` in eval mode.

    Args:
        model: network to train in place.
        loader: iterable of (features, targets) batches.
        optimizer: optimizer bound to `model`'s parameters.
        epochs: number of full passes over `loader`.

    Returns:
        The same `model`, switched to evaluation mode.
    """
    model.train()
    for _ in range(epochs):
        for batch_X, batch_y in loader:
            optimizer.zero_grad()
            loss = criterion(model(batch_X), batch_y)
            loss.backward()
            optimizer.step()
    model.eval()
    return model


def pytorch_accuracy(model, features, targets):
    """Return the fraction of rows whose 0.5-thresholded output matches `targets`."""
    with torch.no_grad():
        predicted = model(features) > 0.5
    return (predicted == (targets > 0.5)).float().mean().item()


pytorch_fold_scores = []
for fit_idx, val_idx in StratifiedKFold(n_splits=5).split(train_X, train_y.values.ravel()):
    fit_idx = torch.as_tensor(fit_idx)
    val_idx = torch.as_tensor(val_idx)
    fold_model = SimpleNN()
    fold_loader = DataLoader(
        TensorDataset(X_tensor[fit_idx], y_tensor[fit_idx]),
        batch_size=32,
        shuffle=True,
    )
    train_pytorch_model(fold_model, fold_loader, optim.Adam(fold_model.parameters(), lr=0.001))
    pytorch_fold_scores.append(pytorch_accuracy(fold_model, X_tensor[val_idx], y_tensor[val_idx]))
pytorch_best_score = float(np.mean(pytorch_fold_scores))

# Final model fitted on every row, kept under the original names for later use.
pytorch_model = SimpleNN()
optimizer = optim.Adam(pytorch_model.parameters(), lr=0.001)
train_pytorch_model(pytorch_model, dataloader, optimizer)

results['PyTorch'] = {'best_score': pytorch_best_score}
print(f"PyTorch Best Score: {pytorch_best_score:.4f}")

# XGBoost Model
# 5-fold accuracy grid search over tree depth, learning rate and number of
# boosting rounds, using the scikit-learn style XGBClassifier wrapper.
# NOTE: `dtrain` is built here, but the search below fits on `train_X` directly.
dtrain = xgb.DMatrix(train_X, label=train_y)
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
)
xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=5)
# The target is stored as a one-column frame; flatten it to 1-D for fit().
grid_search.fit(train_X, train_y.to_numpy().ravel())
xgb_best_params = grid_search.best_params_
xgb_best_score = grid_search.best_score_
xgb_best_model = grid_search.best_estimator_
results['XGBoost'] = dict(best_score=xgb_best_score, best_params=xgb_best_params)
print(f"XGBoost Best Score: {xgb_best_score:.4f}")
print(f"Best Parameters: {xgb_best_params}")

# Print all results
# Build the table by key rather than by column position: each entry of
# `results` is a dict with 'best_score' and, for the grid-searched models,
# 'best_params'. Selecting and renaming by name keeps the labels correct
# whatever order the keys are inferred in; entries without 'best_params'
# get NaN in that column.
df_results = (
    pd.DataFrame.from_dict(results, orient='index')
    .reindex(columns=['best_score', 'best_params'])
    .rename(columns={'best_score': 'Best Score', 'best_params': 'Best Parameters'})
    .rename_axis('Model')
    .reset_index()
    # Scores arrive as a mix of Python and NumPy floats; store them as a float column.
    .astype({'Best Score': float})
    .sort_values(by='Best Score', ascending=False)
    .reset_index(drop=True)
)
df_results['Rank'] = df_results.index + 1
print("\nResults Summary:")
print(df_results)



Running GridSearchCV for Logistic Regression
Logistic Regression Best Score: 0.7980
Best Parameters: {'C': 10, 'solver': 'liblinear'}

Running GridSearchCV for SVM
