In [None]:
# Mount Google Drive at /content/drive; the CSV paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed
from xgboost import XGBClassifier


In [None]:

# Load the training data from the mounted Drive folder.
# Adjust the path to wherever train.csv is saved in your Google Drive.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/train.csv')

In [None]:
# Quick exploratory pass over the raw training frame; every statement only prints.

# Number of rows and columns
print("Shape of the DataFrame:", df.shape)

# Column dtypes, to spot columns that will need encoding or conversion
print("\nData Types:\n", df.dtypes)

# Summary statistics of the numeric columns
print("\nDescriptive Statistics:\n", df.describe())

# Class balance of the target variable ('Survived')
print("\nTarget Variable Distribution:\n", df['Survived'].value_counts())

# Missing-value count per column
print("\nMissing Values:\n", df.isnull().sum())

# Cross-tabulate two features against the target
print("\nRelationship between Pclass and Survived:\n", pd.crosstab(df['Pclass'], df['Survived']))
print("\nRelationship between Sex and Survived:\n", pd.crosstab(df['Sex'], df['Survived']))

# Value counts of the low-cardinality columns
print("\nUnique values in 'Embarked':\n", df['Embarked'].value_counts())
print("\nUnique values in 'Sex':\n", df['Sex'].value_counts())
print("\nUnique values in 'Pclass':\n", df['Pclass'].value_counts())

Shape of the DataFrame: (891, 12)

Data Types:
 PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Descriptive Statistics:
        PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

     

In [None]:
# Preview the first 100 rows of the raw training data.
df.head(100)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.0500,,S
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
97,98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,D10 D12,C
98,99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34.0,0,1,231919,23.0000,,S


In [None]:

# Work on a copy so the raw frame `df` stays untouched.
df_cleaned = df.copy()

# Age: fill each gap with the median age of that passenger's class.
df_cleaned['Age'] = (
    df_cleaned
    .groupby('Pclass')['Age']
    .transform(lambda ages: ages.fillna(ages.median()))
)

# Embarked: fill gaps with the most frequent port of embarkation.
most_frequent_embarked = df_cleaned['Embarked'].mode()[0]
df_cleaned['Embarked'] = df_cleaned['Embarked'].fillna(most_frequent_embarked)

# Cabin: keep only a 0/1 "cabin recorded" flag; pop() removes the raw column.
df_cleaned['Has_Cabin'] = df_cleaned.pop('Cabin').notna().astype(int)

# Sex: encode as 0 (male) / 1 (female).
df_cleaned['Sex'] = df_cleaned['Sex'].map({'male': 0, 'female': 1})

# Embarked: replace the text column with one indicator column per port,
# then drop any fully duplicated rows.
embarked_dummies = pd.get_dummies(df_cleaned['Embarked'], prefix='Embarked')
df_cleaned = (
    df_cleaned
    .drop(columns='Embarked')
    .join(embarked_dummies)
    .drop_duplicates()
)

# Show the first rows of the cleaned frame.
display(df_cleaned.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Has_Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,0,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,1,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,0,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,1,False,False,True
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,0,False,False,True


In [None]:


# Feature Engineering
def process_data(df):
    """Add engineered features to a Titanic passenger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SibSp``, ``Parch``, ``Name``, ``Fare``
        and ``Age``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns ``FamilySize``, ``IsAlone``,
        ``Title``, ``FarePerPerson`` and ``AgeGroup`` added. The input
        frame is not modified.
    """
    # Work on a copy so re-running the calling cell never stacks changes onto
    # a frame that was already processed or displayed earlier.
    df = df.copy()

    # Family size: siblings/spouses + parents/children + the passenger.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # IsAlone: 1 unless the passenger travels with at least one relative.
    df['IsAlone'] = 1
    df.loc[df['FamilySize'] > 1, 'IsAlone'] = 0

    # Title: the word directly before the first '.' in the name ("Mr", "Miss", ...).
    # Raw string so that '\.' is a regex escape, not an invalid string escape.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    # Fold spelling variants into the common titles.
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # Fare per person: ticket fare divided by family size.
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Age group: right-closed bins (0, 12], (12, 20], (20, 40], (40, 120].
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 120], labels=['Child', 'Teenager', 'YoungAdult', 'Adult'])

    return df

# Add the engineered columns to the cleaned frame.
df_cleaned = process_data(df_cleaned)

# Target (y) is the survival flag; features (X) are every remaining column
# except the identifier-like ones (passenger id, name, ticket).
y = df_cleaned['Survived']
X = df_cleaned.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket'])

# Hold out 10% of the rows as a test set, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1234,
    stratify=y,
)

In [None]:

# Column groups: numeric columns are standardised, categorical ones are one-hot encoded.
# NOTE(review): 'IsAlone' appears in neither list, so it never reaches the models — confirm this is intended.
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize', 'FarePerPerson']
categorical_features = ['Pclass', 'Sex', 'Has_Cabin', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title', 'AgeGroup']

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train[numerical_features])
X_train_numerical = scaler.transform(X_train[numerical_features])
X_test_numerical = scaler.transform(X_test[numerical_features])

# Same pattern for the encoder; categories unseen during fitting encode as all zeros.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(X_train[categorical_features])
X_train_categorical = encoder.transform(X_train[categorical_features])
X_test_categorical = encoder.transform(X_test[categorical_features])

# Final design matrices: scaled numeric columns followed by the one-hot columns.
X_train_prepared = np.hstack((X_train_numerical, X_train_categorical))
X_test_prepared = np.hstack((X_test_numerical, X_test_categorical))

In [None]:

# Instantiate the baseline models.
# The tree ensembles are stochastic (bootstrap / feature sampling), so they are
# seeded to keep the comparison table reproducible across kernel restarts.
# 1234 is the same seed used for the train/test split.
logreg_model = LogisticRegression()
rf_model = RandomForestClassifier(random_state=1234)
xgb_model = XGBClassifier(random_state=1234)

# Train every model on the same prepared training matrix.
logreg_model.fit(X_train_prepared, y_train)
rf_model.fit(X_train_prepared, y_train)
xgb_model.fit(X_train_prepared, y_train)

In [None]:


# Predict on the test set
# One prediction array per fitted baseline model, all on the same held-out matrix.
y_pred_logreg = logreg_model.predict(X_test_prepared)
y_pred_rf = rf_model.predict(X_test_prepared)
y_pred_xgb = xgb_model.predict(X_test_prepared)

# Calculate evaluation metrics
def evaluate_model(y_true, y_pred, y_score=None):
    """Compute accuracy, precision, recall, F1 and ROC AUC for a binary classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When given, AUC is computed from
        these scores. When omitted, AUC falls back to ``y_pred`` so that
        existing two-argument calls keep returning the same numbers.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, auc)``.

    Notes
    -----
    AUC computed from hard labels uses a single operating point, so it is
    not a ranking-quality measure; pass ``y_score`` to get one.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Prefer continuous scores for the ROC curve when the caller supplies them.
    auc = roc_auc_score(y_true, y_pred if y_score is None else y_score)
    return accuracy, precision, recall, f1, auc

# Score each baseline's predictions against the held-out labels.
logreg_metrics, rf_metrics, xgb_metrics = (
    evaluate_model(y_test, labels)
    for labels in (y_pred_logreg, y_pred_rf, y_pred_xgb)
)

# Summary table: one row per model, one column per metric.
model_names = ['Logistic Regression', 'Random Forest', 'XGBoost']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'AUC']
results = pd.DataFrame.from_dict(
    dict(zip(model_names, (logreg_metrics, rf_metrics, xgb_metrics))),
    orient='index',
    columns=metrics,
)

display(results)

# Plain-text copy of the same table.
print("Model Performance Analysis:")
print(results)

# The row label with the highest F1-score is reported as the best model.
best_model = results['F1-score'].idxmax()

print(f"\nBest Model (based on F1-score): {best_model}")

Unnamed: 0,Accuracy,Precision,Recall,F1-score,AUC
Logistic Regression,0.855556,0.823529,0.8,0.811594,0.845455
Random Forest,0.811111,0.764706,0.742857,0.753623,0.798701
XGBoost,0.8,0.742857,0.742857,0.742857,0.78961


Model Performance Analysis:
                     Accuracy  Precision    Recall  F1-score       AUC
Logistic Regression  0.855556   0.823529  0.800000  0.811594  0.845455
Random Forest        0.811111   0.764706  0.742857  0.753623  0.798701
XGBoost              0.800000   0.742857  0.742857  0.742857  0.789610

Best Model (based on F1-score): Logistic Regression


In [None]:

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']  # 'newton-cg' is not compatible with 'l1' penalty
}

# 5-fold grid search optimising F1. Both solvers shuffle the data internally,
# so the estimator is seeded to make the search repeatable.
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=1234),
    param_grid,
    cv=5,
    scoring='f1',
)

# Fit the grid search to the data
grid_search.fit(X_train_prepared, y_train)

# Get the best hyperparameters and best estimator
best_params = grid_search.best_params_
best_logreg_model = grid_search.best_estimator_

print(f"Best hyperparameters: {best_params}")

# GridSearchCV refits best_estimator_ on the full training set by default (refit=True),
# so it already is the model trained with best_params. Reuse it instead of fitting an
# identical second model.
optimized_logreg_model = best_logreg_model



Best hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


In [None]:


# Predict on the test set using the optimized model
y_pred_optimized_logreg = optimized_logreg_model.predict(X_test_prepared)

# Calculate evaluation metrics for the optimized model
optimized_logreg_metrics = evaluate_model(y_test, y_pred_optimized_logreg)

# Add the optimized model's results to the DataFrame
# (assigning to a new row label appends a row; re-running the cell overwrites that same row)
results.loc['Optimized Logistic Regression'] = optimized_logreg_metrics

# Display the updated results DataFrame
display(results)

# Identify the best model based on F1-score
# (idxmax returns the first row label when several rows share the maximum)
best_model = results['F1-score'].idxmax()
print(f"\nBest Model (based on F1-score): {best_model}")

Unnamed: 0,Accuracy,Precision,Recall,F1-score,AUC
Logistic Regression,0.855556,0.823529,0.8,0.811594,0.845455
Random Forest,0.811111,0.764706,0.742857,0.753623,0.798701
XGBoost,0.8,0.742857,0.742857,0.742857,0.78961
Optimized Logistic Regression,0.855556,0.823529,0.8,0.811594,0.845455



Best Model (based on F1-score): Logistic Regression


In [None]:

# Seed Python, NumPy and the Keras backend so the network's weight initialisation and
# batch shuffling are repeatable. Note that this also reseeds NumPy's global generator.
set_random_seed(1234)

# Instantiate the SVM model.
# probability=True enables predict_proba; its internal calibration is randomised,
# hence the explicit random_state.
svm_model = SVC(probability=True, random_state=1234)

# Build a simple neural network model: two hidden ReLU layers and a sigmoid output.
# The input width is declared with an Input layer, because passing `input_shape`
# to the first Dense layer of a Sequential model triggers a Keras warning.
nn_model = Sequential([
    Input(shape=(X_train_prepared.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the neural network
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the models
svm_model.fit(X_train_prepared, y_train)
nn_model.fit(X_train_prepared, y_train, epochs=50, batch_size=32, verbose=0) # Train NN

print("SVM and Neural Network models trained.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


SVM and Neural Network models trained.


In [None]:
# Predict on the test set using the new models
y_pred_svm = svm_model.predict(X_test_prepared)
# The network ends in a sigmoid unit, so its output is thresholded at 0.5 to get 0/1 labels.
y_pred_nn_prob = nn_model.predict(X_test_prepared)
y_pred_nn = (y_pred_nn_prob > 0.5).astype("int32")

# Calculate evaluation metrics for the new models
svm_metrics = evaluate_model(y_test, y_pred_svm)
nn_metrics = evaluate_model(y_test, y_pred_nn)

# Add the new models' results to the DataFrame
results.loc['Support Vector Machine'] = svm_metrics
results.loc['Neural Network'] = nn_metrics

# Display the updated results DataFrame
display(results)

# Analyze and identify the best model
print("Model Performance Analysis (Including SVM and Neural Network):")
print(results)

# idxmax returns the first row label when several rows share the maximum F1-score.
best_model = results['F1-score'].idxmax()

print(f"\nBest Model (based on F1-score): {best_model}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step


Unnamed: 0,Accuracy,Precision,Recall,F1-score,AUC
Logistic Regression,0.855556,0.823529,0.8,0.811594,0.845455
Random Forest,0.811111,0.764706,0.742857,0.753623,0.798701
XGBoost,0.8,0.742857,0.742857,0.742857,0.78961
Optimized Logistic Regression,0.855556,0.823529,0.8,0.811594,0.845455
Support Vector Machine,0.855556,0.84375,0.771429,0.80597,0.84026
Neural Network,0.811111,0.764706,0.742857,0.753623,0.798701


Model Performance Analysis (Including SVM and Neural Network):
                               Accuracy  Precision    Recall  F1-score  \
Logistic Regression            0.855556   0.823529  0.800000  0.811594   
Random Forest                  0.811111   0.764706  0.742857  0.753623   
XGBoost                        0.800000   0.742857  0.742857  0.742857   
Optimized Logistic Regression  0.855556   0.823529  0.800000  0.811594   
Support Vector Machine         0.855556   0.843750  0.771429  0.805970   
Neural Network                 0.811111   0.764706  0.742857  0.753623   

                                    AUC  
Logistic Regression            0.845455  
Random Forest                  0.798701  
XGBoost                        0.789610  
Optimized Logistic Regression  0.845455  
Support Vector Machine         0.840260  
Neural Network                 0.798701  

Best Model (based on F1-score): Logistic Regression


In [None]:
# Tune the SVM to see whether its metrics can be improved.

# Parameter grid for SVM, split per kernel so each kernel is only crossed with the
# hyperparameters it actually uses: 'gamma' is ignored by the linear kernel and 'degree'
# by every kernel except 'poly'. A single flat grid would refit many identical models.
_svm_c_values = [0.1, 1, 10]
_svm_gamma_values = ['scale', 'auto', 0.1, 1]
param_grid_svm = [
    {'kernel': ['linear'], 'C': _svm_c_values},
    {'kernel': ['rbf', 'sigmoid'], 'C': _svm_c_values, 'gamma': _svm_gamma_values},
    {'kernel': ['poly'], 'C': _svm_c_values, 'gamma': _svm_gamma_values, 'degree': [2, 3, 4]},
]

# Instantiate GridSearchCV for SVM (5-fold, optimising F1, all cores).
# random_state fixes the randomised probability calibration enabled by probability=True.
grid_search_svm = GridSearchCV(
    SVC(probability=True, random_state=1234),
    param_grid_svm,
    cv=5,
    scoring='f1',
    verbose=1,
    n_jobs=-1,
)

# Fit the grid search to the data
grid_search_svm.fit(X_train_prepared, y_train)

# Get the best hyperparameters and best estimator for SVM
# (best_params_svm only contains the keys of the sub-grid the winner came from)
best_params_svm = grid_search_svm.best_params_
best_svm_model = grid_search_svm.best_estimator_

print(f"Best hyperparameters for SVM: {best_params_svm}")

# Predict on the test set using the optimized SVM model
y_pred_optimized_svm = best_svm_model.predict(X_test_prepared)

# Calculate evaluation metrics for the optimized SVM model
optimized_svm_metrics = evaluate_model(y_test, y_pred_optimized_svm)

# Add the optimized SVM model's results to the DataFrame
results.loc['Optimized Support Vector Machine'] = optimized_svm_metrics

# Display the updated results DataFrame
display(results)

# Identify the best model based on F1-score
best_model = results['F1-score'].idxmax()
print(f"\nBest Model (based on F1-score): {best_model}")

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [None]:

# Load the Kaggle test file; adjust the path to where it is saved in Google Drive.
df_test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/test.csv')

# Clean a copy so the raw test frame stays available.
df_test_cleaned = df_test.copy()

# Age: per-class median imputation; the medians are computed from this test file.
df_test_cleaned['Age'] = (
    df_test_cleaned
    .groupby('Pclass')['Age']
    .transform(lambda ages: ages.fillna(ages.median()))
)

# Fare: filled with the median fare of this test file.
df_test_cleaned['Fare'] = df_test_cleaned['Fare'].fillna(df_test_cleaned['Fare'].median())

# Cabin: keep only a 0/1 "cabin recorded" flag; pop() removes the raw column.
df_test_cleaned['Has_Cabin'] = df_test_cleaned.pop('Cabin').notna().astype(int)

# Sex: encode as 0 (male) / 1 (female).
df_test_cleaned['Sex'] = df_test_cleaned['Sex'].map({'male': 0, 'female': 1})

# Embarked: replace the text column with one indicator column per port.
embarked_dummies_test = pd.get_dummies(df_test_cleaned['Embarked'], prefix='Embarked')
df_test_cleaned = df_test_cleaned.drop(columns='Embarked').join(embarked_dummies_test)

# Assuming 'process_data' function was defined and used for the training data
# Apply the same processing steps to the test data
# Ensure the process_data function is defined and available
def process_data(df):
    """Add engineered features to a Titanic passenger frame.

    The age bins are right-closed, (0, 12], (12, 20], (20, 40], (40, 120],
    which is how the training data was binned before the encoder and models
    were fitted. Binning the test data with different edges would put ages
    12, 20 and 40 into a different group than the one seen in training.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SibSp``, ``Parch``, ``Name``, ``Fare``
        and ``Age``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns ``FamilySize``, ``IsAlone``,
        ``Title``, ``FarePerPerson`` and ``AgeGroup`` added. Rows whose age
        is missing or outside the bins get the ``AgeGroup`` value
        ``'Unknown'``. The input frame is not modified.
    """
    # Work on a copy so re-running the calling cell never stacks changes onto
    # a frame that was already processed.
    df = df.copy()

    # Family size: siblings/spouses + parents/children + the passenger.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # IsAlone: 1 unless the passenger travels with at least one relative.
    df['IsAlone'] = 1
    df.loc[df['FamilySize'] > 1, 'IsAlone'] = 0

    # Title: the word directly before the first '.' in the name ("Mr", "Miss", ...).
    # Raw string so that '\.' is a regex escape, not an invalid string escape.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    # Fold spelling variants into the common titles.
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # Fare per person: ticket fare divided by family size.
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Age group: right-closed bins (the pd.cut default), matching the training-time binning.
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 120], labels=['Child', 'Teenager', 'YoungAdult', 'Adult'])
    # Ages that fall in no bin (missing values included) become an explicit 'Unknown' category.
    df['AgeGroup'] = df['AgeGroup'].cat.add_categories('Unknown').fillna('Unknown')

    return df

# Add the engineered columns to the cleaned test frame.
df_test_cleaned = process_data(df_test_cleaned)

# `numerical_features` and `categorical_features` are deliberately NOT redefined here:
# the scaler and encoder were fitted with the lists defined in the preprocessing cell,
# and a second copy of those lists could silently drift out of sync with them.

# Align columns with the training matrix X: any column X has but the test frame lacks
# (for example an Embarked_* indicator for a port absent from the test file) is added as 0.
train_cols = X.columns.tolist()
test_cols = df_test_cleaned.columns.tolist()

missing_in_test = set(train_cols) - set(test_cols)
for c in sorted(missing_in_test):
    df_test_cleaned[c] = 0

# Select the training columns in training order. The explicit copy makes X_test_final an
# independent frame, so the column assignments below do not write into a slice of
# df_test_cleaned (which raises SettingWithCopyWarning).
X_test_final = df_test_cleaned[train_cols].copy()

# Scale numerical features using the SAME scaler fitted on the training data
X_test_numerical_final = scaler.transform(X_test_final[numerical_features])

# Encode categorical features using the SAME encoder fitted on the training data.
# Categories not seen during fitting are encoded as all zeros (handle_unknown='ignore').
# Columns that are neither object nor category dtype are cast to object before encoding.
for col in categorical_features:
    if col in X_test_final.columns and X_test_final[col].dtype != 'object' and X_test_final[col].dtype.name != 'category':
        X_test_final[col] = X_test_final[col].astype('object')

X_test_categorical_final = encoder.transform(X_test_final[categorical_features])

# Concatenate scaled numerical and encoded categorical features
X_test_prepared_final = np.concatenate([X_test_numerical_final, X_test_categorical_final], axis=1)

# Use the optimized logistic regression model trained previously
final_model = optimized_logreg_model

# Predict survival on the test dataset
predictions = final_model.predict(X_test_prepared_final)

# Create the submission DataFrame with PassengerId and Survived columns
submission_df = pd.DataFrame({'PassengerId': df_test_cleaned['PassengerId'], 'Survived': predictions})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('/content/drive/My Drive/Colab Notebooks/submission_logistic_regression.csv', index=False)

print("Submission file created successfully: submission_logistic_regression.csv")
print(submission_df.head())

In [None]:
# Read the submission file back from Drive and show its first 10 rows as a sanity check.
outcome = pd.read_csv('/content/drive/My Drive/Colab Notebooks/submission_logistic_regression.csv')
outcome.head(10)