In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
train_data = pd.read_csv('train.csv')

In [3]:
#test_data = pd.read_csv('test.csv')

In [4]:
#test_data.isna().sum()

In [5]:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
train_data.shape

(891, 12)

In [9]:
train_data['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

##### Data Preprocessing

In [10]:
train_data.drop(columns = 'Cabin', inplace = True)
#test_data.drop(columns = 'Cabin', inplace = True)

In [11]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [12]:
#replacing missing values for age with mean value
train_data['Age'].fillna(train_data['Age'].mean, inplace = True)
#test_data['Age'].fillna(test_data['Age'].mean, inplace = True)


In [13]:
#replacing missing values in Embarked Column with mode
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace = True)

In [14]:
train_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [15]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [16]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,0.0,31.0
max,891.0,1.0,3.0,8.0,6.0,512.3292


In [17]:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age             object
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [18]:
train_data['Sex'] = train_data['Sex'].astype(str)
train_data['Embarked'] = train_data['Embarked'].astype(str)
train_data['Fare'] = train_data['Fare'].astype('int64')
train_data['Age'] = train_data['Fare'].astype('int64')


#test_data['Sex'] = train_data['Sex'].astype(str)
#test_data['Embarked'] = train_data['Embarked'].astype(str)
#test_data['Fare'] = train_data['Fare'].astype('int64')
#test_data['Age'] = train_data['Fare'].astype('int64')

encoder = LabelEncoder()

train_data['Sex'] = encoder.fit_transform(train_data['Sex'])
train_data['Embarked'] = encoder.fit_transform(train_data['Embarked'])
#test_data['Sex'] = encoder.fit_transform(test_data['Sex'])
#test_data['Embarked'] = encoder.fit_transform(test_data['Embarked'])
train_data['Embarked'].value_counts()

2    646
0    168
1     77
Name: Embarked, dtype: int64

In [19]:
train_data.dtypes

PassengerId     int64
Survived        int64
Pclass          int64
Name           object
Sex             int32
Age             int64
SibSp           int64
Parch           int64
Ticket         object
Fare            int64
Embarked        int32
dtype: object

In [20]:
train_data['Sex'].value_counts() #1 = male, 0 = female

1    577
0    314
Name: Sex, dtype: int64

In [21]:
X = train_data.drop(columns = ['PassengerId', 'Ticket', 'Survived', 'Name'])
Y = train_data['Survived']
#X_test = test_data.drop(columns = ['PassengerId', 'Ticket', 'Name'])
#Y_test = test_data['Survived']

##### Standardizing the data

In [22]:
scaler = StandardScaler()
scaler.fit(X)
standardized_data = scaler.transform(X)
#standardized_X_test_data = scaler.transform(X_test)
X = standardized_data
#X_test = standardized_X_test_data
Y = train_data['Survived']
print(X, Y)

[[ 0.82737724  0.73769513 -0.49894756 ... -0.47367361 -0.49894756
   0.58595414]
 [-1.56610693 -1.35557354  0.78940535 ... -0.47367361  0.78940535
  -1.9423032 ]
 [ 0.82737724 -1.35557354 -0.49894756 ... -0.47367361 -0.49894756
   0.58595414]
 ...
 [ 0.82737724 -1.35557354 -0.17685933 ...  2.00893337 -0.17685933
   0.58595414]
 [-1.56610693  0.73769513 -0.03594573 ... -0.47367361 -0.03594573
  -1.9423032 ]
 [ 0.82737724  0.73769513 -0.49894756 ... -0.47367361 -0.49894756
  -0.67817453]] 0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


##### Train test split

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .1, stratify = Y)
print(X.shape, X_train.shape, X_test.shape)

(891, 7) (801, 7) (90, 7)


###### Model selection

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix 
from sklearn import svm

# Define parameter grids for randomized search (coarse search)
logistic_param_grid_coarse = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

xgboost_param_grid_coarse = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

svm_param_grid_coarse = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf']
}

# RandomizedSearchCV for each model (coarse search)
logistic_random_search_coarse = RandomizedSearchCV(LogisticRegression(), logistic_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)
xgboost_random_search_coarse = RandomizedSearchCV(XGBClassifier(objective='binary:logistic'), xgboost_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)
svm_random_search_coarse = RandomizedSearchCV(svm.SVC(), svm_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)

# Fit models using RandomizedSearchCV (coarse search)
logistic_random_search_coarse.fit(X_train, Y_train)
xgboost_random_search_coarse.fit(X_train, Y_train)
svm_random_search_coarse.fit(X_train, Y_train)

# Get best hyperparameters from RandomizedSearchCV (coarse search)
best_logistic_params_coarse = logistic_random_search_coarse.best_params_
best_xgboost_params_coarse = xgboost_random_search_coarse.best_params_
best_svm_params_coarse = svm_random_search_coarse.best_params_

# Define parameter grids for GridSearchCV (fine search)
logistic_param_grid_fine = {
    'penalty': [best_logistic_params_coarse['penalty']],
    'C': [best_logistic_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'solver': [best_logistic_params_coarse['solver']]
}

xgboost_param_grid_fine = {
    'learning_rate': [best_xgboost_params_coarse['learning_rate'] * i for i in [0.5, 1, 2]],
    'n_estimators': [best_xgboost_params_coarse['n_estimators']],
    'max_depth': [best_xgboost_params_coarse['max_depth']],
    'min_child_weight': [best_xgboost_params_coarse['min_child_weight']],
    'subsample': [best_xgboost_params_coarse['subsample']],
    'colsample_bytree': [best_xgboost_params_coarse['colsample_bytree']]
}

svm_param_grid_fine = {
    'C': [best_svm_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'gamma': [best_svm_params_coarse['gamma'] * i for i in [0.1, 1, 10]],
    'kernel': [best_svm_params_coarse['kernel']]
}

# GridSearchCV for each model (fine search)
logistic_grid_search_fine = GridSearchCV(LogisticRegression(), param_grid=logistic_param_grid_fine, cv=5, n_jobs=-1)
xgboost_grid_search_fine = GridSearchCV(XGBClassifier(objective='binary:logistic'), param_grid=xgboost_param_grid_fine, cv=5, n_jobs=-1)
svm_grid_search_fine = GridSearchCV(svm.SVC(), param_grid=svm_param_grid_fine, cv=5, n_jobs=-1)

# Fit models using GridSearchCV (fine search)
logistic_grid_search_fine.fit(X_train, Y_train)
xgboost_grid_search_fine.fit(X_train, Y_train)
svm_grid_search_fine.fit(X_train, Y_train)

# Print best hyperparameters from GridSearchCV (fine search)
print("Logistic Regression Best Parameters (Fine Search):", logistic_grid_search_fine.best_params_)
print("XGBoost Best Parameters (Fine Search):", xgboost_grid_search_fine.best_params_)
print("SVM Best Parameters (Fine Search):", svm_grid_search_fine.best_params_)

# Compare cross-validated scores of each model
logistic_cv_score_fine = logistic_grid_search_fine.best_score_
xgboost_cv_score_fine = xgboost_grid_search_fine.best_score_
svm_cv_score_fine = svm_grid_search_fine.best_score_

# Select the best model based on cross-validated scores
best_model_fine = None
if logistic_cv_score_fine >= xgboost_cv_score_fine and logistic_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = logistic_grid_search_fine.best_estimator_
elif xgboost_cv_score_fine >= logistic_cv_score_fine and xgboost_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = xgboost_grid_search_fine.best_estimator_
else:
    best_model_fine = svm_grid_search_fine.best_estimator_

# Evaluate the best model on the test set
train_accuracy_fine = best_model_fine.score(X_train, Y_train)
print("Best Model Train Accuracy (Fine Search):", train_accuracy_fine)
test_accuracy_fine = best_model_fine.score(X_test, Y_test)
print("Best Model Test Accuracy (Fine Search):", test_accuracy_fine)






Logistic Regression Best Parameters (Fine Search): {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}
XGBoost Best Parameters (Fine Search): {'colsample_bytree': 0.5, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.8}
SVM Best Parameters (Fine Search): {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Best Model Train Accuracy (Fine Search): 0.8389513108614233
Best Model Test Accuracy (Fine Search): 0.8666666666666667


###### Model Eval

In [25]:
y_pred = best_model_fine.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
roc_auc = roc_auc_score(Y_test, best_model_fine.predict_proba(X_test)[:, 1])
conf_matrix = confusion_matrix(Y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8666666666666667
Precision: 0.8484848484848485
Recall: 0.8
F1 Score: 0.823529411764706
ROC AUC Score: 0.9111688311688312
Confusion Matrix:
[[50  5]
 [ 7 28]]
