# Titanic - Machine Learning from Disaster
## Peihao Chen / Siqi Wang
### 2023-12-29

#### 1. Data preparation

##### 1.1 Load data and understand the data

In [69]:
import pandas as pd
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
print(train_data.shape)
print(test_data.shape)
print(train_data.head())
print(train_data.columns)

(891, 12)
(418, 11)
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500

##### 1.2 Check the data type and missing value

In [70]:
# Check discrete and continuous variables
print(train_data.info())
# Check missing values
print(train_data.isnull().sum())
# too many missing values in Cabin, so drop it
train_data.drop('Cabin', axis=1, inplace=True)
test_data.drop('Cabin', axis=1, inplace=True)
# drop other Nan values
# train_data.dropna(inplace=True)
# test_data.dropna(inplace=True)
# use SimpleImputer to fill missing values(age, fare, Embarked)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(train_data[['Embarked']])
train_data['Embarked'] = imputer.transform(train_data[['Embarked']]).ravel()
test_data['Embarked'] = imputer.transform(test_data[['Embarked']]).ravel()
imputer = SimpleImputer(strategy='median')
imputer.fit(train_data[['Age']])
train_data['Age'] = imputer.transform(train_data[['Age']]).ravel()
test_data['Age'] = imputer.transform(test_data[['Age']]).ravel()
imputer = SimpleImputer(strategy='median')
imputer.fit(train_data[['Fare']])
train_data['Fare'] = imputer.transform(train_data[['Fare']]).ravel()
test_data['Fare'] = imputer.transform(test_data[['Fare']]).ravel()
# check missing values again
print(train_data.isnull().sum())
print(test_data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int6

In [71]:
# drop unnecessary columns
train_data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
test_data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
print(train_data.head())

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S


In [72]:
# one-hot encoding
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)
print(train_data.head())

   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \
0         0       3  22.0      1      0   7.2500       False      True   
1         1       1  38.0      1      0  71.2833        True     False   
2         1       3  26.0      0      0   7.9250        True     False   
3         1       1  35.0      1      0  53.1000        True     False   
4         0       3  35.0      0      0   8.0500       False      True   

   Embarked_C  Embarked_Q  Embarked_S  
0       False       False        True  
1        True       False       False  
2       False       False        True  
3       False       False        True  
4       False       False        True  


##### split data and standardize data

In [73]:
# split train and test data
from sklearn.model_selection import train_test_split
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.model_selection import RandomizedSearchCV
import numpy as np



##### Model 1: Logistic Regression

In [74]:
# Build LR model and RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np
lr = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l1', 'l2']
# randomize C
C = np.random.uniform(0, 1000, 10000)
hyperparameters = dict(C=C, penalty=penalty, solver=solvers)
print(hyperparameters)
clf = RandomizedSearchCV(lr, hyperparameters, cv=5, verbose=0,n_iter=100)
best_model_logistic = clf.fit(X_train, y_train)
print('Best penalty:', best_model_logistic.best_estimator_.get_params()['penalty'])
print('Best C:', best_model_logistic.best_estimator_.get_params()['C'])
print('Best solver:', best_model_logistic.best_estimator_.get_params()['solver'])
print('Accuracy:', best_model_logistic.score(X_test, y_test))

{'C': array([ 18.54292715, 678.92015857, 783.58969683, ..., 492.87556721,
       992.06052452, 913.42901785]), 'penalty': ['l1', 'l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear']}


Best penalty: l2
Best C: 159.4647209524307
Best solver: liblinear
Accuracy: 0.8100558659217877


150 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.pena

##### Model 2: Random Forest

In [75]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
n_estimators = [int(x) for x in np.linspace(start=10, stop=1000, num=100)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num=10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
hyperparameters = dict(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth,
                       min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, bootstrap=bootstrap)
print(hyperparameters)
clf = RandomizedSearchCV(rf, hyperparameters, cv=5, verbose=0, n_iter=100)
best_model_rf = clf.fit(X_train, y_train)
print('Best n_estimators:', best_model_rf.best_estimator_.get_params()['n_estimators'])
print('Best max_features:', best_model_rf.best_estimator_.get_params()['max_features'])
print('Best max_depth:', best_model_rf.best_estimator_.get_params()['max_depth'])
print('Best min_samples_split:', best_model_rf.best_estimator_.get_params()['min_samples_split'])
print('Best min_samples_leaf:', best_model_rf.best_estimator_.get_params()['min_samples_leaf'])
print('Best bootstrap:', best_model_rf.best_estimator_.get_params()['bootstrap'])
print('Accuracy:', best_model_rf.score(X_test, y_test))

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650, 660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780, 790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910, 920, 930, 940, 950, 960, 970, 980, 990, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


275 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
275 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/uti

Best n_estimators: 770
Best max_features: sqrt
Best max_depth: 40
Best min_samples_split: 10
Best min_samples_leaf: 1
Best bootstrap: True
Accuracy: 0.8491620111731844


##### Model 3: SVM

In [76]:
from sklearn.svm import SVC
svc = SVC()
C = np.random.uniform(0, 1000, 10000)
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
hyperparameters = dict(C=C, kernel=kernel)
print(hyperparameters)
clf = RandomizedSearchCV(svc, hyperparameters, cv=5, verbose=0)
best_model_svm = clf.fit(X_train, y_train)
print('Best C:', best_model_svm.best_estimator_.get_params()['C'])
print('Best kernel:', best_model_svm.best_estimator_.get_params()['kernel'])
print('Best gamma:', best_model_svm.best_estimator_.get_params()['gamma'])
print('Accuracy:', best_model_svm.score(X_test, y_test))

{'C': array([613.19515282, 346.17012939, 983.53828178, ...,  50.86857419,
       540.74206315,  65.78284796]), 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}


Best C: 168.39341289868293
Best kernel: rbf
Best gamma: scale
Accuracy: 0.7821229050279329


##### Model 4: Naive Bayes

In [77]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
var_smoothing = np.random.uniform(0, 1, 10000)
hyperparameters = dict(var_smoothing=var_smoothing)
print(hyperparameters)
clf = RandomizedSearchCV(nb, hyperparameters, cv=5, verbose=0, n_iter=100)
best_model_nb = clf.fit(X_train, y_train)
print('Best var_smoothing:', best_model_nb.best_estimator_.get_params()['var_smoothing'])
print('Accuracy:', best_model_nb.score(X_test, y_test))

{'var_smoothing': array([0.40593424, 0.93040614, 0.10121666, ..., 0.6293195 , 0.50933576,
       0.93576917])}
Best var_smoothing: 0.04662010366367131
Accuracy: 0.776536312849162


##### Model5: XGBoost

In [78]:
# use XGBoost
from xgboost import XGBClassifier
xgb = XGBClassifier()
n_estimators = [int(x) for x in np.linspace(start=10, stop=1000, num=100)]
learning_rate = [0.01, 0.05, 0.1, 0.2, 0.3]
booster = ['gbtree', 'gblinear', 'dart']
hyperparameters = dict(n_estimators=n_estimators, learning_rate=learning_rate, booster=booster)
print(hyperparameters)
clf = RandomizedSearchCV(xgb, hyperparameters, cv=5, verbose=0, n_iter=100)
best_model_xgb = clf.fit(X_train, y_train)
print('Best n_estimators:', best_model_xgb.best_estimator_.get_params()['n_estimators'])
print('Best max_depth:', best_model_xgb.best_estimator_.get_params()['max_depth'])
print('Best learning_rate:', best_model_xgb.best_estimator_.get_params()['learning_rate'])
print('Best booster:', best_model_xgb.best_estimator_.get_params()['booster'])
print('Accuracy:', best_model_xgb.score(X_test, y_test))

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650, 660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780, 790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910, 920, 930, 940, 950, 960, 970, 980, 990, 1000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3], 'booster': ['gbtree', 'gblinear', 'dart']}


Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters:

Best n_estimators: 470
Best max_depth: None
Best learning_rate: 0.01
Best booster: gbtree
Accuracy: 0.8379888268156425


##### Train the model

In [79]:
# use all data to train model
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived']
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(test_data)
# train model with all data and best hyperparameters
best_model_logistic.fit(X_train, y_train)
best_model_rf.fit(X_train, y_train)
best_model_svm.fit(X_train, y_train)
best_model_nb.fit(X_train, y_train)
best_model_xgb.fit(X_train, y_train)
# predict
y_pred_logistic = best_model_logistic.predict(X_test)
y_pred_rf = best_model_rf.predict(X_test)
y_pred_svm = best_model_svm.predict(X_test)
y_pred_nb = best_model_nb.predict(X_test)
y_pred_xgb = best_model_xgb.predict(X_test)



180 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/Caesar/Downloads/Kaggle/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.pen

In [80]:
# save result
test_data = pd.read_csv('test.csv')
result = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_pred_logistic})
result.to_csv('result_logistic.csv', index=False)
result = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_pred_rf})
result.to_csv('result_rf.csv', index=False)
result = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_pred_svm})
result.to_csv('result_svm.csv', index=False)
result = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_pred_nb})
result.to_csv('result_nb.csv', index=False)
result = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_pred_xgb})
result.to_csv('result_xgb.csv', index=False)