# Titanic Machine Learning

In [1]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  

In [2]:
# Read the Titanic training and test tables from CSV files in the working directory.
df_train = pd.read_csv('train_titanic.csv')
df_test = pd.read_csv('test_titanic.csv')

# Add a constant 'Bias' column of ones to both tables
# (one (n_rows, 1) array of ones per table).
bias2 = np.ones(shape=(df_train.shape[0], 1))
bias1 = np.ones(shape=(df_test.shape[0], 1))
df_test['Bias'] = bias1
df_train['Bias'] = bias2

In [3]:
# Model inputs are exactly the columns of the test table, selected from both
# tables so train and test share one schema.
columns = list(df_test.columns)
test_features = df_test[columns]
train_features = df_train[columns]

# Training labels ('Survived') as a plain list of Python floats.
train_target = [float(label) for label in df_train['Survived'].values]

In [4]:
# Show the dtype of every feature column; the following cell splits the
# columns on whether their dtype is 'object'.
print(train_features.dtypes)

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Bias           float64
dtype: object


In [5]:
# Partition the feature columns by dtype: 'object' columns go to str_columns,
# every other dtype goes to num_columns.
# Iterating over (column name, dtype) pairs avoids `dtypes[i]`, an integer
# lookup on a label-indexed Series that pandas has deprecated.
str_columns = []
num_columns = []
for column_name, column_dtype in train_features.dtypes.items():
    if column_dtype != 'object':
        num_columns.append(column_name)
    else:
        str_columns.append(column_name)
print(str_columns)
print(num_columns)

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Bias']


In [6]:
# Text features: missing entries become the placeholder string 'no'.
train_features_str = train_features[str_columns].fillna('no')
print(train_features_str)

# Numeric features: missing entries become 0.
train_features_num = train_features[num_columns].fillna(0)
print(train_features_num)

                                                  Name     Sex  \
0                              Braund, Mr. Owen Harris    male   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2                               Heikkinen, Miss. Laina  female   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4                             Allen, Mr. William Henry    male   
..                                                 ...     ...   
886                              Montvila, Rev. Juozas    male   
887                       Graham, Miss. Margaret Edith  female   
888           Johnston, Miss. Catherine Helen "Carrie"  female   
889                              Behr, Mr. Karl Howell    male   
890                                Dooley, Mr. Patrick    male   

               Ticket Cabin Embarked  
0           A/5 21171    no        S  
1            PC 17599   C85        C  
2    STON/O2. 3101282    no        S  
3              113803  C123        S  
4           

In [7]:
# Replace each text column with integer codes from a LabelEncoder fitted on
# that column. A column containing any non-string value is left untouched.
le = LabelEncoder()
for column in str_columns:
    column_values = train_features_str[column].values
    if not all(isinstance(value, str) for value in column_values):
        continue
    old = le.fit(column_values).classes_
    new = le.transform(column_values)
    # Drop and re-add, so the encoded column ends up at the end of the frame.
    train_features_str = train_features_str.drop(columns=column)
    train_features_str[column] = new

In [8]:
# Put the encoded text columns and the numeric columns side by side again,
# forming the single training feature table.
train_parts = [train_features_str, train_features_num]
train_features = pd.concat(train_parts, axis='columns')
print(train_features)

     Name  Sex  Ticket  Cabin  Embarked  PassengerId  Pclass   Age  SibSp  \
0     108    1     523    147         2            1       3  22.0      1   
1     190    0     596     81         0            2       1  38.0      1   
2     353    0     669    147         2            3       3  26.0      0   
3     272    0      49     55         2            4       1  35.0      1   
4      15    1     472    147         2            5       3  35.0      0   
..    ...  ...     ...    ...       ...          ...     ...   ...    ...   
886   548    1     101    147         2          887       2  27.0      0   
887   303    0      14     30         2          888       1  19.0      0   
888   413    0     675    147         2          889       3   0.0      1   
889    81    1       8     60         0          890       1  26.0      0   
890   220    1     466    147         1          891       3  32.0      0   

     Parch     Fare  Bias  
0        0   7.2500   1.0  
1        0  71.2833

In [9]:
# Split the test features exactly like the training features and fill gaps
# with the same placeholders ('no' for text, 0 for numbers).
test_features_str = test_features[str_columns]
test_features_str = test_features_str.fillna('no')
print(test_features_str)

test_features_num = test_features[num_columns]
test_features_num = test_features_num.fillna(0)
print(test_features_num)

# Encode the test text columns with the label -> code mapping of the TRAINING
# data. Fitting a fresh LabelEncoder on the test strings would number the
# labels by the test vocabulary, so one and the same string (e.g. Cabin 'no')
# would get different codes in train and test and the fitted models would see
# inconsistent inputs. Labels that never occur in training get a sentinel code.
UNSEEN_LABEL_CODE = -1

le = LabelEncoder()
for i in str_columns:
    if all(isinstance(y, str) for y in test_features_str[i].values):
        # df_train still holds the raw strings; filling with 'no' reproduces
        # the values the training-side encoder was fitted on.
        le.fit(df_train[i].fillna('no').values)
        old = le.classes_
        code_by_label = {label: code for code, label in enumerate(old)}
        new = np.array(
            [code_by_label.get(y, UNSEEN_LABEL_CODE) for y in test_features_str[i].values],
            dtype=np.int64,
        )
        test_features_str = test_features_str.drop(i, axis=1)
        test_features_str[i] = new

test_features = pd.concat([test_features_str, test_features_num], axis = 1)
print(test_features)

                                             Name     Sex              Ticket  \
0                                Kelly, Mr. James    male              330911   
1                Wilkes, Mrs. James (Ellen Needs)  female              363272   
2                       Myles, Mr. Thomas Francis    male              240276   
3                                Wirz, Mr. Albert    male              315154   
4    Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female             3101298   
..                                            ...     ...                 ...   
413                            Spector, Mr. Woolf    male           A.5. 3236   
414                  Oliva y Ocana, Dona. Fermina  female            PC 17758   
415                  Saether, Mr. Simon Sivertsen    male  SOTON/O.Q. 3101262   
416                           Ware, Mr. Frederick    male              359309   
417                      Peter, Master. Michael J    male                2668   

    Cabin Embarked  
0     

In [12]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to the test features.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)

## Using logistic regression

In [13]:
# Fit a logistic-regression classifier on the scaled training features.
LR = LogisticRegression(C=10, max_iter=800)
LR.fit(train_features, train_target)

# Training accuracy: share of rows whose predicted class equals the label.
predi = LR.predict(train_features)
comparison = [int(guess == truth) for guess, truth in zip(predi, train_target)]
print(np.sum(comparison)/len(train_target))

0.8092031425364759


In [15]:
# Score the training set again, this time with a custom decision threshold
# applied to column 1 of predict_proba instead of the default predict().
predi = LR.predict_proba(train_features)
train_pred = predi[:,1]
threshold = 0.6

# Predict 1 only when the probability is strictly above the threshold.
pred_train = [0 if probability <= threshold else 1 for probability in train_pred]
comparison = [int(guess == truth) for guess, truth in zip(pred_train, train_target)]
print(np.sum(comparison)/len(train_target))

0.8159371492704826


In [None]:
# Logistic-regression predictions for the test set, using the same 0.6
# threshold on column 1 of predict_proba as on the training set.
predi = LR.predict_proba(test_features)
test_pred = predi[:,1]
threshold = 0.6
pred_test = [0 if probability <= threshold else 1 for probability in test_pred]

In [None]:
# Write the predictions as a two-column CSV (PassengerId, Survived).
# The ids come from the test table itself instead of a hard-coded
# 892..1309 range, so each prediction stays paired with the right passenger
# even if the test CSV has a different size or id range.
ID = df_test['PassengerId'].values
d = {'PassengerId': ID, 'Survived': pred_test}
pred = pd.DataFrame(data=d)
pred.to_csv('titanic_pred.csv', index=False)

## Using MLP classifier (neural network)

In [16]:
# Fit a neural-network classifier (seeded with random_state=10) on the
# scaled training features.
NN = MLPClassifier(
    solver='adam',
    activation='logistic',
    random_state=10,
    max_iter=600,
    early_stopping=False,
)
NN.fit(train_features, train_target)

# Training accuracy with a 0.6 threshold on column 1 of predict_proba.
predi = NN.predict_proba(train_features)
train_pred = predi[:,1]
threshold = 0.6
pred_train = [0 if probability <= threshold else 1 for probability in train_pred]
comparison = [int(guess == truth) for guess, truth in zip(pred_train, train_target)]
print(np.sum(comparison)/len(train_target))

0.813692480359147


In [None]:
# Test-set predictions from the MLP classifier fitted in this section.
# This cell previously called LR.predict_proba, so the "MLP" submission
# actually contained the logistic-regression predictions.
predi = NN.predict_proba(test_features)
test_pred = predi[:,1]
threshold = 0.6
# Predict 1 only when the probability is strictly above the threshold.
pred_test = [0 if test_pred[i] <= threshold else 1 for i in range(len(test_pred))]

In [None]:
# Write the predictions as a two-column CSV (PassengerId, Survived).
# The ids come from the test table itself instead of a hard-coded
# 892..1309 range, so each prediction stays paired with the right passenger
# even if the test CSV has a different size or id range.
# NOTE(review): this writes to the same path as the logistic-regression
# section and therefore overwrites that file.
ID = df_test['PassengerId'].values
d = {'PassengerId': ID, 'Survived': pred_test}
pred = pd.DataFrame(data=d)
pred.to_csv('titanic_pred.csv', index=False)

## Using SVM classifier

In [None]:
from sklearn import svm

# Fit a polynomial-kernel SVM with probability estimates enabled.
# random_state is fixed (same seed as the MLP above) because the probability
# calibration behind predict_proba shuffles the data internally; without a
# seed the thresholded predictions can change from run to run.
svm_classifier = svm.SVC(gamma='scale', kernel='poly', probability=True, random_state=10)
svm_classifier.fit(train_features, train_target)

# Training accuracy with a 0.6 threshold on column 1 of predict_proba.
predi = svm_classifier.predict_proba(train_features)
train_pred = predi[:,1]
threshold = 0.6
pred_train = [0 if train_pred[i] <= threshold else 1 for i in range(len(train_pred))]
comparison = [1 if pred_train[i]==train_target[i] else 0 for i in range(len(pred_train))]
print(np.sum(comparison)/len(train_target))

In [None]:
# SVM predictions for the test set, using the same 0.6 threshold on
# column 1 of predict_proba as on the training set.
predi = svm_classifier.predict_proba(test_features)
test_pred = predi[:,1]
threshold = 0.6
pred_test = [0 if probability <= threshold else 1 for probability in test_pred]

In [None]:
# Write the predictions as a two-column CSV (PassengerId, Survived).
# The ids come from the test table itself instead of a hard-coded
# 892..1309 range, so each prediction stays paired with the right passenger
# even if the test CSV has a different size or id range.
# NOTE(review): this writes to the same path as the earlier sections and
# therefore overwrites their file.
ID = df_test['PassengerId'].values
d = {'PassengerId': ID, 'Survived': pred_test}
pred = pd.DataFrame(data=d)
pred.to_csv('titanic_pred.csv', index=False)