In [None]:
"""
This notebook builds a model that predicts which passengers survived the famous Titanic disaster (and hence how many survived in total).
All the data used here comes from the Kaggle Titanic competition.
The model is a logistic regression classifier.
"""

In [1]:
#Let's start by importing basic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the Kaggle Titanic CSV files (paths are relative to the working directory).
TrainSet, TestSet = [pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')]

In [4]:
# Count the missing entries of every training column in one pass, then report them one by one.
missing_per_column = TrainSet.isnull().sum()
for column_name, n_missing in missing_per_column.items():
    print('{} has a toltal of {} missing data'.format(column_name, n_missing))
    print('\n')


PassengerId has a toltal of 0 missing data


Survived has a toltal of 0 missing data


Pclass has a toltal of 0 missing data


Name has a toltal of 0 missing data


Sex has a toltal of 0 missing data


Age has a toltal of 177 missing data


SibSp has a toltal of 0 missing data


Parch has a toltal of 0 missing data


Ticket has a toltal of 0 missing data


Fare has a toltal of 0 missing data


Cabin has a toltal of 687 missing data


Embarked has a toltal of 2 missing data




In [6]:
# Preview the first rows of the training set to judge which columns are useful for predicting survival
TrainSet.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Preview the first rows of the test set as well (note that it has no Survived column)
TestSet.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
"""
Here are a few things we observed:
1. Our training and test data have some missing values
2. Not every column is important for determining the survival of a passenger (at least not for this model).
    Columns such as:
    * Name
    * Ticket
    * Fare (Not important as Pclass already covers this)
    * Cabin
    * Embarked
3. Our given test set does not have the dependent variable (Survived column)
Based on these observations, we will:
- replace the missing Age values with the median of the column
- remove the columns that are not needed
- take out the dependent variable (Survived column) from the training set

"""

'\nHere are a few things we observed:\n1. Our training and test data have some missing values\n2. Not every column is important for determining the survival of a passenger (at least not for this model).\n    Columns such as:\n    * Name\n    * Ticket\n    * Fare (Not important as Pclass already covers this)\n    * Cabin\n    * Embarked\n3. Our given test set does not have the dependent variable (Survived column)\nBased on these observations, we will:\n- replace the missing Age values with the median of the column\n- remove the columns that are not needed\n- take out the dependent variable (Survived column) from the training set\n\n'

In [9]:
# Separate the dependent variable (Survived) from the feature columns, selecting by column name.
# The feature order matters: later cells address Sex as column 2 and Age as column 3 of X_train.
# NOTE(review): PassengerId is a row identifier, yet it is kept as a model feature here - confirm this is intended.
Y_train = TrainSet['Survived'].values
X_train = TrainSet[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']].values

In [10]:
# Fill the missing Age values (column 3 of X_train) with the median age.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;
# `sklearn.impute.SimpleImputer` is its replacement, so prefer it and only fall back to the
# old class on installations that do not ship it yet.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
except ImportError:
    from sklearn.preprocessing import Imputer
    imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)
# X_train is an object array (it also holds the Sex strings), so cast the Age column to float explicitly.
# NOTE(review): the median is learned from the full training set before the hold-out split further down,
# so the held-out rows influence it slightly.
age_train = X_train[:, 3:4].astype(float)
imputer = imputer.fit(age_train)
X_train[:, 3:4] = imputer.transform(age_train)


In [21]:
# Hold out a third of the labelled training data as a temporary test set. This lets us review the
# model, because the original test set does not contain the dependent (Survived) column.
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# the same `train_test_split` function lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
X_train_temp, X_test_temp, Y_train_temp, Y_test_temp = train_test_split(
    X_train, Y_train, test_size = 0.33, random_state = 0)


In [12]:
# If the cell above shows a DeprecationWarning about sklearn.cross_validation, do not worry: it only says that the
# module has been superseded by sklearn.model_selection (cross_validation was removed in scikit-learn 0.20).

In [22]:
# Preview the first ten rows of the temporary training split, then of the temporary test split.
# (The arrays have six columns, so a column bound of 8 simply selects all of them.)
print(X_train_temp[:10, :8], end='\n\n\n')
print(X_test_temp[:10, :8])

[[243 2 'male' 29.0 0 0]
 [345 2 'male' 36.0 0 0]
 [171 1 'male' 61.0 0 0]
 [188 1 'male' 45.0 0 0]
 [801 2 'male' 34.0 0 0]
 [458 1 'female' 28.0 1 0]
 [653 3 'male' 21.0 0 0]
 [452 3 'male' 28.0 1 0]
 [79 2 'male' 0.83 0 2]
 [890 1 'male' 26.0 0 0]]


[[496 3 'male' 28.0 0 0]
 [649 3 'male' 28.0 0 0]
 [279 3 'male' 7.0 4 1]
 [32 1 'female' 28.0 1 0]
 [256 3 'female' 29.0 0 2]
 [299 1 'male' 28.0 0 0]
 [610 1 'female' 40.0 0 0]
 [319 1 'female' 31.0 0 2]
 [485 1 'male' 25.0 1 0]
 [368 3 'female' 28.0 0 0]]


In [23]:
#Encoding categorical data 'Sex' (column 2) from the temporary data.
# Both encoders are fitted on the temporary TRAINING split only; the temporary test split is
# merely transformed, so it is guaranteed to get the same label mapping and column layout.
# NOTE: `categorical_features` was deprecated in scikit-learn 0.20 and removed in 0.22 (its
# replacement is sklearn.compose.ColumnTransformer). It is kept here because later cells reuse
# `onehotencoder_X` and call `.toarray()` on its output.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#Temporary Training Set
labelencoder_X = LabelEncoder()
X_train_temp[:, 2] = labelencoder_X.fit_transform(X_train_temp[:, 2])
onehotencoder_X = OneHotEncoder(categorical_features = [2])
X_train_temp = onehotencoder_X.fit_transform(X_train_temp).toarray()

#Temporary Test Set (transform only - never re-fit the encoders on test data)
X_test_temp[:, 2] = labelencoder_X.transform(X_test_temp[:, 2])
X_test_temp = onehotencoder_X.transform(X_test_temp).toarray()


In [26]:
# Feature scaling: learn the scaling statistics from the temporary training split only,
# then apply that same transformation to both temporary splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train_temp)
X_train_temp = scaler.transform(X_train_temp)
X_test_temp = scaler.transform(X_test_temp)


In [27]:
# Fit a logistic regression classifier on the temporary training split
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier = classifier.fit(X_train_temp, Y_train_temp)  # fit() returns the estimator itself
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# Use the trained model to predict the dependent variable (Survived) for the temporary test split
Y_pred_temp = classifier.predict(X_test_temp)

In [29]:
# Compare the hold-out predictions with the true labels using a confusion matrix
# (rows are the actual classes, columns the predicted classes).
from sklearn.metrics import confusion_matrix
cm_temp = confusion_matrix(y_true = Y_test_temp, y_pred = Y_pred_temp)
print(cm_temp)

[[152  32]
 [ 31  80]]


In [39]:
# From our confusion matrix above, we can see that we correctly predicted 152 "No" (did not survive)
# and 80 "Yes" (survived). This gives a total of 232 correct predictions out of 295.
# This means our prediction for the temporary set of data had 78.64% accuracy.
# TODO: visualize this in a graph.
# Next we will train on the full training set to predict the real test set. This should be more accurate, as we will have more data to train on.

In [40]:
""" Remember we have these below already
Y_train = TrainSet.iloc[:, 1].values
X_train = TrainSet.iloc[:,[0,2,4,5,6,7]].values
"""
# The temporary splits used so far were carved out of the training data.
# Now build the feature matrix for the original test set, with the same columns (in the same order) as X_train.
X_test = TestSet[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']].values

In [42]:
# Fill the missing Age values of the test set (column 3) with the median learned from the
# TRAINING set: `imputer` was fitted on X_train's Age column in an earlier cell. Fitting a
# second imputer on the test set would fill train and test rows with different values.
imputerT = imputer  # same fitted object; the old name is kept for backward compatibility
X_test[:, 3:4] = imputerT.transform(X_test[:, 3:4].astype(float))

print(X_test[:10, 0:])

[[892 3 'male' 34.5 0 0]
 [893 3 'female' 47.0 1 0]
 [894 2 'male' 62.0 0 0]
 [895 3 'male' 27.0 0 0]
 [896 3 'female' 22.0 1 1]
 [897 3 'male' 14.0 0 0]
 [898 3 'female' 30.0 0 0]
 [899 2 'male' 26.0 1 1]
 [900 3 'female' 18.0 0 0]
 [901 3 'male' 21.0 2 0]]


In [43]:
# Encode the categorical 'Sex' column (index 2) of the full training set and of the test set.
# `labelencoder_X` and `onehotencoder_X` were created in an earlier cell. They are re-fitted here
# on the full training set, and the test set is only transformed so that both sets share
# exactly the same encoding.
#Training Set
X_train[:, 2] = labelencoder_X.fit_transform(X_train[:, 2])
X_train = onehotencoder_X.fit_transform(X_train).toarray()

#Test Set (transform only - never re-fit the encoders on test data)
X_test[:, 2] = labelencoder_X.transform(X_test[:, 2])
X_test = onehotencoder_X.transform(X_test).toarray()


In [44]:
# Feature scaling: re-fit the existing scaler on the full training set, then apply it to both sets
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [45]:
# Re-fit the classifier on the full training set, then predict survival for the test set
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)


In [46]:
# Let us see what our predictions look like. Note that we don't have the actual values for the test set, so we cannot compute a confusion matrix.
# Instead, we use a simple function to count the total number of 'YES' and 'NO' predictions our model made.

def summary(predicted):
    """Print how many predictions are "NO" (label 0) and how many are "YES" (any other label).

    Parameters
    ----------
    predicted : array-like
        Predicted class labels. Objects with a ``tolist()`` method (e.g. NumPy arrays)
        as well as plain iterables such as lists or tuples are accepted.

    Returns
    -------
    None
        The two totals are printed, not returned.
    """
    labels = predicted.tolist() if hasattr(predicted, "tolist") else list(predicted)
    # Only an exact 0 counts as "NO"; every other label is tallied as "YES".
    sum_no = sum(1 for label in labels if label == 0)
    sum_yes = len(labels) - sum_no
    print("Total Predicted \"NO\" (Did Not Survive) = {} \nTotal predicted \"YES\" (Survived) = {}.".format(sum_no, sum_yes))

# Report how many test-set passengers the model predicts as non-survivors and as survivors
summary(Y_pred)


Total Predicted "NO" (Did Not Survive) = 254 
Total predicted "YES" (Survived) = 164.
