In [6]:
import pandas
import numpy as np

# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold

In [7]:
# Read the Titanic training set into a DataFrame; the cells below all work on `titanic`.
titanic = pandas.read_csv("train.csv")

In [8]:
# Peek at the first five rows to see which columns are available.
print(titanic.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [9]:
# Count the missing values in each column.
titanic.isnull().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Using the isnull() function above, we discovered that the data is not clean. There are 177 people that do not have an age. The Cabin also has multiple null values. However, we do not plan to do anything with that data at the moment. There is also no good way to "clean" that data. Most cleaning functions involve replacing null with the median values. However, there is no median value for cabin because it is a string. 

In [10]:
# Impute the missing ages with the median of the recorded ages.
age_median = titanic["Age"].median()
titanic["Age"] = titanic["Age"].fillna(age_median)
# Re-check the null counts: "Age" should now report 0.
print(titanic.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Currently sex is a string 'male' or 'female'. In order to do computations with this column of data, we need to translate this string into an integer value. This means that each value under sex will be changed to 0 if it says 'male' or 1 if it says 'female'

In [11]:
# List the distinct values in the column -- only 'male' and 'female' are expected.
print(titanic["Sex"].unique())

# Recode the labels as integers: male -> 0, female -> 1.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic.loc[titanic["Sex"] == sex_label, "Sex"] = sex_code

['male' 'female']


Another column we are interested in is the Embarked column. However, Embarked is currently a string with S, C, or Q for the different embarkation points. We need to translate these options into integers before they can be used: S = 0, C = 1, Q = 2, and nan = 0 (so missing values are treated the same as S). This is done below. 

In [12]:
# List the distinct values for "Embarked" (NaN marks the missing entries).
print(titanic["Embarked"].unique())

# Recode each port letter as an integer: S -> 0, C -> 1, Q -> 2.
for port_code, port_label in enumerate(("S", "C", "Q")):
    titanic.loc[titanic["Embarked"] == port_label, "Embarked"] = port_code
# Passengers with no recorded port get code 0, i.e. the same code as "S".
titanic.loc[titanic["Embarked"].isnull(), "Embarked"] = 0

print(titanic.head(5))

['S' 'C' 'Q' nan]
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name Sex  Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris   0   22      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...   1   38      1      0   
2                             Heikkinen, Miss. Laina   1   26      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)   1   35      1      0   
4                           Allen, Mr. William Henry   0   35      0      0   

             Ticket     Fare Cabin Embarked  
0         A/5 21171   7.2500   NaN        0  
1          PC 17599  71.2833   C85        1  
2  STON/O2. 3101282   7.9250   NaN        0  
3            113803  53.1000  C123        0  
4            373450   8.0500   NaN        0  


Next we will try to use the predictors to fit a model to the train data set. Each one of these predictors will be an x in the equation a = m1*x1 + m2*x2 + m3*x3 + ...
These predictors will be used to fit an equation to the data; by that I mean the algorithm will estimate values for m1, m2, m3, ... To do this, the data is split into 3 pieces. Two of the pieces are combined into the train set and the remaining one becomes the test set. First, a linear equation is fit to the two pieces that make up the training set. Then the fitted m values are used to make predictions for the held-out test piece, which is how the accuracy of the fit is later measured. Then the pieces are swapped so that a new combination of two of the pieces becomes the train set and the remaining piece becomes the test set. This is repeated until no new combinations can be made (three rounds in total). Each time, the m values are re-estimated from that round's training pieces, so they change from round to round.

In the following cell, we will split the data into three, combine all of them to use two as the train and one as the test at one time, fit a linear equation to the data, and then predict the target. 

In [13]:
# Feature columns fed to the model.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Plain linear regression; it is refit from scratch on every fold below.
alg = LinearRegression()
# Three cross-validation folds over the row positions of `titanic`; iterating
# over `kf` yields (train positions, test positions) pairs.
# NOTE(review): shuffling is not requested here, so random_state probably has no
# effect on the splits -- confirm against the KFold documentation.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

features = titanic[predictors]
target = titanic["Survived"]

predictions = []
for train, test in kf:
    # Fit on the rows of the training folds only ...
    alg.fit(features.iloc[train, :], target.iloc[train])
    # ... then predict the held-out fold and keep its predictions.
    predictions.append(alg.predict(features.iloc[test, :]))

In the following cell, we now have all of our predictions for each target (each person on the boat). These predictions are the result of equations, so they're not 1s and 0s for live and die. Therefore, we can generalize and say that if a value is greater than 0.5, the person will live and will be assigned a 1. Otherwise, if the value is 0.5 or less, the person will die and will be assigned a 0. 

In [14]:
import numpy as np

# The predictions are in three separate numpy arrays (one per fold).
# Concatenate them on axis 0, their only axis, to get one value per passenger.
predictions = np.concatenate(predictions, axis=0)

# Map the real-valued regression outputs to the two possible outcomes (1 and 0).
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0

# Rows whose thresholded prediction equals the recorded outcome.
out = titanic["Survived"][titanic["Survived"] == predictions]
print(len(out))
print(len(titanic["Survived"]))

# Convert explicitly to float so the ratio is a true division even where
# int / int truncates (this replaces the math.floor() call that was only
# acting as an int -> float cast).
accuracy = float(len(out)) / len(titanic["Survived"])
# Build one string so the line prints the same way as a statement or a function call.
print("%s %s" % ("accuracy score for linear regression ", accuracy))

698
891
accuracy score for linear regression  0.783389450056


Noticeably the accuracy for this model is 0.7833 or 78.33%. This is okay. This isn't larger because our fit only has three training sets and the outcomes from the linear regression are all over the place. So in order to improve the accuracy, we can use logistic regression, it will map the value to 0 or 1, depending on how extreme it is. This is done in the cell below. Our accuracy improved by a tiny bit, 0.00448933782 or .44%

In [15]:
from sklearn import cross_validation
# Import the logistic regression class
from sklearn.linear_model import LogisticRegression

# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the accuracy score for each of the 3 cross validation folds.
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (there is one per fold). The message is formatted
# into a single string so it prints as text rather than as a tuple when print
# is a statement.
print("%s %s" % ("mean score for logistic regression ", scores.mean()))

('mean score for logistic regression ', 0.78787878787878773)


Now we can do everything we did above to our test data. After this process, we can be ready for submission!

In [16]:
# Load the test set and repeat the preprocessing applied to the training data.
titanic_test = pandas.read_csv("test.csv")

# Missing ages get the training-set median; missing fares get the test-set median.
train_age_median = titanic["Age"].median()
titanic_test["Age"] = titanic_test["Age"].fillna(train_age_median)
test_fare_median = titanic_test["Fare"].median()
titanic_test["Fare"] = titanic_test["Fare"].fillna(test_fare_median)

# male -> 0, female -> 1
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic_test.loc[titanic_test["Sex"] == sex_label, "Sex"] = sex_code

# Missing ports are treated as "S" before recoding S -> 0, C -> 1, Q -> 2.
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
for port_code, port_label in enumerate(("S", "C", "Q")):
    titanic_test.loc[titanic_test["Embarked"] == port_label, "Embarked"] = port_code

In [17]:
# Fit a fresh logistic regression on every row of the training data.
alg = LogisticRegression(random_state=1)
alg.fit(titanic[predictors], titanic["Survived"])

# Predict the outcome for each passenger in the test set.
predictions = alg.predict(titanic_test[predictors])

# Kaggle wants only two columns: the passenger id and the predicted outcome.
submission_columns = {
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions,
}
submission = pandas.DataFrame(submission_columns)
submission.to_csv("kaggle.csv", index=False)

In [18]:
# 3-fold cross-validated accuracy of the logistic regression, measured on the training data.
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Format into one string so the line prints as text rather than as a tuple when
# print is a statement.
print("%s %s" % ("mean score for logistic regression model ", scores.mean()))

('mean score for logistic regression model ', 0.78787878787878773)


0.787878787879 or 78.78% is the mean cross-validation score for this algorithm, computed on the training data (the test data is only used to produce the submission file).

For the first change in my model, I would be interested to see how another model affects the results. I think a linear fit is very limiting and the data might not be linear. It just doesn't feel right to capture the messiness that is this data with a linear fit. 

In [19]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Initialize the random forest with explicitly chosen parameters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score for each of the 3 cross validation folds.
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)

# Take the mean of the scores (there is one per fold). The message is formatted
# into a single string so it prints as text rather than as a tuple when print
# is a statement.
print("%s %s" % ("mean score for random forest model ", scores.mean()))

('mean score for random forest model ', 0.81930415263748602)


This 81.93% is a step up from the 78.78% before with Logistic Regression. This is probably due to the nature of the randomness of this random forest model and averaging the outcomes of all of the decision trees. I would be curious to play with the parameters and see what that does for the model.

In [20]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Initialize the random forest with hand-tuned parameters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=13, min_samples_split=8, min_samples_leaf=1)
# Compute the accuracy score for each of the 3 cross validation folds.
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)

# Take the mean of the scores (there is one per fold). The message is formatted
# into a single string so it prints as text rather than as a tuple when print
# is a statement.
print("%s %s" % ("mean score for random forrest model after fidding with parameters", scores.mean()))

('mean score for random forrest model after fidding with parameters', 0.82491582491582494)


In [21]:
# `predictions` still holds the logistic regression output from the earlier
# submission cell, and the random forest in `alg` has only been cross-validated,
# never used to predict the test set. Fit it on all of the training rows and
# predict the test passengers so this file really contains the forest's answers.
alg.fit(titanic[predictors], titanic["Survived"])
predictions = alg.predict(titanic_test[predictors])

# Keep only the two columns Kaggle wants: passenger id and predicted outcome.
submission = pandas.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })
submission.to_csv("kaggle_upload.csv", index=False)

After playing with the parameters, the best I could do is 82.49%, better than the 81.93% computed before. This is mainly due to the fact that there are now more decision trees (13 instead of 10) whose outcomes are averaged. The randomness prevents the algorithm from overfitting. Therefore, the more randomness and the more decision tree outcomes there are to average, the closer we get to the actual value.

When we translated the port names into numerical values in order to include the embankment ports as a part of our predictors, we said that S = 0, C = 1, and Q = 2. I think it would be interesting to change this to have three more columns where the first column would be S = 0 or 1, the second column would be C = 0 or 1, and the third column would be Q = 0 or 1. Currently, embarking at different ports provides different mathematical advantages. Embarking at port Q provides 2 whereas embarking at port S provides 0 points. These port values should not be multiples of each other. The ports should all be equal, this might contribute to overfitting. I think this would get us closer to the true fit. 

Previously we changed it so that under Embarkment, there would be 0, 1, or 2, corresponding to the Embarkment port. S = 0. C = 1. Q = 2. Now we will make new columns named "S", "C", and "Q" where the values under them will be 1 if that is the passenger's embarkment port and 0 if it is not their port. 

In [22]:
# Column names in the order of their integer port codes: S = 0, C = 1, Q = 2.
port_columns = ("S", "C", "Q")

# Mark each passenger's own port with a 1 in the matching column.
for port_code, port_column in enumerate(port_columns):
    titanic.loc[titanic["Embarked"] == port_code, port_column] = 1

# Whatever is still empty in these columns is "not this port": fill with 0.
for port_column in port_columns:
    titanic.loc[titanic[port_column].isnull(), port_column] = 0

In [23]:
# Show only the first rows to check the new S / C / Q columns instead of dumping the whole frame.
titanic.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,S,C,Q
0,1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.2500,,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,1,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.9250,,0,1,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1000,C123,0,1,0,0
4,5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.0500,,0,1,0,0
5,6,0,3,"Moran, Mr. James",0,28,0,0,330877,8.4583,,2,0,0,1
6,7,0,1,"McCarthy, Mr. Timothy J",0,54,0,0,17463,51.8625,E46,0,1,0,0
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2,3,1,349909,21.0750,,0,1,0,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27,0,2,347742,11.1333,,0,1,0,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14,1,0,237736,30.0708,,1,0,1,0


Now I will try including these ports in the predictors section and run it with both the LinearRegression function and the random forest function.

In [24]:
## Linear regression again, now with the S / C / Q indicator columns
# Feature columns fed to the model.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "S", "C", "Q"]

# Plain linear regression; it is refit from scratch on every fold below.
alg = LinearRegression()
# Three cross-validation folds over the row positions of `titanic`; iterating
# over `kf` yields (train positions, test positions) pairs.
# NOTE(review): shuffling is not requested here, so random_state probably has no
# effect on the splits -- confirm against the KFold documentation.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

features = titanic[predictors]
target = titanic["Survived"]

predictions = []
for train, test in kf:
    # Fit on the rows of the training folds only ...
    alg.fit(features.iloc[train, :], target.iloc[train])
    # ... then predict the held-out fold and keep its predictions.
    predictions.append(alg.predict(features.iloc[test, :]))

In [25]:
import numpy as np

# The predictions are in three separate numpy arrays (one per fold).
# Concatenate them on axis 0, their only axis, to get one value per passenger.
predictions = np.concatenate(predictions, axis=0)

# Map the real-valued regression outputs to the two possible outcomes (1 and 0).
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0

# Rows whose thresholded prediction equals the recorded outcome.
out = titanic["Survived"][titanic["Survived"] == predictions]
print(len(out))
print(len(titanic["Survived"]))

# Convert explicitly to float so the ratio is a true division even where
# int / int truncates (this replaces the math.floor() call that was only
# acting as an int -> float cast).
accuracy = float(len(out)) / len(titanic["Survived"])
# Build one string so the line prints the same way as a statement or a function call.
print("%s %s" % ("accuracy for linear regression ", accuracy))

702
891
accuracy for linear regression  0.787878787879


Interestingly enough with the embarkment change and the Linear Regression, the accuracy of this model now is 78.78%. This was once the accuracy with logistic regression with the initial data. Therefore, this is an improvement. 

In [26]:
# Reload the test set and repeat the preprocessing applied to the training data.
titanic_test = pandas.read_csv("test.csv")
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")

# The freshly read "Embarked" column holds the letters S / C / Q. Convert them to
# the integer codes used for the training data first; without this step the
# == 0 / 1 / 2 comparisons below never match and S, C and Q end up 0 for every
# test passenger.
titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

# One indicator column per port: start every row at 0 ("not this port"), then
# set 1 for the passengers whose port code matches the column.
for port_code, port_column in enumerate(("S", "C", "Q")):
    titanic_test[port_column] = 0
    titanic_test.loc[titanic_test["Embarked"] == port_code, port_column] = 1

In [27]:
# Feature columns, with the three port indicator columns in place of "Embarked".
predictors_updated = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "S", "Q", "C"]

# Fit a fresh logistic regression on every row of the training data.
alg = LogisticRegression(random_state=1)
alg.fit(titanic[predictors_updated], titanic["Survived"])

# Predict the outcome for each passenger in the test set.
predictions = alg.predict(titanic_test[predictors_updated])

# Kaggle wants only two columns: the passenger id and the predicted outcome.
submission_columns = {
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions,
}
submission = pandas.DataFrame(submission_columns)
submission.to_csv("kaggle.csv", index=False)

In [28]:
# 3-fold cross-validated accuracy of the logistic regression on the training data.
# Score with `predictors_updated`, the same feature list the model above was
# fitted on, rather than the separately defined `predictors` list.
scores = cross_validation.cross_val_score(alg, titanic[predictors_updated], titanic["Survived"], cv=3)
# Format into one string so the line prints as text rather than as a tuple when
# print is a statement.
print("%s %s" % ("mean score for logsitic regression with S, C, and Q embarkment columns", scores.mean()))

('mean score for logsitic regression with S, C, and Q embarkment columns', 0.78900112233445563)


With the logistic regression and the newly added embarkment columns, the new score is 78.9%. This is an increase from the original logistic regression with the initial embarkment column. 

In [29]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Random forest settings used for this run:
#   n_estimators      - how many trees are built
#   min_samples_split - fewest rows a node needs before it may be split
#   min_samples_leaf  - fewest samples allowed at the end of a branch
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=4, min_samples_leaf=2)

# One accuracy score per cross validation fold (3 folds), using the port indicator columns.
scores = cross_validation.cross_val_score(alg, titanic[predictors_updated], titanic["Survived"], cv=3)

# Report the average over the folds.
print("%s %s" % ("mean score of random forest model with new recodes", scores.mean()))

mean score of random forest model with new recodes 0.822671156004


In [30]:
# `predictions` still holds the logistic regression output from the earlier
# submission cell, and the random forest in `alg` has only been cross-validated,
# never used to predict the test set. Fit it on all of the training rows and
# predict the test passengers so this file really contains the forest's answers.
alg.fit(titanic[predictors_updated], titanic["Survived"])
predictions = alg.predict(titanic_test[predictors_updated])

# Keep only the two columns Kaggle wants: passenger id and predicted outcome.
submission = pandas.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })
submission.to_csv("kaggle_new.csv", index=False)

With the random forest model and the embarkment ports split into three new columns, the new mean score for this model, including these three new columns, is 82.26%. This is an improvement. Not only is the random forest model a great model, but also eliminating the excess weight from the port prevented slight overfitting. 