In [121]:
# Import required libraries
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import cross_validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


In [122]:
# Read in data file
train = pandas.read_csv('C:\\Users\\blcol\\Documents\\Titanic\\train.csv')

In [123]:
# Process data

# Fill in missing ages and fares with median by Pclass and Sex and Pclass, respectively
train['Age'] = train.groupby(['Pclass','Sex'])['Age'].apply(lambda x: x.fillna(x.mean()))
train['Fare'] = train.groupby(['Pclass'])['Fare'].apply(lambda x: x.fillna(x.mean()))

# Machine learning algorithms can only make sense of numeric data.
# Replace sex strings with numeric 0 for male and 1 for female
train.loc[train['Sex'] == 'male','Sex'] = 0
train.loc[train['Sex'] == 'female','Sex'] = 1

# Assumed nan Embarked values to be S (the most common)
train['Embarked'] = train['Embarked'].fillna('S')

# Replace embarked values S(0), C(1), Q(2)
train.loc[train['Embarked'] == 'S','Embarked'] = 0
train.loc[train['Embarked'] == 'C','Embarked'] = 1
train.loc[train['Embarked'] == 'Q','Embarked'] = 2

# Create new family size feature
train['Family_Size'] =  train[['Parch', 'SibSp']].sum(axis=1)

# The columns we'll use to predict the target
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
#predictors = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Family_Size']

In [124]:
# Select algorithm

# set random seed
rs = 42

# Initialize our algorithm classes
alg = LogisticRegression(random_state=rs)
alg2 = AdaBoostClassifier(random_state=rs)
alg3 = DecisionTreeClassifier(random_state=rs, max_depth=5)
alg4 = SVC(random_state=rs, kernel="linear", C=0.025)
alg5 = SVC(gamma=2, C=1)
alg6 = KNeighborsClassifier(3)
alg7 = GaussianProcessClassifier()
alg8 = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
alg9 = MLPClassifier(alpha=1)
alg10 = GaussianNB()
alg11 = QuadraticDiscriminantAnalysis()

# Compute the accuracy score for all the cross validation folds
scores = cross_validation.cross_val_score(alg, train[predictors], train['Survived'], cv=5)
scores2 = cross_validation.cross_val_score(alg2, train[predictors], train['Survived'], cv=5)
scores3 = cross_validation.cross_val_score(alg3, train[predictors], train['Survived'], cv=5)
scores4 = cross_validation.cross_val_score(alg4, train[predictors], train['Survived'], cv=5)
scores5 = cross_validation.cross_val_score(alg5, train[predictors], train['Survived'], cv=5)
scores6 = cross_validation.cross_val_score(alg6, train[predictors], train['Survived'], cv=5)
scores7 = cross_validation.cross_val_score(alg7, train[predictors], train['Survived'], cv=5)
scores8 = cross_validation.cross_val_score(alg8, train[predictors], train['Survived'], cv=5)
scores9 = cross_validation.cross_val_score(alg9, train[predictors], train['Survived'], cv=5)
scores10= cross_validation.cross_val_score(alg10, train[predictors], train['Survived'], cv=5)
scores11 = cross_validation.cross_val_score(alg11, train[predictors], train['Survived'], cv=5)

# Take the mean of the scores
print(scores.mean())
print(scores2.mean())
print(scores3.mean())
print(scores4.mean())
print(scores5.mean())
print(scores6.mean())
print(scores7.mean())
print(scores8.mean())
print(scores9.mean())
print(scores10.mean())
print(scores11.mean())

0.80137255048
0.820498995137
0.813719759684
0.786708747859
0.645309344287
0.709386950149
0.710560904068
0.814849561332
0.787907880985
0.786790491622
0.796884303673


In [110]:
# Tune the best algorithm
alg2 = AdaBoostClassifier(random_state=rs)
scores2 = cross_validation.cross_val_score(alg2, train[predictors], train['Survived'], cv=5)
print(scores2.mean())

# Review feature importance
alg2.fit(train[predictors],train['Survived'])
importances = alg2.feature_importances_
display(importances)

0.803676306047


array([ 0.06,  0.06,  0.38,  0.38,  0.02,  0.1 ])

In [111]:
# Read in Test File
test = pandas.read_csv('C:\\Users\\blcol\\Documents\\Titanic\\test.csv')

In [112]:
# Process Test data

# Fill nans
test['Age'] = test.groupby(['Pclass','Sex'])['Age'].apply(lambda x: x.fillna(x.mean()))
test['Fare'] = test.groupby(['Pclass'])['Fare'].apply(lambda x: x.fillna(x.mean()))

test.loc[test['Sex'] == 'male','Sex'] = 0
test.loc[test['Sex'] == 'female','Sex'] = 1

test['Embarked'].fillna('S')
test.loc[test['Embarked'] == 'S', 'Embarked'] = 0
test.loc[test['Embarked'] == 'C', 'Embarked'] = 1
test.loc[test['Embarked'] == 'Q', 'Embarked'] = 2

test['Family_Size'] =  test[['Parch', 'SibSp']].sum(axis=1)

In [113]:
## Make Predictions

# Train the algorithm using all the training data
alg2.fit(train[predictors],train['Survived'])

# Make predictions using the test set
predictions = alg2.predict(test[predictors])

# Create output of required columns
output = pandas.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})

In [114]:
output.to_csv('output.csv', index=False)

In [115]:
display(output)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
