# Classification of Titanic Data

Predict Survival of passengers on the Titanic
Reference: https://ww2.amstat.org/publications/jse/v3n3/datasets.dawson.html

In [51]:
# Machine Learning Library of PySpark
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row, SQLContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils

import os
import sys
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *

from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Library for confusion matrix, precision, test error
from pyspark.mllib.evaluation import MulticlassMetrics
# Library For Area under ROC curve and Area under precision-recall curve
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Assign resources to the application
sqlContext = SQLContext(sc)


The data will be loaded into an array.
This is the summary of the data structure, including the column position and name.
The first filed starts from position 0. 

0 Name    -  Passenger first and last name.

1 PClass  -  Passenger class (1st, 2nd, or 3rd)

2 Age

3 Sex

4 Survived -  1 if the passenger survived;  0 if the passenger did not survive

5 PersonID

Label is a target variable.  PersonInfo is a list of independent variables besides unique identifier

Load the titanic csv file into a spark dataframe

In [52]:

import ibmos2spark

# @hidden_cell
credentials = {
    'auth_url': 'https://identity.open.softlayer.com',
    'project_id': '7984a968f14449858d826f8ba838fbb3',
    'region': 'dallas',
    'user_id': '51e14119a08842f0a9311ea21a5debfc',
    'username': 'member_c8a299d593b1d03a3e131a4df7aec8a60c5f34ce',
    'password': 'DWy^2pn8K=WMrUIu'
}

configuration_name = 'os_85a89ae392d2473988f606b7870013f9_configs'
bmos = ibmos2spark.bluemix(sc, credentials, configuration_name)

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df_data_2 = spark.read\
  .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
  .option('header', 'true')\
  .load(bmos.url('DefaultProjectagarner18studentumucedu', 'Titanic.csv'))
df_data_2.take(5)


[Row(Name=u'Allen Miss Elisabeth Walton', PClass=u'1st', Age=u'29', Sex=u'female', Survived=u'1', PersonID=u'1'),
 Row(Name=u'Allison Miss Helen Loraine', PClass=u'1st', Age=u'2', Sex=u'female', Survived=u'0', PersonID=u'2'),
 Row(Name=u'Allison Mr Hudson Joshua Creighton', PClass=u'1st', Age=u'30', Sex=u'male', Survived=u'0', PersonID=u'3'),
 Row(Name=u'Allison Mrs Hudson JC (Bessie Waldo Daniels)', PClass=u'1st', Age=u'25', Sex=u'female', Survived=u'0', PersonID=u'4'),
 Row(Name=u'Allison Master Hudson Trevor', PClass=u'1st', Age=u'0.92', Sex=u'male', Survived=u'1', PersonID=u'5')]

Look at the data frame schema

In [53]:
df_data_2.printSchema()

root
 |-- Name: string (nullable = true)
 |-- PClass: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- PersonID: string (nullable = true)



In [54]:
from pyspark.ml.feature import StringIndexer

indexer_PClass = StringIndexer(inputCol="PClass", outputCol="PClassIndex")
indexed = indexer_PClass.fit(df_data_2).transform(df_data_2)
indexer_Sex = StringIndexer(inputCol="Sex", outputCol="SexIndex")
indexed_2 = indexer_Sex.fit(indexed).transform(indexed)
indexed_2.show()

+--------------------+------+----+------+--------+--------+-----------+--------+
|                Name|PClass| Age|   Sex|Survived|PersonID|PClassIndex|SexIndex|
+--------------------+------+----+------+--------+--------+-----------+--------+
|Allen Miss Elisab...|   1st|  29|female|       1|       1|        1.0|     1.0|
|Allison Miss Hele...|   1st|   2|female|       0|       2|        1.0|     1.0|
|Allison Mr Hudson...|   1st|  30|  male|       0|       3|        1.0|     0.0|
|Allison Mrs Hudso...|   1st|  25|female|       0|       4|        1.0|     1.0|
|Allison Master Hu...|   1st|0.92|  male|       1|       5|        1.0|     0.0|
|   Anderson Mr Harry|   1st|  47|  male|       1|       6|        1.0|     0.0|
|Andrews Miss Korn...|   1st|  63|female|       1|       7|        1.0|     1.0|
|Andrews Mr Thomas jr|   1st|  39|  male|       0|       8|        1.0|     0.0|
|Appleton Mrs Edwa...|   1st|  58|female|       1|       9|        1.0|     1.0|
|Artagaveytia Mr R...|   1st

In [55]:
data = indexed_2['PClassIndex', 'Age', 'SexIndex', 'Survived']

In [56]:
data.show()

+-----------+----+--------+--------+
|PClassIndex| Age|SexIndex|Survived|
+-----------+----+--------+--------+
|        1.0|  29|     1.0|       1|
|        1.0|   2|     1.0|       0|
|        1.0|  30|     0.0|       0|
|        1.0|  25|     1.0|       0|
|        1.0|0.92|     0.0|       1|
|        1.0|  47|     0.0|       1|
|        1.0|  63|     1.0|       1|
|        1.0|  39|     0.0|       0|
|        1.0|  58|     1.0|       1|
|        1.0|  71|     0.0|       0|
|        1.0|  47|     0.0|       0|
|        1.0|  19|     1.0|       1|
|        1.0|  50|     1.0|       1|
|        1.0|  24|     0.0|       0|
|        1.0|  36|     0.0|       0|
|        1.0|  37|     0.0|       1|
|        1.0|  47|     1.0|       1|
|        1.0|  26|     0.0|       1|
|        1.0|  25|     0.0|       0|
|        1.0|  25|     0.0|       1|
+-----------+----+--------+--------+
only showing top 20 rows



Convert from string to double type

In [57]:
from pyspark.sql.functions import col  # for indicating a column using a string in the line below
data = data.select([col(c).cast("double").alias(c) for c in data.columns])

Validate that the type is double

In [58]:
data.printSchema()

root
 |-- PClassIndex: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- SexIndex: double (nullable = true)
 |-- Survived: double (nullable = true)



Find correlations between variables

In [84]:
from pyspark.mllib.stat import Statistics
age_survived=data.stat.corr('Age', 'Survived')
age_sex=data.stat.corr('Age', 'SexIndex')
age_pclass=data.stat.corr('Age', 'PClassIndex')

print("Correlation with age:")
print("survived: "+ str(age_survived))
print("sex: "+ str(age_sex))
print("pclass: "+ str(age_pclass))

Correlation with age:
survived: -0.0612539137238
sex: -0.0551378307708
pclass: 0.135561650597


In [86]:
sex_survived=data.stat.corr('SexIndex', 'Survived')
sex_age=data.stat.corr('SexIndex', 'Age')
sex_pclass=data.stat.corr('SexIndex', 'PClassIndex')

print("Correlation with sex:")
print("survived: "+ str(sex_survived))
print("age: "+ str(sex_age))
print("pclass: "+ str(sex_pclass))

Correlation with sex:
survived: 0.540627642826
age: -0.0551378307708
pclass: 0.077151133994


In [88]:
pclass_survived=data.stat.corr('PClassIndex', 'Survived')
pclass_age=data.stat.corr('PClassIndex', 'Age')
pclass_sex=data.stat.corr('PClassIndex', 'SexIndex')

print("Correlation with pclass:")
print("survived: "+ str(pclass_survived))
print("age: "+ str(pclass_age))
print("sex: "+ str(pclass_sex))

Correlation with pclass:
survived: 0.201335044942
age: 0.135561650597
sex: 0.077151133994


In [59]:
  # Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = data.randomSplit([0.95, 0.5])

Convert the spark dataframe into a labeledpoint rdd

In [60]:
trainingData  = trainingData.rdd.map(lambda x: LabeledPoint(x[3], x[0:2]))
testData = testData.rdd.map(lambda x: LabeledPoint(x[3], x[0:2]))

# Simple Decision Tree Model

Train a decision tree model 

In [61]:
# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
from pyspark.mllib.tree import DecisionTree
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},impurity='gini', maxDepth=10, maxBins=32)

Get accuracy measures for the decision tree 

In [62]:
# Evaluate decision tree model on training instances 
predictions = model.predict(trainingData.map(lambda x: x.features))
labelsAndPredictions = trainingData.map(lambda lp: lp.label).zip(predictions)
trainErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(trainingData.count())
trainAcc = labelsAndPredictions.filter(lambda (v, p): v == p).count() / float(trainingData.count())
print('Training Error = ' + str(trainErr))
print('Training Accuracy = ' + str(trainAcc))
print('Learned classification random forest model:')
print(model.toDebugString())

Training Error = 0.247035573123
Training Accuracy = 0.752964426877
Learned classification random forest model:
DecisionTreeModel classifier of depth 10 with 119 nodes
  If (feature 0 <= 0.0)
   If (feature 1 <= 7.0)
    If (feature 1 <= 2.0)
     Predict: 1.0
    Else (feature 1 > 2.0)
     Predict: 1.0
   Else (feature 1 > 7.0)
    If (feature 1 <= 60.0)
     If (feature 1 <= 23.0)
      If (feature 1 <= 13.0)
       Predict: 0.0
      Else (feature 1 > 13.0)
       If (feature 1 <= 18.0)
        If (feature 1 <= 17.0)
         Predict: 0.0
        Else (feature 1 > 17.0)
         Predict: 0.0
       Else (feature 1 > 18.0)
        If (feature 1 <= 21.0)
         If (feature 1 <= 19.0)
          Predict: 0.0
         Else (feature 1 > 19.0)
          If (feature 1 <= 20.0)
           Predict: 0.0
          Else (feature 1 > 20.0)
           Predict: 0.0
        Else (feature 1 > 21.0)
         If (feature 1 <= 22.0)
          Predict: 0.0
         Else (feature 1 > 22.0)
          Pre

In [63]:
# Evaluate decision tree model on test instances 
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
testAcc = labelsAndPredictions.filter(lambda (v, p): v == p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Test Accuracy = ' + str(testAcc))

Test Error = 0.376
Test Accuracy = 0.624


# Random Forest Model

Build a random forest model

In [64]:
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model2 = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},numTrees=8000, featureSubsetStrategy='all',impurity='gini', maxDepth=25, maxBins=15)

Get accuracy measures for random forest model on the training set

In [65]:
# Evaluate random forest model on training instances and compute test error
predictions2 = model2.predict(trainingData.map(lambda x: x.features))
labelsAndPredictions2 = trainingData.map(lambda lp: lp.label).zip(predictions2)
trainErr2 = labelsAndPredictions2.filter(lambda (v, p): v != p).count() / float(trainingData.count())
trainAcc2 = labelsAndPredictions2.filter(lambda (v, p): v == p).count() / float(trainingData.count())
print('Training Error = ' + str(trainErr2))
print('Training Accuracy = ' + str(trainAcc2))
print('Learned classification random forest model:')

Training Error = 0.292490118577
Training Accuracy = 0.707509881423
Learned classification random forest model:


In [66]:
# Evaluate random forest model on test instances 
predictions = model2.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr2 = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
testAcc2 = labelsAndPredictions.filter(lambda (v, p): v == p).count() / float(testData.count())
print('Test Error = ' + str(testErr2))
print('Test Accuracy = ' + str(testAcc2))

Test Error = 0.348
Test Accuracy = 0.652


# Gradient Boosting Trees Model

build the gradient boosting model 

In [67]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
# Train a GradientBoostedTrees model.
model3 = GradientBoostedTrees.trainClassifier(trainingData,categoricalFeaturesInfo={}, numIterations=300)

Evaluate perforamance of gradient boosting model

In [68]:
# Evaluate gradient boosting on training instances and compute test error
predictions3 = model3.predict(trainingData.map(lambda x: x.features))
labelsAndPredictions3 = trainingData.map(lambda lp: lp.label).zip(predictions3)
trainErr3 = labelsAndPredictions3.filter(lambda (v, p): v != p).count() / float(trainingData.count())
trainAcc3 = labelsAndPredictions3.filter(lambda (v, p): v == p).count() / float(trainingData.count())
print('Training Error = ' + str(trainErr3))
print('Training Accuracy = ' + str(trainAcc3))

Training Error = 0.243083003953
Training Accuracy = 0.756916996047


In [69]:
# Evaluate gradient boosting model on test instances 
predictions = model3.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr3 = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
testAcc3 = labelsAndPredictions.filter(lambda (v, p): v == p).count() / float(testData.count())
print('Test Error = ' + str(testErr3))
print('Test Accuracy = ' + str(testAcc3))

Test Error = 0.356
Test Accuracy = 0.644


# Support Vector Machine

In [70]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
model5 = SVMWithSGD.train(trainingData, iterations=8000)

In [71]:
# Evaluate svm model on training instances
predictions5 = model5.predict(trainingData.map(lambda x: x.features))
labelsAndPredictions5 = trainingData.map(lambda lp: lp.label).zip(predictions5)
trainErr5 = labelsAndPredictions5.filter(lambda (v, p): v != p).count() / float(trainingData.count())
trainAcc5 = labelsAndPredictions5.filter(lambda (v, p): v == p).count() / float(trainingData.count())
print('Training Error = ' + str(trainErr5))
print('Training Accuracy = ' + str(trainAcc5))

Training Error = 0.381422924901
Training Accuracy = 0.618577075099


In [72]:
# Evaluate svm model on test instances 
predictions = model5.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr5 = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
testAcc5 = labelsAndPredictions.filter(lambda (v, p): v == p).count() / float(testData.count())
print('Test Error = ' + str(testErr5))
print('Test Accuracy = ' + str(testAcc5))

Test Error = 0.416
Test Accuracy = 0.584


# Compare Model Accuracies

In [75]:
# Compare Results of all models
print('Decision Tree: Training = ' + str(trainAcc) + ' Test = '+ str(testAcc))
print('Random Forest: Training = ' + str(trainAcc2) + ' Test = '+ str(testAcc2))
print('Gradient Boost: Training = ' + str(trainAcc3) + ' Test = '+ str(testAcc3))
print('Support Vector: Training = ' + str(trainAcc5) + ' Test = '+ str(testAcc5))

Decision Tree: Training = 0.752964426877 Test = 0.624
Random Forest: Training = 0.707509881423 Test = 0.652
Gradient Boost: Training = 0.756916996047 Test = 0.644
Support Vector: Training = 0.618577075099 Test = 0.584
