Titanic Decision Tree Model
----------
1. Read-in Training Data
2. Model Tree
3. Predict Training Set
4. Calculate MSE
5. Calculate Accuracy
6. Predict Test Set

In [None]:
# Load the Titanic training CSV (treating the first line as a header) and
# register it as the temp view "titanic" so it can be queried with SQL.
titanic = (
    spark.read
    .option("header", True)
    .csv("/data/training/titanic/train.csv")
)
titanic.createOrReplaceTempView("titanic")

# Display ten rows as a quick sanity check of the load.
spark.sql("SELECT * from titanic LIMIT 10").show()

In [None]:
from pyspark.mllib.regression import LabeledPoint
def num(s):
    """Convert ``s`` to a number, falling back to 0 when it is not numeric.

    ``int`` is tried first, then ``float``; the first conversion that
    succeeds is returned.  If both raise ``ValueError`` or ``TypeError``
    (for example ``None`` or a non-numeric string) the result is 0.
    """
    for convert in (int, float):
        try:
            return convert(s)
        except (ValueError, TypeError):
            continue
    return 0
#
def parse_train_data(x):
    """Turn one row of the training table into a ``LabeledPoint``.

    Fields are read by position: ``x[1]`` is the label (survived) and
    ``x[2]``, ``x[4]``, ``x[5]``, ``x[6]``, ``x[7]`` and ``x[9]`` supply
    pclass, sex, age, sibsp, parch and fare.  Numeric fields go through
    ``num``, so a missing or non-numeric value becomes 0; sex is encoded as
    1 for ``'male'`` and 0 for anything else.

    The passenger id (``x[0]``) and the cabin (``x[10]``, categorical) are
    deliberately not used as features.

    Returns:
        LabeledPoint(survived, [pclass, sex, age, sibsp, parch, fare])
    """
    survived = num(x[1])
    pclass = num(x[2])
    sex = 1 if x[4] == 'male' else 0
    age = num(x[5])
    sibsp = num(x[6])
    parch = num(x[7])
    fare = num(x[9])
    return LabeledPoint(survived, [pclass, sex, age, sibsp, parch, fare])

In [None]:
# Parse every row of the "titanic" table into a LabeledPoint.
# NOTE: despite the df_ prefix this is an RDD (result of .rdd.map), not a DataFrame.
df_train = spark.table("titanic").rdd.map(parse_train_data)

In [None]:
# Number of parsed training rows.
df_train.count()

In [None]:
# Inspect three parsed LabeledPoints to check the parsing.
df_train.take(3)

### Routines to separate the labels & features out of a LabeledPoint dataset

In [None]:
def get_label(x):
    """Return the ``label`` attribute of a labelled point ``x``."""
    label = x.label
    return label
#
def get_features(x):
    """Return the ``features`` attribute of a labelled point ``x``."""
    features = x.features
    return features

In [None]:
from numpy import array
# Split the training set into two RDDs: one of labels, one of feature vectors.
pass_labels = df_train.map(get_label)
pass_features = df_train.map(get_features)

### Train a DecisionTree Model

In [None]:
from pyspark.mllib.tree import DecisionTree
# Fit a two-class decision tree on the labelled training points.  The empty
# categoricalFeaturesInfo declares no feature as categorical.
model = DecisionTree.trainClassifier(
    df_train,
    numClasses=2,
    categoricalFeaturesInfo={},
)

### Let us see how the model looks

In [None]:
# Print the model's own string form, then its debug string
# (presumably the full tree structure -- see DecisionTreeModel.toDebugString).
print(model)
print(model.toDebugString())

In [None]:
def get_label(x):
    """Return the ``label`` attribute of a labelled point ``x``.

    Same behaviour as the ``get_label`` defined earlier in this notebook.
    """
    label = x.label
    return label
#
def get_features(x):
    """Return the ``features`` attribute of a labelled point ``x``.

    Same behaviour as the ``get_features`` defined earlier in this notebook.
    """
    features = x.features
    return features

In [None]:
# Both RDDs are mapped from df_train, so the two counts should match.
print(pass_labels.count())
print(pass_features.count())

### How good is our model on the training set?

### Predict the training set

In [None]:
# Run the fitted model over every training feature vector, then count the
# resulting predictions.
predictions = model.predict(pass_features)
predictions.count()

### Zip & calculate MSE

In [None]:
# Pair each true label with the model's prediction for the same row.
# Both sides are derived from df_train by map, which keeps the partitioning
# aligned as RDD.zip requires.
labelsAndPredictions = (
    df_train
    .map(lambda point: point.label)
    .zip(model.predict(pass_features))
)

In [None]:
# Mean squared error on the training set: sum of squared
# (label - prediction) differences divided by the number of training rows.
trainMSE = (
    labelsAndPredictions
    .map(lambda pair: (pair[0] - pair[1]) ** 2)
    .sum()
    / float(df_train.count())
)
print(trainMSE)

## Calculate Accuracy

In [None]:
from operator import add
def seqOp(acc, x):
    """Add 1 to the running tally when label ``x[0]`` equals prediction ``x[1]``."""
    return acc + (x[0] == x[1])


# Fold the (label, prediction) pairs into the number of correct predictions;
# per-partition tallies are combined with `add`.
train_correct = labelsAndPredictions.aggregate(0, seqOp, add)
# Accuracy = correct predictions / total training rows, printed as a percentage.
accuracy = train_correct / float(df_train.count())
print('%2.2f%s' % (accuracy*100,'%'))

Time to Visit the Test data & predict using the Model
---

In [None]:
# Load the Titanic test CSV (treating the first line as a header) and
# register it as the temp view "titanic_test".
titanic_test = (
    spark.read
    .option("header", True)
    .csv("/data/training/titanic/test.csv")
)
titanic_test.createOrReplaceTempView("titanic_test")

# Display ten rows as a quick sanity check of the load.
spark.sql("SELECT * from titanic_test LIMIT 10").show()

## Map through a row RDD and return the appropriate features as a LabeledPoint

In [None]:
from pyspark.mllib.regression import LabeledPoint
def num(s):
    """Convert ``s`` to a number, falling back to 0 when it is not numeric.

    ``int`` is tried first, then ``float``; the first conversion that
    succeeds is returned.  If both raise ``ValueError`` or ``TypeError``
    (for example ``None`` or a non-numeric string) the result is 0.
    """
    for convert in (int, float):
        try:
            return convert(s)
        except (ValueError, TypeError):
            continue
    return 0
#
def parse_test_data(x):
    """Turn one row of the test table into a ``LabeledPoint``.

    The label slot does not hold a target here: it carries the passenger id
    (``int(x[0])``), which is needed at the end for the submission.

    Features are read by position -- ``x[1]`` pclass, ``x[3]`` sex,
    ``x[4]`` age, ``x[5]`` sibsp, ``x[6]`` parch, ``x[8]`` fare.  Numeric
    fields go through ``num`` (0 when missing or non-numeric); sex is 1 for
    ``'male'`` and 0 otherwise.
    """
    pass_id = int(x[0])
    pclass = num(x[1])
    sex = 1 if x[3] == 'male' else 0
    # Remaining numeric features, in order: age, sibsp, parch, fare.
    age, sibsp, parch, fare = num(x[4]), num(x[5]), num(x[6]), num(x[8])
    return LabeledPoint(pass_id, [pclass, sex, age, sibsp, parch, fare])

### Parse the test dataset to LabeledPoint

In [None]:
# Parse every row of the "titanic_test" table into a LabeledPoint whose
# label is the passenger id.
df_test = spark.table("titanic_test").rdd.map(parse_test_data)

In [None]:
# Inspect one parsed test point (label = passenger id).
df_test.take(1)

## Extract the features & Predict using Model

In [None]:
# Feature vectors only; the label slot of df_test holds the passenger id.
test_features = df_test.map(get_features)
# Predict, then collapse the result into a single partition -- the test rows
# are coalesced the same way before the two are zipped for the submission.
test_pred = model.predict(test_features).coalesce(1)

In [None]:
# Number of predictions produced for the test set.
test_pred.count()

In [None]:
# Number of parsed test rows; should equal the prediction count.
df_test.count()

In [None]:
# Inspect one prediction.
test_pred.take(1)

In [None]:
# Build (passenger id, predicted class) pairs for the submission.  The test
# rows are coalesced to one partition, as test_pred was, before zipping.
sub_csv = df_test.coalesce(1)
sub_csv_1 = (
    sub_csv
    .map(lambda point: int(point.label))
    .zip(test_pred.map(num))
)


In [None]:
# Number of (passenger id, prediction) pairs.
sub_csv_1.count()

In [None]:
# Inspect three (passenger id, prediction) pairs.
sub_csv_1.take(3)

In [None]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *

# Submission layout: two non-nullable integer columns.
schema = StructType([
    StructField("PassengerId", IntegerType(), False),
    StructField("Survived", IntegerType(), False),
])
# Build the DataFrame through the SparkSession (`spark`) used by every other
# cell; no `sqlContext` object is created in the visible notebook.
s_df = spark.createDataFrame(sub_csv_1, schema)
# createOrReplaceTempView replaces the deprecated registerTempTable and
# matches how the "titanic" and "titanic_test" views are registered.
s_df.createOrReplaceTempView("SolutionThree")

In [None]:
# Display the submission rows from the "SolutionThree" table.
spark.sql("SELECT * FROM SolutionThree").show()


In [None]:
# another way to compute our "performance" regarding this model
from pyspark.mllib.evaluation import BinaryClassificationMetrics
# BinaryClassificationMetrics expects (score, label) pairs, whereas
# labelsAndPredictions holds (label, prediction).  Swap each pair so the
# prediction is used as the score and the true label as the label.
scoreAndLabels = labelsAndPredictions.map(lambda lp: (float(lp[1]), float(lp[0])))

# Instantiate metrics object
metrics = BinaryClassificationMetrics(scoreAndLabels)

# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)


## If you were to submit this to Kaggle,
## you'd get a score of ~0.775124 (rank: 1147)

### Would Random Forest do any better?

## Coding Exercise #31
### 1. Model RandomForest
### 2. Predict Test Set
### 3. Compute "Performance"
### 4. Did we do any better?