In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

In [2]:
spark = SparkSession.builder.appName('logreg').getOrCreate()

In [3]:
# Get Data
df = spark.read.csv('titanic.csv',header=True,inferSchema=True)

In [4]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
# Predict survivability (YES/NO) based on features
# Binary Classification!

In [7]:
myColumns = df.select(['Survived','Pclass','Sex','Age','SibSp',
                      'Parch','Fare','Embarked'])

In [8]:
# Extreme way of dealing with data!
myFinalData = myColumns.na.drop()

In [9]:
# Let us handle categorical columns!
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                                OneHotEncoder, StringIndexer)

In [23]:
# Index string columns
genderIndexer = StringIndexer(inputCol='Sex',
                              outputCol='SexIndex')
embarkIndexer = StringIndexer(inputCol='Embarked',
                              outputCol='EmbarkIndex')

In [24]:
# One-Hot Encode the Indexes to their actual Categories
# Column Data              -> A       B       C
# Through Indexing         -> 0       1       2
# Through One-Hot Encoding -> [1,0,0] [1,0,1] [1,1,0]

genderEncoder = OneHotEncoder(inputCol='SexIndex',
                              outputCol='SexVec')
embarkEncoder = OneHotEncoder(inputCol='EmbarkIndex',
                              outputCol='EmbarkVec')

In [25]:
# Create the Vector Assembler to transform your data for ML
assembler = VectorAssembler(inputCols=['Pclass','SexVec',
                                       'Age','SibSp',
                                       'Parch','Fare',
                                       'EmbarkVec'],
                            outputCol='features')

In [26]:
# Create a pipeline for your Machine Learning Model
from pyspark.ml import Pipeline

In [27]:
# Initiate the model
logReg = LogisticRegression(featuresCol='features',
                            labelCol='Survived')

In [28]:
# PIPELINE CREATION
pipeline = Pipeline(stages=[genderIndexer,embarkIndexer,
                            genderEncoder,embarkEncoder,
                            assembler,logReg])

In [29]:
# Split the data
train_data,test_data = myFinalData.randomSplit([0.7,0.3])

In [30]:
# Fit the Model to train_data
fitModel = pipeline.fit(train_data)

In [31]:
# Test the model on test_data
results = fitModel.transform(test_data)

In [42]:
results.select(['Survived','prediction']).show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [32]:
# EVALUATE YOUR MODEL
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [36]:
# 'prediction' is the default name of the column
eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                     labelCol='Survived')

In [43]:
# Area Under the Curve
AUC = eval.evaluate(results)

In [44]:
AUC # 76.4% Area Under Curve

0.7642325025292008