In [20]:
from pyspark.sql import SparkSession

In [49]:
spark = SparkSession.builder.appName('codealong').getOrCreate()

In [50]:
df_titanic = spark.read.csv('titanic.csv',header = True,inferSchema = True)

In [51]:
df_titanic.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [52]:
df_titanic.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [53]:
#get the columns needed for training. only the relevant cols that is going to affect training
my_cols = df_titanic.select(['Survived',
 'Pclass','Sex',
 'Age',
 'SibSp',
 'Parch', 'Fare',
 'Embarked'])

In [54]:
#drop missing data
my_cols = my_cols.na.drop(how = 'any')

In [55]:
#dealing with categorical data
from pyspark.ml.feature import StringIndexer,VectorAssembler,OneHotEncoder

In [56]:
# index then one hot encode the gender values
gender_indexer = StringIndexer(inputCol = 'Sex',outputCol = 'sex_indexed')
gender_encoder = OneHotEncoder(inputCol = 'sex_indexed',outputCol = 'sex_vec')

In [57]:
#index then one hot encode the embark values
embark_indexer = StringIndexer(inputCol = 'Embarked',outputCol = 'emabrked_indexed')
embark_encoder = OneHotEncoder(inputCol = 'emabrked_indexed',outputCol = 'emabrked_vec')

In [58]:
# features is the default output col name
aseembler = VectorAssembler(inputCols = ['Pclass','sex_vec','emabrked_vec','Age',
                                         'SibSp','Parch','Fare'],outputCol = 'features')

In [59]:
#create our pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [60]:
#create the logostic regression object 
logr_reg_titanic = LogisticRegression(featuresCol = 'features', labelCol = 'Survived')

In [61]:
#create the pipeline
pipeline = Pipeline(stages = [gender_indexer,embark_indexer,
                              gender_encoder,embark_encoder,
                              aseembler,logr_reg_titanic])

In [62]:
#split into training and testing 
train_data,test_data = my_cols.randomSplit([0.7,0.3])

In [63]:
train_data.columns

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [64]:
#run the pipeline
fitted_model=pipeline.fit(train_data)

In [67]:
results = fitted_model.transform(test_data)

In [68]:
results.select(['prediction','survived'])

DataFrame[Survived: int, Pclass: int, Sex: string, Age: double, SibSp: int, Parch: int, Fare: double, Embarked: string, sex_indexed: double, emabrked_indexed: double, sex_vec: vector, emabrked_vec: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [70]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [71]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol = 'prediction',
                                          labelCol = 'Survived')

In [72]:
AUC = evaluator.evaluate(results)

In [73]:
AUC

0.7817039211259126