In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.master("local[*]").appName("titanic").getOrCreate()

In [3]:
df = spark.read.csv("titanic.csv", inferSchema=True, header=True)

In [4]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [6]:
my_cols = df.select(['Survived',
 'Pclass','Sex',
 'Age',
 'SibSp',
 'Parch','Fare','Embarked'
])

In [7]:
my_final_data = my_cols.na.drop()

In [8]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [10]:
gender_indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
gender_encoder = OneHotEncoder(inputCol="SexIndex", outputCol="SexVec")

In [11]:
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')

In [12]:
assembler = VectorAssembler(inputCols=['Pclass',
 'SexVec',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'EmbarkVec'], outputCol='features')

In [16]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [15]:
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [17]:
pipeline = Pipeline(stages = [gender_indexer,
                    embark_indexer,
                    gender_encoder,
                    embark_encoder,
                    assembler,
                    log_reg_titanic])

In [18]:
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.7,.3])

In [19]:
fit_model = pipeline.fit(train_titanic_data)

In [20]:
results = fit_model.transform(test_titanic_data)

In [22]:
results.show()

+--------+------+------+----+-----+-----+-------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|SexIndex|EmbarkIndex|       SexVec|    EmbarkVec|            features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+-------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|  male|19.0|    3|    2|  263.0|       S|     0.0|        0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,19.0,3.0...|[0.52749301791695...|[0.62889820863787...|       0.0|
|       0|     1|  male|24.0|    0|    0|   79.2|       C|     0.0|        1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,24.0,0.0...|[-0.8442107182752...|[0.30064869985403...|       1.0|
|       0|     1|  male|31.0|    0|    0|50.4958|       S|     0.0|        0.0|(1,[0],[1.0

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [23]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="Survived")

In [24]:
AUC = my_eval.evaluate(results)

In [25]:
AUC

0.7946985446985447

In [26]:
spark.stop()