In [0]:
import pyspark

In [0]:
# Pull every column of `titanic_csv` into a DataFrame via Spark SQL.
titanic_query = "select * from titanic_csv"
df = spark.sql(titanic_query)

In [0]:
# Inspect the column names and types as loaded, before any casting is applied.
df.printSchema()

In [0]:
from pyspark.sql.types import *

# Cast each column to its numeric type: the count/id-like fields become
# IntegerType and Fare becomes FloatType. Columns not listed are left as loaded.
# NOTE(review): an integer cast cannot keep fractional values in Age — confirm
# that this is intended.
target_types = {
    "PassengerId": IntegerType(),
    "Survived": IntegerType(),
    "Pclass": IntegerType(),
    "Age": IntegerType(),
    "SibSp": IntegerType(),
    "Parch": IntegerType(),
    "Fare": FloatType(),
}
for col_name, spark_type in target_types.items():
    # withColumn on an existing name replaces that column in place.
    df = df.withColumn(col_name, df[col_name].cast(spark_type))

# Show the schema again to confirm the casts took effect.
df.printSchema()

In [0]:
# List the available column names; used to choose the modelling columns below.
df.columns

In [0]:
# Keep only the label (Survived) plus the columns used as model inputs.
selected_columns = [
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]
my_cols = df.select(selected_columns)

In [0]:
# Discard rows with missing values in the selected columns
# (DataFrame.dropna is the alias of DataFrame.na.drop).
final_data = my_cols.dropna()

In [0]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,StringIndexer,OneHotEncoder)

In [0]:
# Sex: string label -> numeric index -> one-hot vector.
gender_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
gender_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec',
)

# Embarked: same two-step encoding as Sex.
Embarked_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='EmbarkedIndex',
)
Embarked_encoder = OneHotEncoder(
    inputCol='EmbarkedIndex',
    outputCol='EmbarkedVec',
)

In [0]:
# Combine the numeric columns and the one-hot vectors into a single
# 'features' vector column, in the order listed here.
feature_columns = [
    'Pclass',
    'SexVec',
    'EmbarkedVec',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
from pyspark.ml import Pipeline

In [0]:
# Logistic regression that reads the assembled 'features' vector and is
# trained against the 'Survived' label.
log_reg_titanic = LogisticRegression(
    featuresCol='features',
    labelCol='Survived',
)

In [0]:
# Stages run in list order: both indexers first, then the encoders that
# consume their output, then feature assembly, and finally the classifier.
pipeline_stages = [
    gender_indexer,
    Embarked_indexer,
    gender_encoder,
    Embarked_encoder,
    assembler,
    log_reg_titanic,
]
pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# 70/30 train/test split. A fixed seed makes the sampling repeatable, so the
# fitted model and the reported metric do not change from one run to the next.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Fit every pipeline stage (indexers, encoders, assembler, classifier) on the
# training split only; the result is a fitted pipeline model.
fit_model=pipeline.fit(train_data)

In [0]:
# Apply the fitted pipeline to the held-out split to obtain its predictions.
results=fit_model.transform(test_data)

In [0]:
# Preview the scored test rows (show() prints only the first rows).
results.show()

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Evaluate with area under the ROC curve. The evaluator must be given the
# model's continuous score column ('rawPrediction', which LogisticRegression
# emits by default), not the hard 0/1 'prediction' column: thresholded labels
# collapse the ROC curve to a single operating point and misstate the AUC.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived',
    metricName='areaUnderROC',  # library default, stated explicitly for readers
)

In [0]:
# Show the true label next to the predicted label for a quick visual check.
label_vs_prediction = results.select('Survived', 'prediction')
label_vs_prediction.show()

In [0]:
# Compute the evaluator's metric on the scored test rows. When no metricName
# is set, BinaryClassificationEvaluator reports areaUnderROC.
AUC=my_eval.evaluate(results)
# Bare expression so the notebook displays the value as the cell output.
AUC