In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

# Reuse the active Spark session if there is one; otherwise create one named 'LogReg'.
spark = SparkSession.builder.appName('LogReg').getOrCreate()

# Read the CSV, treating the first row as column names and letting Spark
# infer each column's type from the data.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/titanic.csv')
)

# Print the inferred column names and types.
data.printSchema()


In [2]:
# Narrow the loaded frame to the label column ('Survived') and the candidate
# input columns. 'Name' is kept here although the feature assembler defined
# later in this notebook does not list it.
my_cols = data.select(
    'Survived',
    'Pclass',
    'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
)

In [3]:
from pyspark.ml.feature import (
    VectorAssembler,
    VectorIndexer,
    OneHotEncoder,
    StringIndexer,
)

# Discard every row that has a null in any of the selected columns.
final_data = my_cols.dropna()

# 'Sex': string category -> numeric index ('SexIndex') -> one-hot vector ('SexVec').
gender_indexer = StringIndexer(outputCol='SexIndex', inputCol='Sex')
gender_encode = OneHotEncoder(outputCol='SexVec', inputCol='SexIndex')

# 'Embarked': the same two-step encoding, producing 'EmbarkIndex' then 'EmbarkVec'.
embark_indexer = StringIndexer(outputCol='EmbarkIndex', inputCol='Embarked')
embark_encode = OneHotEncoder(outputCol='EmbarkVec', inputCol='EmbarkIndex')





In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

# Combine the numeric columns and the two one-hot vectors into a single
# vector column named 'feature'.
assembler = VectorAssembler(
    inputCols=['Pclass', 'SexVec', 'EmbarkVec', 'Age', 'SibSp', 'Parch', 'Fare'],
    outputCol='feature',
)

# Logistic regression reading the assembled vector and predicting 'Survived'.
log_reg_titanic = LogisticRegression(labelCol='Survived', featuresCol='feature')

# Stage order matters: index the strings, one-hot encode the indices,
# assemble the feature vector, then fit the classifier.
pipeline = Pipeline(stages=[
    gender_indexer,
    embark_indexer,
    gender_encode,
    embark_encode,
    assembler,
    log_reg_titanic,
])


In [5]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 70/30 train/test split. The fixed seed makes the split (and therefore every
# metric computed below) repeatable across runs on the same input data and
# partitioning; without it each run scores a different test set.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

# Fit all pipeline stages on the training rows only, then apply the fitted
# pipeline to the held-out rows to obtain predictions.
fit_model = pipeline.fit(train_data)
result = fit_model.transform(test_data)

# Evaluate on the model's continuous 'rawPrediction' scores rather than the
# hard 0/1 'prediction' labels. Feeding thresholded labels to the evaluator
# collapses the ROC curve to a single operating point, so the reported area
# is not the classifier's real ranking quality.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived',
    metricName='areaUnderROC',
)



In [6]:
# Preview the true label next to the predicted label for the scored rows.
result.select('Survived', 'prediction').show()

In [7]:
# Score the transformed test rows with the evaluator configured earlier.
AUC = my_eval.evaluate(dataset=result)

# Bare expression so the notebook displays the score as the cell output.
AUC