In [1]:
# Load every column of the `titanic` table into a Spark DataFrame.
# `spark` is not created in the visible cells; it is expected to be an
# already-running SparkSession provided by the runtime.
df = spark.sql(
    'select* from titanic'
)

In [2]:
# Preview the first 5 rows of the raw table.
df.show(n=5)

In [3]:
# Print the column names and data types of the raw table.
df.printSchema()

In [4]:
# List the available column names (used to choose the modelling columns below).
df.columns

In [5]:
# Keep only the label ('Survived') and the columns used as predictors.
df2 = df.select(
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
)

In [6]:
# Preview the first 5 rows of the selected columns.
df2.show(n=5)

In [7]:
# Drop every row that has a missing value in any of the selected columns.
final_data = df2.dropna()

In [8]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

In [9]:
# Map the string column 'Sex' to a numeric category index ('SexIndex').
gender_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
# One-hot encode the index into dummy variables ('SexVec'); dropLast=True
# omits the last category from the encoded vector.
gender_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec',
    dropLast=True,
)

In [10]:
# Map the string column 'Embarked' to a numeric category index ('EmbarkIndex').
embark_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='EmbarkIndex',
)
# One-hot encode the index into dummy variables ('EmbarkVec'); dropLast=True
# omits the last category from the encoded vector.
embark_encoder = OneHotEncoder(
    inputCol='EmbarkIndex',
    outputCol='EmbarkVec',
    dropLast=True,
)

In [11]:
# Combine the predictor columns into the single 'features' vector column that
# the classifier consumes.
# 'Survived' is deliberately NOT listed here: it is the label column, and
# putting the label into the feature vector leaks the target into the model,
# making any evaluation metric meaninglessly optimistic.
assembler = VectorAssembler(
    inputCols=['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'SexVec', 'EmbarkVec'],
    outputCol='features',
)

In [12]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [13]:
# Logistic regression that reads the assembled 'features' vector and learns
# to predict the 'Survived' label.
log_reg = LogisticRegression(
    featuresCol='features',
    labelCol='Survived',
)

In [14]:
# Build the pipeline.
# `stages` lists, in execution order, every step to run: index the string
# columns, one-hot encode the indices, assemble the feature vector, then fit
# the classifier.
pipeline = Pipeline(
    stages=[
        gender_indexer,
        embark_indexer,
        gender_encoder,
        embark_encoder,
        assembler,
        log_reg,
    ]
)

In [15]:
# Split the cleaned data roughly 70% / 30% into training and test sets.
# A fixed seed is passed so that re-running the notebook draws the same split
# instead of a new random one each time (which would change the fitted model
# and the reported metric on every run).
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [16]:
# Fit the whole pipeline on the training split. Despite its name, the result
# is the fitted pipeline (indexers, encoders, assembler and classifier), not
# only the logistic regression.
log_reg_model = pipeline.fit(train)

In [17]:
# Run the fitted pipeline over the held-out test split and preview the
# first 5 scored rows.
results = log_reg_model.transform(test)
results.show(n=5)

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [19]:
# Evaluator comparing the model's scores with the true 'Survived' label.
# It reads 'rawPrediction' (the classifier's continuous scores, written under
# LogisticRegression's default output column name) rather than the hard 0/1
# 'prediction' column: with only thresholded predictions the ROC curve
# degenerates to a single operating point and the reported area is misleading.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived',
)

In [20]:
# Score the test-set predictions. No metricName is set on `my_eval`, so the
# evaluator's default metric is used (areaUnderROC per the PySpark docs).
auc = my_eval.evaluate(results)
# Bare last expression so the notebook displays the value.
auc