# Logistic Regression 
This is a code-along that uses the famous Titanic dataset.

In [None]:
# Import the packages needed to locate Spark and open a session
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook also runs on
# machines where Spark is installed somewhere else; fall back to the
# original hard-coded install path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/eissa/spark-2.3.1-bin-hadoop2.7'))

# Kept after findspark.init(), as in the original cell, so the pyspark
# package is looked up once the Spark location has been configured.
from pyspark.sql import SparkSession

In [None]:
# Open the Spark session for this notebook under the application name 'myproj'
session_builder = SparkSession.builder.appName('myproj')
spark = session_builder.getOrCreate()

In [None]:
# Read Titanic Data
data = spark.read.csv('titanic.csv',inferSchema=True,header=True)

In [None]:
# Display Schema
data.printSchema()

In [None]:
# Display Columns
data.columns

In [None]:
# Keep only the label ('Survived') and the columns used as model inputs
selected_columns = [
    'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked',
]
my_cols = data.select(selected_columns)

In [None]:
# Discard every row that has a missing value in any of the selected columns
my_final_data = my_cols.dropna()

In [None]:
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.feature import (VectorIndexer, StringIndexer, OneHotEncoder)

In [None]:
# One-hot encoding for gender: index the 'Sex' strings first, then encode
# the resulting index column into a vector column
gender_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
gender_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec',
)

In [None]:
# One-hot encoding for the port of embarkation: index the 'Embarked'
# strings first, then encode the resulting index column into a vector column
embark_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='EmbarkIndex',
)
embark_encoder = OneHotEncoder(
    inputCol='EmbarkIndex',
    outputCol='EmbarkVec',
)

In [None]:
# Combine the numeric columns and the two encoded vectors into a single
# 'features' vector column
feature_columns = ['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkVec']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Logistic regression estimator: reads the assembled 'features' column and
# uses 'Survived' as the label
log_reg_titanic = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [None]:
# Chain the steps in execution order: index both categorical columns,
# encode them, assemble the feature vector, then the classifier
pipeline_stages = [
    gender_indexer, embark_indexer,
    gender_encoder, embark_encoder,
    assembler,
    log_reg_titanic,
]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Split the data into training (weight 0.7) and testing (weight 0.3) sets.
# The fixed seed makes the random split repeatable, so the score reported at
# the end of the notebook does not change every time the cell is re-run
# (assuming the same input file — TODO confirm if the data source changes).
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.7, 0.3], seed=42)
train_titanic_data.show()

In [None]:
# Execute the pipeline => the output will be a trained model
fit_model = pipeline.fit(train_titanic_data)

In [None]:
# test the model
results = fit_model.transform(test_titanic_data)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Evaluate the model.
# Score against the model's 'rawPrediction' column instead of the hard 0/1
# 'prediction' column: with only 0/1 values the evaluator sees a single
# decision threshold, so the ROC curve collapses to one point and the
# reported area is misleading. The raw scores let it sweep every threshold.
my_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                       labelCol='Survived')

In [None]:
results.select('Survived','prediction').show()

In [None]:
AUC = my_eval.evaluate(results)

In [None]:
AUC