# Import libraries

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Logistic Regression Model").getOrCreate()

In [2]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Load and verify data

In [3]:
data = spark.read.csv('resources/titanic.csv',header = True, inferSchema = True)

In [4]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
data.head(3)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S')]

In [6]:
for item in data.head(1)[0]:
    print(item)

1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.25
None
S


In [7]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [8]:
my_cols = data.select(['Survived', 'Pclass','Sex', 'Age', 'SibSp', 'Parch','Fare','Embarked'])

In [9]:
final_data = my_cols.na.drop()

# Data Preprocessing

In [10]:
genderindexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")# convert string to numerical
genderencoder = OneHotEncoder(inputCol="SexIndex", outputCol="SexVec") 

embarkedindexer = StringIndexer(inputCol="Embarked", outputCol="EmbarkedIndex")# convert string to numerical
embarkedencoder = OneHotEncoder(inputCol="EmbarkedIndex", outputCol="EmbarkedVec") 

In [11]:
assembler = VectorAssembler(inputCols =['Pclass','SexVec', 'Age', 'SibSp', 'Parch','Fare','EmbarkedVec'],
                            outputCol='features')

# Create Model and Pipeline

In [12]:
classifier = LogisticRegression(featuresCol = 'features',labelCol = 'Survived')

In [13]:
pipeline = Pipeline(stages = [genderindexer,embarkedindexer,genderencoder,embarkedencoder,assembler,classifier])

# Train Test split

In [14]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [15]:
train_data.describe().show()

+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|summary|           Survived|            Pclass|   Sex|               Age|             SibSp|              Parch|             Fare|Embarked|
+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|  count|                497|               497|   497|               497|               497|                497|              497|     497|
|   mean| 0.4164989939637827|2.2595573440643864|  null|29.463943661971832|0.5311871227364185|0.41851106639839036|34.59908591549295|    null|
| stddev|0.49347498108276067|0.8346298950583962|  null|14.552294677607255|0.9480100121093916| 0.8411657835298109|53.93030383395764|    null|
|    min|                  0|                 1|female|              0.42|                 0|                  0|              0.0|       C|
|    max|    

In [16]:
test_data.describe().show()

+-------+------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|summary|          Survived|            Pclass|   Sex|               Age|             SibSp|              Parch|             Fare|Embarked|
+-------+------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|  count|               215|               215|   215|               215|               215|                215|              215|     215|
|   mean|0.3767441860465116|2.1953488372093024|  null|30.053906976744187|0.4744186046511628|0.46511627906976744|34.49366186046511|    null|
| stddev| 0.485700770013535|0.8422137852035905|  null|14.380066440473348|0.8902622272662752| 0.8846784412202671|50.69476375539366|    null|
|    min|                 0|                 1|female|              0.67|                 0|                  0|              0.0|       C|
|    max|           

# Fit Model

In [17]:
model = pipeline.fit(train_data)

In [18]:
pred_data = model.transform(test_data)

# Evaluate Model

In [19]:
pred_data.head(1)

[Row(Survived=0, Pclass=1, Sex='male', Age=22.0, SibSp=0, Parch=0, Fare=135.6333, Embarked='C', SexIndex=0.0, EmbarkedIndex=1.0, SexVec=SparseVector(1, {0: 1.0}), EmbarkedVec=SparseVector(2, {1: 1.0}), features=DenseVector([1.0, 1.0, 22.0, 0.0, 0.0, 135.6333, 0.0, 1.0]), rawPrediction=DenseVector([-0.9996, 0.9996]), probability=DenseVector([0.269, 0.731]), prediction=1.0)]

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol = 'Survived')

In [21]:
AUC = evaluator.evaluate(pred_data)

In [22]:
print(AUC)

0.8266077022295928
