In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Titanic').getOrCreate()

22/06/05 21:12:20 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.8.100 instead (on interface wlp1s0)
22/06/05 21:12:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/05 21:12:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
df = spark.read.csv('titanic.csv' , inferSchema= True , header= True)

In [5]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [6]:
df.show(2)

+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0| PC 17599|71.2833|  C85|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
only showing top 2 rows



In [7]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [8]:
# selecting the needed columns only 

mycolumns = df.select([
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked' 
])

In [9]:
mycolumns.show(1)

+--------+------+----+----+-----+-----+----+--------+
|Survived|Pclass| Sex| Age|SibSp|Parch|Fare|Embarked|
+--------+------+----+----+-----+-----+----+--------+
|       0|     3|male|22.0|    1|    0|7.25|       S|
+--------+------+----+----+-----+-----+----+--------+
only showing top 1 row



In [10]:
# droping in the missing data; alternatively we can fill the missing data 

my_final_data = mycolumns.na.drop()

In [11]:
from pyspark.ml.feature import (VectorAssembler , VectorIndexer ,
                                OneHotEncoder , StringIndexer)

In [12]:
"""
What stringIndexer do:
    Lets say I have a column with A, B, C 
    StringIndexer will give me 0, 1, 2 

What OneHotEndcoding do: 
    Takes the 0, 1, 2 from the string indexer
    transform it to [1, 0, 0] 

For Ex. 

a value = A 

so, all the posiblities are: A, B, C 
with string indexer          0, 1, 2

that's means A = 0 

with OneHotEncoding A = [1, 0, 0]
"""

gender_indexer = StringIndexer(inputCol='Sex' , outputCol= 'SexIndex') 

gender_encoder = OneHotEncoder(inputCol='SexIndex' , outputCol= 'SexVec')

In [13]:
# doing the same thing for Embarked column 

embarked_indexer = StringIndexer(inputCol='Embarked' , outputCol= 'EmbarkedIndex') 

embarked_encoder = OneHotEncoder(inputCol='EmbarkedIndex' , outputCol= 'EmbarkedVec')

In [14]:
assumbler = VectorAssembler(inputCols=[
    'Pclass' , 
    'SexVec' , 
    'EmbarkedVec' , 
    'Age' , 
    'SibSp' , 
    'Parch' , 
    'Fare'
] , outputCol='features')

In [15]:
from pyspark.ml.classification import LogisticRegression 

In [16]:
## importing pipeline 
"""
because we want to set stagings for indexer and encoder 

"""
from pyspark.ml import Pipeline

In [17]:
log_reg_titanic = LogisticRegression(featuresCol= 'features' , labelCol='Survived')

In [19]:
# creating pipeline 

pipline  = Pipeline(stages=[
    gender_indexer , 
    embarked_indexer , 
    gender_encoder , 
    embarked_encoder, 
    assumbler, 
    log_reg_titanic
])

In [22]:
# split 

train_data, test_data = my_final_data.randomSplit([0.7 , 0.3])

In [24]:
# fiting the model 

fit_model = pipline.fit(train_data)

In [25]:
# transform the model to test data ; testing model preformance

results = fit_model.transform(test_data) 

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [52]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol= 'prediction' , labelCol= 'Survived')

In [57]:
results.select('Survived' , 'prediction').show(15)

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 15 rows



In [58]:
# evaluate 

AUC = my_eval.evaluate(results) 

In [61]:
# Area Under the Curve

AUC

0.7695652173913043

In [62]:
# We can explain %77 of the servival 