# Project 2: Machine Learning Project with MLlib Pipeline


## Initialize SparkSession & Imports


In [16]:
# Import the necessary libraries
from pyspark.sql import SparkSession as sps, functions as F, types as T

from pyspark.ml import Pipeline as P

from pyspark.ml.feature import (
    StringIndexer as si,
    OneHotEncoder as ohe,
    VectorAssembler as va,
)

from pyspark.ml.classification import RandomForestClassifier as rfc

# Configure a local-master builder named "Titanic Data", then obtain the session
# (getOrCreate returns an already-running session if one exists).
session_builder = sps.builder.master("local").appName("Titanic Data")
spark_session_p2 = session_builder.getOrCreate()

# Bare expression so the notebook renders the session summary as the cell output
spark_session_p2

## Reading the Data


In [17]:
# Read the Titanic training CSV, treating the first line as the header row.
# No schema is supplied, so the selected numeric columns are cast explicitly below.
csv_reader = spark_session_p2.read.format("csv").option("header", "true")
raw_df = csv_reader.load("./data/titanic/train.csv")

# Columns to keep, in display order; the ones listed in float_cols are cast to
# float, while "Sex" and "Embarked" are passed through unchanged.
column_order = ["Survived", "Pclass", "Sex", "Age", "Fare", "Embarked"]
float_cols = {"Survived", "Pclass", "Age", "Fare"}

df = raw_df.select(
    *[
        F.col(name).cast("float") if name in float_cols else F.col(name)
        for name in column_order
    ]
)

# Preview the first five rows of the prepared frame
df.show(5)

+--------+------+------+----+-------+--------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|
+--------+------+------+----+-------+--------+
|     0.0|   3.0|  male|22.0|   7.25|       S|
|     1.0|   1.0|female|38.0|71.2833|       C|
|     1.0|   3.0|female|26.0|  7.925|       S|
|     1.0|   1.0|female|35.0|   53.1|       S|
|     0.0|   3.0|  male|35.0|   8.05|       S|
+--------+------+------+----+-------+--------+
only showing top 5 rows



## Using Pipeline


### Data pre-processing and features preparation


In [18]:
# Discard rows containing any null, then make a seeded 80/20 train/test split
complete_rows = df.na.drop()
train_df, test_df = complete_rows.randomSplit([0.8, 0.2], seed=11)

train_count = train_df.count()
test_count = test_df.count()
print(f"Number of train samples: {train_count}")
print(f"Number of test samples: {test_count}")

# Index the categorical columns "Sex" and "Embarked" into "Gender" and "Boarded".
# The indexers are only declared here -- no ".fit" / ".transform" is called on them.
sex_indexer = si(outputCol="Gender", inputCol="Sex")
embarked_indexer = si(outputCol="Boarded", inputCol="Embarked")

# Collect the numeric and indexed columns into a single vector column
output_col = "feature"
input_cols = ["Pclass", "Age", "Fare", "Gender", "Boarded"]
vector_assembler = va(outputCol=output_col, inputCols=input_cols)

# Random forest classifier predicting "Survived" from the assembled vector,
# with tree depth capped at 5
rf = rfc(
    featuresCol=output_col,
    labelCol="Survived",
    maxDepth=5,
)

Number of train samples: 562
Number of test samples: 150


### Pipeline declaration, stage setup, and pipeline execution


In [19]:
# Chain the stages in execution order: both indexers, the assembler, then the model
pipeline_stages = [sex_indexer, embarked_indexer, vector_assembler, rf]
pipe = P(stages=pipeline_stages)

# Fitting the pipeline on the training split yields the fitted pipeline model
pipeline = pipe.fit(train_df)

# Apply the fitted pipeline to the held-out test split
test_predictions = pipeline.transform(test_df)

# Show the first five prediction rows without truncating the vector columns
test_predictions.show(n=5, truncate=False)

+--------+------+----+----+-------+--------+------+-------+-----------------------------------+--------------------------------------+----------------------------------------+----------+
|Survived|Pclass|Sex |Age |Fare   |Embarked|Gender|Boarded|feature                            |rawPrediction                         |probability                             |prediction|
+--------+------+----+----+-------+--------+------+-------+-----------------------------------+--------------------------------------+----------------------------------------+----------+
|0.0     |1.0   |male|19.0|263.0  |S       |0.0   |0.0    |[1.0,19.0,263.0,0.0,0.0]           |[12.37969443779637,7.620305562203627] |[0.6189847218898187,0.38101527811018143]|0.0       |
|0.0     |1.0   |male|21.0|77.2875|S       |0.0   |0.0    |[1.0,21.0,77.2874984741211,0.0,0.0]|[10.511151869253803,9.488848130746197]|[0.5255575934626902,0.47444240653730985]|0.0       |
|0.0     |1.0   |male|28.0|82.1708|C       |0.0   |1.0    |[1.0,2

In [20]:
# Narrow the predictions down to the columns worth comparing side by side:
# the assembled features, class probabilities, true label and predicted label.
display_columns = ["feature", "probability", "Survived", "prediction"]
prediction_display_format = test_predictions.select(*display_columns)

prediction_display_format.show(n=5, truncate=False)

+-----------------------------------+----------------------------------------+--------+----------+
|feature                            |probability                             |Survived|prediction|
+-----------------------------------+----------------------------------------+--------+----------+
|[1.0,19.0,263.0,0.0,0.0]           |[0.6189847218898187,0.38101527811018143]|0.0     |0.0       |
|[1.0,21.0,77.2874984741211,0.0,0.0]|[0.5255575934626902,0.47444240653730985]|0.0     |0.0       |
|[1.0,28.0,82.1707992553711,0.0,1.0]|[0.45718019791138154,0.5428198020886185]|0.0     |1.0       |
|[1.0,29.0,30.0,0.0,0.0]            |[0.5588834768878976,0.4411165231121024] |0.0     |0.0       |
|[1.0,29.0,66.5999984741211,0.0,0.0]|[0.46551353124471495,0.534486468755285] |0.0     |1.0       |
+-----------------------------------+----------------------------------------+--------+----------+
only showing top 5 rows



### Computing the Accuracy


In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator as mcv

# Accuracy evaluator: compares the "prediction" column against the "Survived" label
evaluator = mcv(labelCol="Survived", predictionCol="prediction", metricName="accuracy")

# Score the test-split predictions already produced by pipeline.transform(test_df)
# instead of rebuilding the same transformation a second time.
accuracy = evaluator.evaluate(test_predictions)

# The metric is computed on test_df, so it is reported as test (not training) accuracy
print(f"Test set accuracy: {accuracy:.4f}")

Training set accuracy: 0.8000


### Stop the Spark Session


In [22]:
# Shut down the Spark session created at the top of the notebook now that the
# analysis is finished; cells using spark_session_p2 must not be run after this.
spark_session_p2.stop()