# Project 2: Machine Learning Project with MLlib Pipeline
Name: Awara Pirkhdrie, 
Date: 2024-02-29

In [2]:
# findspark.init() runs before `import pyspark`; that order is kept from the
# original cell so the import happens only after findspark has been initialised.
import findspark
findspark.init()
import pyspark

# Echo the Spark installation path findspark resolved (shown as the cell output).
findspark.find()

'C:\\Spark\\spark-3.5.1-bin-hadoop3'

# Initialize SparkSession

In [3]:
from pyspark.sql import SparkSession

In [6]:
# Build (or reuse) the SparkSession for this notebook, running on a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Titanic Data")
    .getOrCreate()
)

In [7]:
# Echo the session object so the notebook displays it as the cell output.
spark

# Reading Data

• Dataset (Titanic)


In [57]:
# Load the Titanic training CSV into `df`, using the first line as column names.
df = spark.read.load(
    "./Dataset/titanic/train.csv",
    format="csv",
    header="true",
)

In [79]:
from pyspark.sql.functions import col

In [80]:
# Keep only the six columns used for modelling. Survived, Pclass, Age and Fare
# are cast to float; Sex and Embarked are passed through unchanged.
dataset = df.select(
    *(
        col(name).cast("float") if name in {"Survived", "Pclass", "Age", "Fare"} else col(name)
        for name in ("Survived", "Pclass", "Sex", "Age", "Fare", "Embarked")
    )
)

In [81]:
# Preview the first three rows of the selected, type-cast columns.
dataset.show(3)

+--------+------+------+----+-------+--------+
|Survived|Pclass|   Sex| Age|   Fare|Embarked|
+--------+------+------+----+-------+--------+
|     0.0|   3.0|  male|22.0|   7.25|       S|
|     1.0|   1.0|female|38.0|71.2833|       C|
|     1.0|   3.0|female|26.0|  7.925|       S|
+--------+------+------+----+-------+--------+
only showing top 3 rows



# Importing functions

In [60]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# StringIndexer is similar to LabelEncoder: it assigns a numeric label to each category
# OneHotEncoder creates a one-hot encoded vector
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# VectorAssembler is used to create vector from the features. Modeling takes vector as an input 
from pyspark.ml.feature import VectorAssembler

# RandomForestClassifier is used for classification problems
from pyspark.ml.classification import RandomForestClassifier


# Using pipeline

In [61]:
# Import pipeline from PySpark ML
from pyspark.ml import Pipeline

In [77]:
# Drop rows with missing values, then split 80/20 into training and test sets
# with a fixed seed so the split is repeatable.
# The split is taken from `dataset` (the six selected, float-cast columns), not
# from the raw `df`: the pipeline cells below consume those cast columns, and
# the recorded outputs show only the `dataset` columns.
(train_df, test_df) = dataset.na.drop().randomSplit([0.8, 0.2], seed=11)
print("Number of train samples: " + str(train_df.count()))
print("Number of test samples: " + str(test_df.count()))

Number of train samples: 562
Number of test samples: 150


In [63]:
# Define the pipeline stages: index the two categorical columns (sex and port
# of embarkation), assemble every feature into one vector column, and configure
# a random forest classifier that reads that vector.

# Names of the assembler's input columns and of the vector column it produces
input_cols = ['Pclass', 'Age', 'Fare', 'Gender', 'Boarded']
output_col = "feature"

# String columns -> numeric index columns
sex_indexer = StringIndexer(
    inputCol="Sex",
    outputCol="Gender",
)
embarked_indexer = StringIndexer(
    inputCol="Embarked",
    outputCol="Boarded",
)

# Combine the numeric and indexed columns into the single "feature" vector
vector_assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol=output_col,
)

# Random forest classifier predicting "Survived" from the assembled vector.
# (The variable keeps its original name `dt_model` because the next cell uses it.)
dt_model = RandomForestClassifier(
    labelCol="Survived",
    featuresCol=output_col,
)

In [64]:
# Chain the preprocessing transformers and the classifier into one pipeline,
# fit it on the training split, score the test split, and display the first
# five scored rows without truncating column contents.

# Stages run in this order: indexers -> assembler -> classifier
pipeline_stages = [sex_indexer, embarked_indexer, vector_assembler, dt_model]
pipeline = Pipeline(stages=pipeline_stages)

# Fit every stage on the training data
final_pipeline_model = pipeline.fit(train_df)

# Apply the fitted pipeline to the test data
test_predictions = final_pipeline_model.transform(test_df)

test_predictions.show(n=5, truncate=False)

+--------+------+----+----+-------+--------+------+-------+-----------------------------------+--------------------------------------+---------------------------------------+----------+
|Survived|Pclass|Sex |Age |Fare   |Embarked|Gender|Boarded|feature                            |rawPrediction                         |probability                            |prediction|
+--------+------+----+----+-------+--------+------+-------+-----------------------------------+--------------------------------------+---------------------------------------+----------+
|0.0     |1.0   |male|19.0|263.0  |S       |0.0   |0.0    |[1.0,19.0,263.0,0.0,0.0]           |[11.485763833304809,8.514236166695195]|[0.5742881916652404,0.4257118083347597]|0.0       |
|0.0     |1.0   |male|21.0|77.2875|S       |0.0   |0.0    |[1.0,21.0,77.2874984741211,0.0,0.0]|[11.450676114006564,8.549323885993438]|[0.5725338057003282,0.4274661942996719]|0.0       |
|0.0     |1.0   |male|28.0|82.1708|C       |0.0   |1.0    |[1.0,28.0,8

In [82]:
# Narrow the predictions to the feature vector, class probabilities, true label
# and predicted label, then show the first three rows untruncated.
display_columns = ["feature", "probability", "Survived", "prediction"]
prediction_display_format = test_predictions.select(*display_columns)

prediction_display_format.show(n=3, truncate=False)

+-----------------------------------+---------------------------------------+--------+----------+
|feature                            |probability                            |Survived|prediction|
+-----------------------------------+---------------------------------------+--------+----------+
|[1.0,19.0,263.0,0.0,0.0]           |[0.5742881916652404,0.4257118083347597]|0.0     |0.0       |
|[1.0,21.0,77.2874984741211,0.0,0.0]|[0.5725338057003282,0.4274661942996719]|0.0     |0.0       |
|[1.0,28.0,82.1707992553711,0.0,1.0]|[0.4760033364238634,0.5239966635761365]|0.0     |1.0       |
+-----------------------------------+---------------------------------------+--------+----------+
only showing top 3 rows



# Computing the Accuracy

In [76]:
# Build an accuracy evaluator that compares the `prediction` column with the
# `Survived` label, apply it to the test-set predictions, and print the result.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='Survived',
                                              predictionCol='prediction',
                                              metricName='accuracy')

accuracy = evaluator.evaluate(test_predictions)

# `test_predictions` comes from test_df, so this is test accuracy, not training accuracy.
print("Test Accuracy: ", accuracy)


Training Accuracy:  0.78


# Checking Spark jobs

After running everything, you can check and manage the Spark jobs on your local machine.

![](images/5.png)

![](images/6.png)