# Project Streamlit
- modeling the Titanic dataset with **Pyspark**

- Course Name :         Applied Machine Learning
- Course instructor:    Sohail Tehranipour
- Student Name :        Afshin Masoudi Ashtiani
- Chapter 7 -           Building a Web App for Data Scientists
- Project:              Streamlit Project
- Date :                September 2024

## Step 1: Install required libraries

In [5]:
!pip install pyspark



## Step 2: Import required libraries



In [6]:
import pyspark
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

## Step 3: Start the Spark Session

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('pyspark_titanic_model').getOrCreate()
spark

## Step 4: Load the dataset

In [8]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
df = spark.read.csv('/content/drive/My Drive/Applied Machine Learning/Datasets/titanic_train.csv', header= True, inferSchema= True)
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [10]:
df.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                NULL|  NULL| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [11]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [12]:
df.dtypes

[('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Ticket', 'string'),
 ('Fare', 'double'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

## Step 5: Import VectorAssembler module

In [13]:
from pyspark.ml.feature import VectorAssembler
cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
assembler_data = VectorAssembler(inputCols=cols, outputCol="features", handleInvalid="skip")
output = assembler_data.transform(df)
output.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|            features|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|[0.0,3.0,22.0,1.0...|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|[1.0,1.0,38.0,1.0...|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|[1.0,3.0,26.0,0.0...|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|[1.0,1.0,35.0,1.0...|
|          5|       0|     3|Allen, Mr. Willia..

In [14]:
final_df = output.select('features','Survived')
final_df.show(5)

+--------------------+--------+
|            features|Survived|
+--------------------+--------+
|[0.0,3.0,22.0,1.0...|       0|
|[1.0,1.0,38.0,1.0...|       1|
|[1.0,3.0,26.0,0.0...|       1|
|[1.0,1.0,35.0,1.0...|       1|
|[0.0,3.0,35.0,0.0...|       0|
+--------------------+--------+
only showing top 5 rows



## Step 6: Split the data into the train and test sets

In [15]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=17)

# Check that training set has around 80% of records
training_ratio = train_df.count() / final_df.count()
print(training_ratio)

0.7997198879551821


## Step 7: Build Logistic Regression model using PySpark

In [16]:
from pyspark.ml.classification import LogisticRegression

# Create classifier object and train on training data
lr_model = LogisticRegression(featuresCol='features', labelCol='Survived').fit(train_df)
lr_model.summary.predictions.show(5)


+--------------------+--------+--------------------+--------------------+----------+
|            features|Survived|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|(6,[1,2],[1.0,39.0])|     0.0|[18.6904721683158...|[0.99999999236461...|       0.0|
|(6,[1,2],[1.0,40.0])|     0.0|[18.7044749124899...|[0.99999999247078...|       0.0|
|(6,[1,2],[3.0,19.0])|     0.0|[19.1159503535575...|[0.99999999501060...|       0.0|
|(6,[1,2],[3.0,36.0])|     0.0|[19.3539970045174...|[0.99999999606752...|       0.0|
|(6,[1,2],[3.0,49.0])|     0.0|[19.5360326787808...|[0.99999999672200...|       0.0|
+--------------------+--------+--------------------+--------------------+----------+
only showing top 5 rows



In [17]:
lr_model.summary.predictions.describe().show()

+-------+------------------+------------------+
|summary|          Survived|        prediction|
+-------+------------------+------------------+
|  count|               571|               571|
|   mean|0.4098073555166375|0.4098073555166375|
| stddev|0.4922292270333013|0.4922292270333013|
|    min|               0.0|               0.0|
|    max|               1.0|               1.0|
+-------+------------------+------------------+



In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
test_pred_df = lr_model.evaluate(test_df)
test_pred_df.predictions.show(5)

+--------------------+--------+--------------------+--------------------+----------+
|            features|Survived|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|(6,[1,2],[1.0,38.0])|       0|[18.6764694241416...|[0.99999999225694...|       0.0|
|[0.0,1.0,2.0,1.0,...|       0|[17.8292428349916...|[0.99999998193415...|       0.0|
|[0.0,1.0,22.0,0.0...|       0|[18.2450319489414...|[0.99999998807980...|       0.0|
|[0.0,1.0,30.0,0.0...|       0|[18.5220156230867...|[0.99999999096369...|       0.0|
|[0.0,1.0,31.0,0.0...|       0|[18.5012383196734...|[0.99999999077398...|       0.0|
+--------------------+--------+--------------------+--------------------+----------+
only showing top 5 rows



In [19]:
eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')
auc = eval.evaluate(test_pred_df.predictions)
auc

1.0