<a href="https://colab.research.google.com/github/90485462/ISOM676/blob/main/HW2_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install PySpark, import library:

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.3/spark-3.2.3-bin-hadoop2.7.tgz
!tar xf spark-3.2.3-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.3-bin-hadoop2.7"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [35]:
# Imports
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

Load the training data:

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
# Load the CSV file
path = "/content/drive/MyDrive/titanic_train.csv"
df = spark.read.csv(path, header=True, inferSchema=True)


 Familiarize with the dataset:

In [8]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [9]:
df.show(10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [11]:
df.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [27]:
from pyspark.sql.functions import col

string_columns = [ "Sex", "Cabin", "Embarked"]

for column in string_columns:
    print(f"Top 10 most frequent values for {column}:")
    df.groupBy(column).count().orderBy(col("count").desc()).show(10)


Top 10 most frequent values for Sex:
+------+-----+
|   Sex|count|
+------+-----+
|  male|  577|
|female|  314|
+------+-----+

Top 10 most frequent values for Cabin:
+-----------+-----+
|      Cabin|count|
+-----------+-----+
|       null|  687|
|         G6|    4|
|    B96 B98|    4|
|C23 C25 C27|    4|
|    C22 C26|    3|
|        F33|    3|
|         F2|    3|
|          D|    3|
|       E101|    3|
|        C65|    2|
+-----------+-----+
only showing top 10 rows

Top 10 most frequent values for Embarked:
+--------+-----+
|Embarked|count|
+--------+-----+
|       S|  644|
|       C|  168|
|       Q|   77|
|    null|    2|
+--------+-----+



Features engineering:

In [28]:
# Selecting and Converting Numerical Columns to Double
columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
df = df.select([col(c).cast("double") if c in ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'] else col(c) for c in columns])

In [29]:
from pyspark.sql.functions import mean, when

# Calculate the mean age
mean_age = df.select(mean(col("Age"))).collect()[0][0]

# Replace missing values with mean age and create an 'AgeNA' indicator column
df = df.withColumn("Age", when(col("Age").isNull(), mean_age).otherwise(col("Age")))
df = df.withColumn("AgeNA", when(col("Age").isNull(), 1).otherwise(0))

In [30]:
df.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: double (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: double (nullable = true)
 |-- Parch: double (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- AgeNA: integer (nullable = false)



In [31]:
# Print the revised DataFrame
df.show()

# Recalculate summary statistics
df.describe().show()

+--------+------+------+-----------------+-----+-----+-------+--------+-----+
|Survived|Pclass|   Sex|              Age|SibSp|Parch|   Fare|Embarked|AgeNA|
+--------+------+------+-----------------+-----+-----+-------+--------+-----+
|       0|   3.0|  male|             22.0|  1.0|  0.0|   7.25|       S|    0|
|       1|   1.0|female|             38.0|  1.0|  0.0|71.2833|       C|    0|
|       1|   3.0|female|             26.0|  0.0|  0.0|  7.925|       S|    0|
|       1|   1.0|female|             35.0|  1.0|  0.0|   53.1|       S|    0|
|       0|   3.0|  male|             35.0|  0.0|  0.0|   8.05|       S|    0|
|       0|   3.0|  male|29.69911764705882|  0.0|  0.0| 8.4583|       Q|    0|
|       0|   1.0|  male|             54.0|  0.0|  0.0|51.8625|       S|    0|
|       0|   3.0|  male|              2.0|  3.0|  1.0| 21.075|       S|    0|
|       1|   3.0|female|             27.0|  0.0|  2.0|11.1333|       S|    0|
|       1|   2.0|female|             14.0|  1.0|  0.0|30.0708|  

 Encode string and categorical variables

In [44]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Sex
sex_indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex").fit(df)
sex_encoder = OneHotEncoder(inputCol="SexIndex", outputCol="SexVec")

# Embarked, skip the 2 nulls
embarked_indexer = StringIndexer(inputCol="Embarked", outputCol="EmbarkedIndex", handleInvalid="skip").fit(df)
embarked_encoder = OneHotEncoder(inputCol="EmbarkedIndex", outputCol="EmbarkedVec")

Assemble feature columns

In [45]:
from pyspark.ml.feature import VectorAssembler

featureCols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'SexVec', 'EmbarkedVec']

assembler = VectorAssembler(inputCols=featureCols, outputCol="Features")

Create model:

In [46]:
lr = LogisticRegression(featuresCol="Features", labelCol="Survived")

 Assemble pipeline:

In [47]:
pipeline = Pipeline(stages=[sex_indexer, sex_encoder, embarked_indexer, embarked_encoder, assembler, lr])

Split training, testing:

In [48]:
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

print(f"Training Dataset Count: {train_df.count()}")
print(f"Test Dataset Count: {test_df.count()}")

Training Dataset Count: 659
Test Dataset Count: 232


Fit the model:

In [49]:
lrModel = pipeline.fit(train_df)

In [60]:
# Find whether "male" or "female" was coded as 1 for the SexVec
sex_indexer_model = lrModel.stages[0]
print(sex_indexer_model.labels)

['male', 'female']


Make prediction:

In [52]:
predictions = lrModel.transform(test_df)

In [57]:
model = lrModel.stages[-1]
coefficients = model.coefficients
feature_coefficients = zip(featureCols, coefficients)
for feature, coeff in feature_coefficients:
    print(f"{feature}: {coeff}")

Pclass: -1.222320879745617
Age: -0.044331402855583166
SibSp: -0.3845725589680685
Parch: -0.1609795984088879
Fare: 0.0015985666684226567
SexVec: -2.5911124684387445
EmbarkedVec: -0.4722956606101305


Performance evaluation:

In [71]:
predictions.select("Prediction", "Survived", "Probability", "rawPrediction").show(5)

+----------+--------+--------------------+--------------------+
|Prediction|Survived|         Probability|       rawPrediction|
+----------+--------+--------------------+--------------------+
|       1.0|       0|[0.08048738321473...|[-2.4357433234152...|
|       1.0|       0|[0.38446130061956...|[-0.4706546830555...|
|       1.0|       0|[0.25389321580763...|[-1.0779549638936...|
|       1.0|       0|[0.23396660283693...|[-1.1860473859670...|
|       1.0|       0|[0.44982177132360...|[-0.2013908372867...|
+----------+--------+--------------------+--------------------+
only showing top 5 rows



In [72]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="Survived", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc}")

AUC: 0.8493678692568609
