# Lab 3
- Structured API Approach

In [8]:
from pyspark.sql import SparkSession
import os

# Set JAVA_HOME explicitly
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-21-openjdk-amd64'

spark = SparkSession.builder \
    .appName("MySparkApp") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

In [9]:
spark   

In [10]:
df = spark.read.format('csv').load("/home/aaronpham/Coding/bigdata/spark/spark_mllib/data/creditcard.csv", header=True, inferSchema=True)

df.printSchema()

[Stage 10:>                                                         (0 + 8) / 8]

root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

                                                                                

In [11]:
df.show(5)

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

In [12]:
df.count()

284807

## Split the dataset into train & test

In [15]:
# split dataset into train and test
train, test = df.randomSplit([0.8, 0.2])

train, test

(DataFrame[Time: double, V1: double, V2: double, V3: double, V4: double, V5: double, V6: double, V7: double, V8: double, V9: double, V10: double, V11: double, V12: double, V13: double, V14: double, V15: double, V16: double, V17: double, V18: double, V19: double, V20: double, V21: double, V22: double, V23: double, V24: double, V25: double, V26: double, V27: double, V28: double, Amount: double, Class: int],
 DataFrame[Time: double, V1: double, V2: double, V3: double, V4: double, V5: double, V6: double, V7: double, V8: double, V9: double, V10: double, V11: double, V12: double, V13: double, V14: double, V15: double, V16: double, V17: double, V18: double, V19: double, V20: double, V21: double, V22: double, V23: double, V24: double, V25: double, V26: double, V27: double, V28: double, Amount: double, Class: int])

## Replace null value with average value

In [17]:
# Replace the null value with average value
from pyspark.ml.feature import Imputer

numerical_features_lst = train.columns

imputer = Imputer(inputCols=numerical_features_lst, outputCols=numerical_features_lst)

imputer = imputer.fit(train)
train = imputer.transform(train)
test = imputer.transform(test)

train.show(3)

                                                                                

+----+------------------+------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|              V12|              V13|               V14|               V15|              V16|               V17|               V18|               V19|                V20|               V21|                V22|               V23|

## Aggregate all columns into one features

In [14]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Pick all features into one feature
input_cols = [c for c in df.columns if c != "Class"]

# Assemble into 1 vectors
assembler = VectorAssembler(inputCols=input_cols, outputCol="features_assembled")

In [18]:
train = assembler.transform(train)
test = assembler.transform(test)

train.show(2)

+----+-----------------+-----------------+----------------+-----------------+------------------+-------------------+-------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+--------------------+
|Time|               V1|               V2|              V3|               V4|                V5|                 V6|                 V7|                V8|                V9|               V10|              V11|              V12|              V13|               V14|              V15|              V16|               V17|               V18|               V19|                V20|               V21|               V22|              V

## Standardize the dataset
- Format the data in to a fixed range for further ML processing

For each value it will apply the following algorithms

```
scaled_value = (original_value - mean) / standard_deviation
```

In [20]:
# scale the features 
scaler = StandardScaler(inputCol="features_assembled", outputCol="features", withStd=True, withMean=True)

scaler_model = scaler.fit(train)

train = scaler_model.transform(train)
test = scaler_model.transform(test)

train.show(3)

                                                                                

+----+------------------+------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+--------------------+--------------------+
|Time|                V1|                V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|              V12|              V13|               V14|               V15|              V16|               V17|               V18|               V19|                V20|               V

In [21]:
train.select('features').take(3)

[Row(features=DenseVector([-1.9968, 0.6105, 0.1607, 0.1089, 0.3159, 0.0418, -0.0613, -0.0634, 0.0713, -0.2325, -0.1534, 1.5793, 1.0668, 0.4932, -0.1485, 0.6941, 0.53, -0.1346, -0.2195, -0.1778, -0.0909, -0.3072, -0.8787, 0.1604, -0.5605, 0.3215, 0.2611, -0.0231, 0.0445, -0.3511])),
 Row(features=DenseVector([-1.9967, -0.6965, -0.8144, 1.1744, 0.2676, -0.3723, 1.3608, 0.656, 0.208, -1.3801, 0.1913, 0.6111, 0.066, 0.7227, -0.1716, 2.5627, -3.2995, 1.3027, -0.1456, -2.7779, 0.6963, 0.3358, 1.0637, 1.4476, -1.1371, -0.6268, -0.2886, -0.1404, -0.1845, 1.2019])),
 Row(features=DenseVector([-1.9967, -0.4956, -0.1133, 1.1875, -0.6106, -0.0099, 0.9429, 0.1982, 0.3171, -1.2638, -0.0504, -0.2228, 0.1784, 0.512, -0.2989, -0.6901, -1.2096, -0.8026, 2.3425, -1.5132, -0.2751, -0.1478, 0.0082, -0.3041, -1.9395, 1.2418, -0.4605, 0.1584, 0.1881, 0.1479]))]

# Logistic Regression 

In [22]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="Class")

lr


LogisticRegression_bb453e5fae22

In [26]:
model = lr.fit(train)

pred_train_df = model.transform(train)

pred_train_df.show(5)

                                                                                

+----+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+-------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|Time|                V1|                V2|                V3|                V4|                 V5|                 V6|                  V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|                V17|     

## Prediction

In [29]:
pred_test_df = model.transform(test)
pred_test_df.show(10)

+----+------------------+-------------------+-----------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+------------------+--------------------+------------------+-------------------+------------------+------------------+-------------------+--------------------+-------------------+------------------+--------------------+--------------------+------------------+-------------------+-------------------+------------------+-------------------+--------------------+-------------------+------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|Time|                V1|                 V2|               V3|                V4|                V5|                V6|                 V7|                 V8|                 V9|               V10|                 V11|               V12|                V13|               V14|               V15|                V16|                

## Evaluation

In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# AUC
evaluator_auc = BinaryClassificationEvaluator(labelCol="Class", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(pred_test_df)

# Accuracy
evaluator_acc = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="accuracy")
acc = evaluator_acc.evaluate(pred_test_df)

# Precision & Recall
precision = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="weightedPrecision").evaluate(pred_test_df)
recall = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="weightedRecall").evaluate(pred_test_df)

print(f"AUC = {auc:.4f}")
print(f"Accuracy = {acc:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")


                                                                                

AUC = 0.9691
Accuracy = 0.9990
Precision = 0.9990
Recall = 0.9990


In [32]:
print("Intercept:", model.interceptVector)
print("Coefficients:", model.coefficients)

Intercept: [-8.762892411866266]
Coefficients: [-0.2377112758126441,0.12466650976678181,0.11301280013159823,-0.014723073583710293,0.977666434977012,0.2554605246043406,-0.16224850717212916,-0.039912944104939355,-0.24416454647937777,-0.20809875749967102,-0.8666080659021966,-0.045763782487315774,0.10395640623884463,-0.39200373084260887,-0.5691266142242114,-0.11328985531534047,-0.2086337485952307,-0.07026773236768571,0.008359248721559498,0.1210412894507246,-0.3431679428379434,0.25107736908877,0.47481525624955445,-0.07844064875988077,-0.036886516497943535,0.012972235567213391,0.05527194230997747,-0.3340523794942354,-0.14545942202324758,0.2901563347243217]


## Save results

In [None]:
pred_test_df.select("Class", "prediction", "probability") \
    .toPandas().to_csv("/home/aaronpham/Coding/bigdata/spark/spark_mllib/results/Classification_Structured.csv", index=False)

                                                                                

# Random Forest Classifier