# Problem1_Classification - Structured_API

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col, sum
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import functions as F

## Tạo session

In [2]:
spark = SparkSession.builder.appName("BD_Lab03_1.1").getOrCreate()

## Nhập dữ liệu

In [3]:
path = "/content/creditcard.csv"
data = spark.read.csv(path, header=True, inferSchema=True)

In [4]:
data.show(5)

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

## Loại bỏ những cột có giá trị bị thiếu và các cột bị trùng lắp

In [5]:
data.count()

284807

In [6]:
data = data.dropna()

In [7]:
data = data.dropDuplicates()

In [8]:
data.count()

283726

## Đếm số lượng mỗi lớp

In [9]:
data.groupBy("Class").count().show()

+-----+------+
|Class| count|
+-----+------+
|    1|   473|
|    0|283253|
+-----+------+



Bởi vì số lượng chênh lệch giữa 2 class nên chỉ lấy 500 mẫu class 0.

In [10]:
data_0 = data.filter(col("Class") == 0).orderBy(F.rand(seed=42)).limit(500)
data_1 = data.filter(col("Class") == 1)
data = data_0.union(data_1)
data = data.orderBy(F.rand())

In [11]:
data.groupBy("Class").count().show()

+-----+-----+
|Class|count|
+-----+-----+
|    0|  500|
|    1|  473|
+-----+-----+



## Kết hợp các cột số thành 1 cột vector duy nhất

In [12]:
assembler = VectorAssembler(inputCols=data.columns[1:-1], outputCol="raw_features")
data = assembler.transform(data)

## Chuẩn hoá vector

In [13]:
scaler = StandardScaler(inputCol="raw_features", outputCol="features", withMean=True, withStd=True)
scalerModel = scaler.fit(data)
data = scalerModel.transform(data)

data = data.select("features", "Class")

## chia dữ liệu thành 2 phần train, test

In [14]:
train, test = data.randomSplit([0.8, 0.2], seed=42)

## Sử dụng LogisticRegression

In [15]:
lr = LogisticRegression(featuresCol="features", labelCol="Class")
model = lr.fit(train)

## Đánh giá mô hình

In [16]:
summary = model.summary

coefficients = model.coefficients
feature_names = assembler.getInputCols()

print("Coefficients:")
for name, coef in zip(feature_names, coefficients):
    print(f"  {name:<20}: {coef:6f}")

print(f"Intercept: {model.intercept:6f}")

Coefficients:
  V1                  : 15.441215
  V2                  : 21.841188
  V3                  : 9.164921
  V4                  : 0.989537
  V5                  : 14.974401
  V6                  : -4.807354
  V7                  : -33.108256
  V8                  : 3.584565
  V9                  : -1.658084
  V10                 : -5.018617
  V11                 : 5.815226
  V12                 : -18.650937
  V13                 : -0.778922
  V14                 : -23.977445
  V15                 : -0.226321
  V16                 : -13.191995
  V17                 : -32.486199
  V18                 : -5.031678
  V19                 : 2.140745
  V20                 : -3.876064
  V21                 : -0.966571
  V22                 : 2.290916
  V23                 : 5.062811
  V24                 : -0.228948
  V25                 : 1.366766
  V26                 : 0.094067
  V27                 : -0.382899
  V28                 : 1.447843
  Amount              : 13.500196
Inter

In [17]:
predictions = model.transform(test)

In [18]:
eval_acc = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="accuracy")
eval_precision = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="weightedPrecision")
eval_recall = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="weightedRecall")
eval_auc = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

accuracy = eval_acc.evaluate(predictions)
precision = eval_precision.evaluate(predictions)
recall = eval_recall.evaluate(predictions)
auc = eval_auc.evaluate(predictions)


In [19]:
print(f"Accuracy: {accuracy:6f}")
print(f"Precision: {precision:6f}")
print(f"Recall: {recall:6f}")
print(f"AUC: {auc:6f}")

Accuracy: 0.949045
Precision: 0.949480
Recall: 0.949045
AUC: 0.993268
