# Problem1_Classification - Structured_API

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col, sum
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import functions as F

## Tạo session

In [2]:
spark = SparkSession.builder.appName("BD_Lab03_1.1").getOrCreate()

## Nhập dữ liệu

In [3]:
path = "/content/creditcard.csv"
data = spark.read.csv(path, header=True, inferSchema=True)

In [4]:
data.show(5)

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

## Loại bỏ những cột có giá trị bị thiếu và các cột bị trùng lắp

In [5]:
data.count()

284807

In [6]:
data.dropna().count()

284807

In [7]:
data.dropDuplicates().count()

283726

## Đếm số lượng mỗi lớp

In [8]:
data.groupBy("Class").count().show()

+-----+------+
|Class| count|
+-----+------+
|    1|   492|
|    0|284315|
+-----+------+



Bởi vì số lượng chênh lệch giữa 2 class nên chỉ lấy 500 mẫu class 0.

In [9]:
data_0 = data.filter(col("Class") == 0).orderBy(F.rand(seed=42)).limit(500)
data_1 = data.filter(col("Class") == 1)
data = data_0.union(data_1)
data = data.orderBy(F.rand())

In [10]:
data.groupBy("Class").count().show()

+-----+-----+
|Class|count|
+-----+-----+
|    0|  500|
|    1|  492|
+-----+-----+



## Kết hợp các cột số thành 1 cột vector duy nhất

In [11]:
assembler = VectorAssembler(inputCols=data.columns[1:-1], outputCol="raw_features")
data = assembler.transform(data)

## Chuẩn hoá vector

In [12]:
scaler = StandardScaler(inputCol="raw_features", outputCol="features", withMean=True, withStd=True)
scalerModel = scaler.fit(data)
data = scalerModel.transform(data)

data = data.select("features", "Class")

## chia dữ liệu thành 2 phần train, test

In [13]:
train, test = data.randomSplit([0.8, 0.2], seed=42)

## Sử dụng LogisticRegression

In [14]:
lr = LogisticRegression(featuresCol="features", labelCol="Class")
model = lr.fit(train)

## Đánh giá mô hình

In [15]:
summary = model.summary

coefficients = model.coefficients
feature_names = assembler.getInputCols()

print("Coefficients:")
for name, coef in zip(feature_names, coefficients):
    print(f"  {name:<20}: {coef:6f}")

print(f"Intercept: {model.intercept:6f}")

Coefficients:
  V1                  : 0.272778
  V2                  : 0.194965
  V3                  : -0.074244
  V4                  : 2.313173
  V5                  : 0.377402
  V6                  : -0.559871
  V7                  : 0.498493
  V8                  : -1.895221
  V9                  : -0.302441
  V10                 : -1.119511
  V11                 : 0.907764
  V12                 : -3.060434
  V13                 : -0.306947
  V14                 : -2.449704
  V15                 : -0.143571
  V16                 : -1.055626
  V17                 : 0.705810
  V18                 : 0.209459
  V19                 : 0.219280
  V20                 : -0.212172
  V21                 : 0.739707
  V22                 : 0.531503
  V23                 : -0.070085
  V24                 : -0.046514
  V25                 : 0.176367
  V26                 : -0.393871
  V27                 : -0.206658
  V28                 : 0.344169
  Amount              : 0.205148
Intercept: 3.9

In [16]:
predictions = model.transform(test)

In [None]:
eval_acc = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="accuracy")
eval_precision = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="weightedPrecision")
eval_recall = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="weightedRecall")
eval_auc = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

accuracy = eval_acc.evaluate(predictions)
precision = eval_precision.evaluate(predictions)
recall = eval_recall.evaluate(predictions)
auc = eval_auc.evaluate(predictions)


In [18]:
print(f"Accuracy: {accuracy:6f}")
print(f"Precision: {precision:6f}")
print(f"Recall: {recall:6f}")
print(f"AUC: {auc:6f}")

Accuracy: 0.943396
Precision: 0.943677
Recall: 0.943396
AUC: 0.975073
