# Low_Level

In [1]:
import findspark
# Runs before the cell that imports pyspark; findspark.init() is the helper
# that makes the pyspark package importable from this kernel.
findspark.init()

## Thêm các thư viện cần thiết

In [2]:
import pyspark
from pyspark.sql import SparkSession

import math
import os
import sys

# Point both PySpark interpreter variables at the Python executable running this kernel.
os.environ.update({
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

## Tạo Session

In [3]:
# Create the SparkSession used by the rest of the notebook, or reuse one
# that already exists (getOrCreate).
spark = (
    SparkSession.builder
    .master("local")
    .appName("low_level")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

## Đọc dữ liệu

In [4]:
# Read the CSV as an RDD of raw text lines and show how many lines it has.
csv_path = "hdfs://localhost:9000/lab03/creditcard.csv"
lines = spark.sparkContext.textFile(csv_path)
lines.count()

284808

In [5]:
# The first line is treated as the header; every line equal to it is dropped.
header = lines.first()
raw_data = lines.filter(lambda text: text != header)

# Split each line on commas, then convert every field to float. Double
# quotes around the last field are stripped before conversion.
fields = raw_data.map(lambda text: text.split(","))
data = fields.map(
    lambda cols: tuple(
        [float(value) for value in cols[:-1]] + [float(cols[-1].strip('"'))]
    )
)

In [6]:
# Peek at the first five parsed records.
preview_rows = data.take(5)
for row in preview_rows:
    print(row)

(0.0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62, 0.0)
(0.0, 1.19185711131486, 0.26615071205963, 0.16648011335321, 0.448154078460911, 0.0600176492822243, -0.0823608088155687, -0.0788029833323113, 0.0851016549148104, -0.255425128109186, -0.166974414004614, 1.61272666105479, 1.06523531137287, 0.48909501589608, -0.143772296441519, 0.635558093258208, 0.463917041022171, -0.114804663102346, -0.183361270123994, -0.145783041325259, -0.0690831352230203, -0.225775248033138, -0.638671952771851, 0.

## Tiền xử lý dữ liệu

### Bỏ dữ liệu lặp

In [7]:
# Keep a single copy of every identical record; `data` is rebound to the
# de-duplicated RDD, so later cells see the reduced set.
data=data.distinct()
# Last expression of the cell: displays the number of remaining records.
data.count()

283726

### Bỏ dữ liệu trống

In [8]:
# Every field was converted with float() when the file was parsed, so a
# missing value can only survive as NaN at this point. The None / '' checks
# are kept for backward compatibility; the NaN check is the one that can
# actually remove a row.
data = data.filter(
    lambda line: all(
        x is not None and x != '' and not math.isnan(x) for x in line
    )
)
data.count()

283726

## Chuẩn hóa dữ liệu Min-Max

In [9]:
# Everything except the last field (the label) is a feature.
features = data.map(lambda line: line[0:-1])

# Column-wise minimum and maximum over the whole RDD.
# NOTE(review): these statistics are computed before the train/test split
# performed later in the notebook, so test rows influence the scaling.
min_features = features.reduce(lambda x, y: [min(a, b) for a, b in zip(x, y)])
max_features = features.reduce(lambda x, y: [max(a, b) for a, b in zip(x, y)])

# A constant column has max == min, which would divide by zero. Use a span
# of 1.0 for such columns so they are mapped to 0.0 instead of raising.
feature_spans = [
    (high - low) if high != low else 1.0
    for low, high in zip(min_features, max_features)
]

# Rescale every feature to [0, 1]; the label is carried over unchanged.
normalized_data = data.map(
    lambda line: tuple(
        [
            (value - low) / span
            for value, low, span in zip(line[:-1], min_features, feature_spans)
        ]
        + [line[-1]]
    )
)

for row in normalized_data.take(5):
    print(row)

(0.0, 0.978541954971695, 0.7700666508227654, 0.8402984903939014, 0.2717964907547009, 0.7661203363388934, 0.2621916978704357, 0.26487543874149616, 0.7862983529047245, 0.4539809683822362, 0.5052673462220311, 0.38118772246581145, 0.7443415693042709, 0.48619017593610825, 0.6412190072734594, 0.3838396643725496, 0.46410517798669204, 0.7277939830919614, 0.6406810941347436, 0.5519304220394026, 0.5795297525747019, 0.5578399149746115, 0.4802369598542956, 0.6669378230986288, 0.33643999609599445, 0.5872902523783181, 0.446012969158175, 0.4163451447884128, 0.31342266347556097, 0.00010470527605604418, 0.0)
(5.787304967822585e-06, 0.9352170233299468, 0.7531176669488862, 0.8681408192619086, 0.26876550734448534, 0.7623287857209992, 0.28112212055047436, 0.27017718255653134, 0.788042262834494, 0.41060274137949276, 0.5130180380913919, 0.32242211351494776, 0.7066833600612961, 0.5038542274352835, 0.6404734520442342, 0.5116969543365681, 0.3574426288295902, 0.7633809907036572, 0.644945381986696, 0.386683126520

## Chuyển thành đầu vào hợp lý ([features],label) và thêm bias

In [10]:
# Reshape each record into (features, label) and prepend a constant 1.0 to
# the features so the first weight acts as the bias term.
rdd_data = normalized_data.map(
    lambda line: ([1.0] + list(line[:-1]), line[-1])
)

sample_rows = rdd_data.take(5)
for row in sample_rows:
    print(row)

([1.0, 0.0, 0.978541954971695, 0.7700666508227654, 0.8402984903939014, 0.2717964907547009, 0.7661203363388934, 0.2621916978704357, 0.26487543874149616, 0.7862983529047245, 0.4539809683822362, 0.5052673462220311, 0.38118772246581145, 0.7443415693042709, 0.48619017593610825, 0.6412190072734594, 0.3838396643725496, 0.46410517798669204, 0.7277939830919614, 0.6406810941347436, 0.5519304220394026, 0.5795297525747019, 0.5578399149746115, 0.4802369598542956, 0.6669378230986288, 0.33643999609599445, 0.5872902523783181, 0.446012969158175, 0.4163451447884128, 0.31342266347556097, 0.00010470527605604418], 0.0)
([1.0, 5.787304967822585e-06, 0.9352170233299468, 0.7531176669488862, 0.8681408192619086, 0.26876550734448534, 0.7623287857209992, 0.28112212055047436, 0.27017718255653134, 0.788042262834494, 0.41060274137949276, 0.5130180380913919, 0.32242211351494776, 0.7066833600612961, 0.5038542274352835, 0.6404734520442342, 0.5116969543365681, 0.3574426288295902, 0.7633809907036572, 0.644945381986696, 0

## Tách dữ liệu

In [11]:
# Split each class separately (80/20, fixed seed) and then recombine, so
# both classes are split with the same proportions.
split_weights = [0.8, 0.2]
negatives = rdd_data.filter(lambda line: line[-1] == 0.0)
positives = rdd_data.filter(lambda line: line[-1] == 1.0)

train_0, test_0 = negatives.randomSplit(split_weights, seed=42)
train_1, test_1 = positives.randomSplit(split_weights, seed=42)

train_data = train_0.union(train_1)
test_data = test_0.union(test_1)

## Xây dựng các hàm cần thiết cho mô hình

### Hàm sigmoid

In [12]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) for a scalar ``x``.

    The two branches are algebraically identical. The split keeps the
    argument passed to ``math.exp`` non-positive, so very large magnitudes
    underflow to 0.0 instead of raising ``OverflowError``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) can overflow; exp(x) is at most 1 here.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

### Hàm nhân trọng số với một dòng dữ liệu

In [13]:
def dot(weights, features):
    """Return the sum of the pairwise products of ``weights`` and ``features``.

    Pairs are formed with ``zip``, so when the two sequences differ in
    length the extra trailing entries of the longer one are ignored.
    """
    products = [weight * value for weight, value in zip(weights, features)]
    return sum(products)

### Hàm tính gradient với một dòng dữ liệu

In [14]:
def gradient(weights,features,label):
    """Return the per-feature gradient contribution of one example.

    The prediction is ``sigmoid(dot(weights, features))``; each feature
    value is scaled by ``prediction - label``. The result is a list with
    one entry per feature.
    """
    residual = sigmoid(dot(weights, features)) - label
    return [residual * value for value in features]

### Hàm train mô hình

In [15]:
def train_logistic_regression(data, num_iterations, learning_rate):
    """Fit logistic-regression weights with full-batch gradient descent.

    Args:
        data: RDD of ``(features, label)`` pairs. The feature list is
            expected to already contain the bias input, so exactly one
            weight is created per feature entry.
        num_iterations: number of gradient-descent steps to run.
        learning_rate: step size applied to the averaged gradient.

    Returns:
        A list of weights with the same length as one feature vector.
    """
    # Every iteration runs a full job over `data`. Keep it in memory while
    # training so the upstream lineage is not recomputed each time, and
    # release it afterwards unless the caller had cached it already.
    cached_here = not data.is_cached
    if cached_here:
        data.cache()
    try:
        # One weight per feature entry (no extra "+1": the bias is part of
        # the feature vector).
        weights = [0.0] * len(data.first()[0])
        count = data.count()
        for iteration in range(num_iterations):
            print("Iteration: ", iteration + 1)
            # Sum the per-example gradients element-wise across the RDD.
            gradients = data.map(
                lambda x: gradient(weights, x[0], x[1])
            ).reduce(lambda x, y: [a + b for a, b in zip(x, y)])

            # Average over the number of examples, then take one step.
            gradients = [g / count for g in gradients]
            weights = [w - learning_rate * g for w, g in zip(weights, gradients)]
    finally:
        if cached_here:
            data.unpersist()

    return weights

### Hàm dự đoán 

In [16]:
def predict(weights,features):
    """Return 1 when the predicted probability is at least 0.5, otherwise 0."""
    probability = sigmoid(dot(weights, features))
    if probability >= 0.5:
        return 1
    return 0

### Hàm đánh giá mô hình

In [17]:

def evaluate_metric(data,weights,label):
    """Compute accuracy, precision, recall and F1, treating ``label`` as positive.

    Args:
        data: RDD of ``(features, label)`` pairs.
        weights: model weights passed to ``predict``.
        label: the class value counted as the positive class.

    Returns:
        ``(accuracy, precision, recall, f1_score)``. A metric whose
        denominator is zero is reported as 0.
    """
    # Classify every row once and count the four confusion-matrix outcomes
    # in a single job, instead of one filter + count job per outcome.
    # Key layout: (predicted == label, actual == label).
    outcome_counts = data.map(
        lambda x: (predict(weights, x[0]) == label, x[1] == label)
    ).countByValue()

    tp = outcome_counts.get((True, True), 0)
    tn = outcome_counts.get((False, False), 0)
    fp = outcome_counts.get((True, False), 0)
    fn = outcome_counts.get((False, True), 0)

    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, precision, recall, f1_score

def evaluate_model(data,weights):
    """Evaluate the model once per class.

    Returns two ``(accuracy, precision, recall, f1_score)`` tuples: the
    first with label 0.0 as the positive class, the second with label 1.0.
    """
    metrics_0 = evaluate_metric(data, weights, 0.0)
    metrics_1 = evaluate_metric(data, weights, 1.0)

    # Unpack to exactly four values each, then return them as plain tuples.
    acc_0, prec_0, rec_0, f1_0 = metrics_0
    acc_1, prec_1, rec_1, f1_1 = metrics_1
    return (acc_0, prec_0, rec_0, f1_0), (acc_1, prec_1, rec_1, f1_1)


## Thực hiện huấn luyện mô hình

In [18]:
# Train on the training split: 100 full-batch gradient steps, step size 0.5.
weights= train_logistic_regression(train_data, num_iterations=100, learning_rate=0.5)

Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61
Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66
Iteration:  67
Iter

In [19]:
# Show the learned weights; the first entry multiplies the constant 1.0 bias input.
print("Weights: ", weights)

Weights:  [-0.5577639978785078, -0.30694549440031543, -0.5408651258524305, -0.4247669153042289, -0.4772513021873141, -0.12363960575701975, -0.4287851226174678, -0.14793041617408192, -0.15070090842031222, -0.4371641346338713, -0.26538898004604256, -0.2932234701854491, -0.14085722264095182, -0.41188768281612037, -0.25020860759978314, -0.37949833572669023, -0.18832219766500932, -0.2614502372595968, -0.42373956746939784, -0.37659174032415665, -0.30918282769040883, -0.3232941678156191, -0.3124674197063147, -0.28394373127893724, -0.37120163015368707, -0.21386204572836137, -0.3222649417113173, -0.23632189520566432, -0.23205062558936432, -0.17452512075003157, -0.0018464217114562117]


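## Đánh giá mô hình sau khi huấn luyện

**Lưu ý:** Dữ liệu mất cân bằng rất mạnh (gần như toàn bộ mẫu thuộc lớp 0), nên accuracy cao chưa chứng tỏ mô hình tốt; cần xem precision/recall/F1 của lớp 1.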

In [20]:
# Metrics on the training split: evaluation_0 treats label 0.0 as the
# positive class, evaluation_1 treats label 1.0 as the positive class.
# Each is an (accuracy, precision, recall, f1_score) tuple.
evaluation_0,evaluation_1= evaluate_model(train_data,weights)

In [21]:
# Accuracy is (tp + tn) / total; with only the two labels 0 and 1 it is the
# same whichever class is treated as positive, so one value is printed.
print("Accuracy: ", evaluation_0[0])

# The remaining metrics are shown per class as [class 0, class 1].
print("Precision [0,1]: ",[evaluation_0[1],evaluation_1[1]])
print("Recall [0,1]: ",[evaluation_0[2],evaluation_1[2]])
print("F1 Score [0,1]: ",[evaluation_0[3],evaluation_1[3]])

Accuracy:  0.9983082209886334
Precision [0,1]:  [0.9983082209886334, 0]
Recall [0,1]:  [1.0, 0.0]
F1 Score [0,1]:  [0.9991533943594899, 0]


## Thực hiện dự đoán trên tập test

In [22]:
# Pair each test row's predicted class with its true label: (prediction, label).
# This only defines the transformation; no action is triggered in this cell.
predictions = test_data.map(lambda x: (predict(weights, x[0]), x[1]))


## Đánh giá trên tập test

In [23]:
# Evaluate on the held-out split with the same helper used for the training
# split; the first tuple is for class 0, the second for class 1.
(
    (accuracy_0, precision_0, recall_0, f1_score_0),
    (accuracy_1, precision_1, recall_1, f1_score_1),
) = evaluate_model(test_data, weights)

print("Accuracy: ", accuracy_0)
print("Precision [0,1]: ", [precision_0,precision_1])
print("Recall [0,1]: ", [recall_0,recall_1])
print("F1 Score [0,1]: ", [f1_score_0,f1_score_1])

Accuracy:  0.998431607514186
Precision [0,1]:  [0.998431607514186, 0]
Recall [0,1]:  [1.0, 0.0]
F1 Score [0,1]:  [0.9992151883107149, 0]


## Lưu kết quả

In [24]:
# Attach the predicted class to every test row, then expose the rows as a
# DataFrame with named columns.
scored_rows = test_data.map(lambda x: (x[0], x[1], predict(weights, x[0])))
result = scored_rows.toDF(["features", "label", "prediction"])
result.show(5)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[1.0, 6.944765961...|  0.0|         0|
|[1.0, 6.944765961...|  0.0|         0|
|[1.0, 1.273207092...|  0.0|         0|
|[1.0, 1.331080142...|  0.0|         0|
|[1.0, 1.504699291...|  0.0|         0|
+--------------------+-----+----------+
only showing top 5 rows



In [25]:
# Collapse to a single partition before writing, and replace any existing
# output at the relative path "results".
result.coalesce(1).write.mode("overwrite").parquet("results")