### Importing Libraries

In [1]:
from pyspark.sql import SparkSession
import findspark
findspark.init() # Initialize findspark to locate Spark
from pyspark.sql.window import Window

### Setup SparkSession

In [2]:
# Create a Spark session
# This is the entry point to programming Spark with the DataFrame API
# spark = SparkSession \
#     .builder \
#     .appName("Predictive Maintenance System") \
#     .getOrCreate()
# Clone về thì mọi người chạy code trên vì code dưới t avoid đi mấy cái warning

try:
    # Get active SparkSession if it exists
    active_spark = SparkSession.builder.getOrCreate()
    # Stop it
    active_spark.stop()
except:
    pass

# Create new session
spark = SparkSession \
    .builder \
    .appName("Model Development of Output1") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.ui.showConsoleProgress", "false") \
    .config("spark.log.level", "ERROR") \
    .config("spark.driver.memory", "16g")  \
    .config("spark.executor.memory", "16g") \
    .config("spark.memory.fraction", "0.8") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
{"ts":"2024-10-20T13:33:29.489Z","level":"WARN","msg":"Your hostname, beboy-ThinkPad-E14-Gen-5, resolves to a loopback address: 127.0.1.1; using 192.168.1.14 instead (on interface wlp0s20f3)","context":{"host":"beboy-ThinkPad-E14-Gen-5","host_port":"127.0.1.1","host_port2":"192.168.1.14","network_if":"wlp0s20f3"},"logger":"Utils"}
{"ts":"2024-10-20T13:33:29.494Z","level":"WARN","msg":"Set SPARK_LOCAL_IP if you need to bind to another address","logger":"Utils"}
Using Spark's default log4j profile: org/apache/spark/log4j2-pattern-layout-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/20 20:33:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".


### Load and Read Dataset

In [3]:
# Read the dataset from a CSV file
df = spark.read.csv('/home/beboy/Documents/projects/Predictive-Maintenance-System-using-Apache-Spark/Models Development/dataset/final_data_update.csv', header=True, inferSchema=True)

# Show the first 5 rows of the DataFrame to verify the data
df.show(10)

+------------+--------------------+-----------+----------+---------+----------------+------------+-----------+---------+----------+-------------+----------+-------------+--------------------+---------------+------------+--------------------+-----------------------+---------------------+--------------------+---------+-----------+----------------+--------------------+-------------+--------------+---------+--------------+------------------+--------------------+---------------+---------------+--------------+-----------+------------+--------------------+-------------------+----------------+--------------------+----------------------+------------------+-------------------+------------------+-------------------+-----------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+
|equipment_id|           timestamp|temperature| vibra

In [4]:
df.printSchema()

root
 |-- equipment_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- temperature: double (nullable = true)
 |-- vibration: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- rotational_speed: double (nullable = true)
 |-- power_output: double (nullable = true)
 |-- noise_level: double (nullable = true)
 |-- voltage: double (nullable = true)
 |-- current: double (nullable = true)
 |-- oil_viscosity: double (nullable = true)
 |-- model: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)
 |-- max_temperature: double (nullable = true)
 |-- max_pressure: double (nullable = true)
 |-- max_rotational_speed: double (nullable = true)
 |-- expected_lifetime_years: double (nullable = true)
 |-- warranty_period_years: integer (nullable = true)
 |-- last_major_overhaul: timestamp (nullable = true)
 |-- location: string (nullable = true)
 |-- criticality: string (nullable = true)
 |

### Setup Condition
* Classify warnings based on device type refrigeration (cold) or heat generation (heat) to provide appropriate temperature thresholds for each device. This will help avoid giving incorrect warnings when the device is operating within the temperature range it is designed to handle.
* Because there is no explicit information about the equipment type in the dataset, I will add a new column for the equipment type (equipment_type) to apply different temperature conditions to each type.

### Checking Outlier Again to Setup Condition
* All the outlier must be risk to Danger

In [5]:
def detect_outliers(df, column):
    # Calculate Q1, Q3 and IQR values
    q1, q3 = df.approxQuantile(column, [0.25, 0.75], 0.01)
    iqr = q3 - q1

    # Calculate threshold for outliers
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Count the number of outliers
    outlier_count = df.filter((F.col(column) < lower_bound) | (F.col(column) > upper_bound)).count()
    
    # Đánh dấu cột mới là 'Danger' nếu là ngoại lai, nếu không thì 'Normal' tránh trường hợp xung đột kiểu dữ liệu
    # Ở đây sẽ đánh dấu tất cả ngoại lai cho giá trị Danger vì ngoại lai chắc chắn out range
    return df.withColumn(
        f"{column}_is_outlier",
        F.when((F.col(column) < lower_bound) | (F.col(column) > upper_bound), "Danger")
        .otherwise("not_outlier") # ở đây những giá trị không phải outlier sẽ được lưu dưới giá trị not_outlier 
        # và chờ xử lý ở hàm điều kiện bên dưới, vì nó không outlier nhưng mình chưa biết nó Danger, Warning hay Normal
    )

### Calculated Mean and Median to reset the condition if the class Imbalance occur

In [6]:
from pyspark.sql import functions as F

def calculate_statistics(df):
    columns = [
        "temperature", "vibration", "power_output", "noise_level", "voltage", "current",
        "oil_viscosity", "pressure_pct_of_max", "speed_pct_of_max"
    ]
    
    # Mean, Median and Min for each column
    statistics = {}
    
    for column in columns:
        avg_value = df.agg(F.avg(column).alias(f"{column}_mean")).collect()[0][0]
        median_value = df.agg(F.expr(f"percentile_approx({column}, 0.5)").alias(f"{column}_median")).collect()[0][0]
        min_value = df.agg(F.min(column).alias(f"{column}_min")).collect()[0][0]  # Tính toán giá trị min
        max_value = df.agg(F.max(column).alias(f"{column}_max")).collect()[0][0]
        
        
        statistics[column] = {
            "mean": avg_value,
            "median": median_value,
            "min": min_value,  # Thêm giá trị min vào dictionary
            "max": max_value

        }
    
    return statistics

In [7]:
# statistics_results = calculate_statistics(df)
# 
# # In ra kết quả
# for column, stats in statistics_results.items():
#     print(f"{column} - Mean: {stats['mean']}, Median: {stats['median']}, Min: {stats['min']}, Max: {stats['max']}")

* temperature - Mean: 60.004399469304055, Median: 60.002754, Min: 9.845933, Max: 108.521286
* vibration - Mean: 0.5000061973427192, Median: 0.50003743, Min: 0.0037688063, Max: 1.0136985
* power_output - Mean: 499.9713855781973, Median: 499.97165, Min: 247.90448, Max: 761.1027
* noise_level - Mean: 70.00087360821533, Median: 69.99943, Min: 43.538265, Max: 95.24965
* voltage - Mean: 219.99585671643035, Median: 219.99315, Min: 170.51266, Max: 268.44962
* current - Mean: 100.00216587919647, Median: 99.99826, Min: 46.68723, Max: 150.18857
* oil_viscosity - Mean: 49.999960665454616, Median: 50.001106, Min: 23.71131, Max: 76.15404
* pressure_pct_of_max - Mean: 57.81472383758579, Median: 57.42169497723175, Min: 1.75786780081056, Max: 127.73101939525205
* speed_pct_of_max - Mean: 69.49498109640838, Median: 66.97644685811004, Min: 26.957618009699598, Max: 142.21353056113466

### Calculated the Interval Range of each columns for case Class Imblance

In [8]:
from pyspark.sql import functions as F

def calculate_intervals(df, column):
    # Calculate Q1, Q3 and IQR values
    q1, q3 = df.approxQuantile(column, [0.25, 0.75], 0.01)
    iqr = q3 - q1

    # Calculate threshold for outliers
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Return interval range
    return (lower_bound, upper_bound)

# # Danh sách các cột cần tính toán
# columns_to_check = [
#     "temperature", "vibration", "power_output", "noise_level", 
#     "voltage", "current", "oil_viscosity", "pressure_pct_of_max", "speed_pct_of_max"
# ]
# 
# # Tính toán khoảng giá trị cho tất cả các cột và in ra kết quả
# for column in columns_to_check:
#     lower_bound, upper_bound = calculate_intervals(df, column)
#     print(f"Column: {column} - Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

### Checking all the outlier columns after initialize
* Because we need to set all the value of outlier to True and False, so we can using it to checking the condition of the System Warning
* Also because in the EDA part, decision is keeping the outlier => all the outlier here will be the Danger 


In [9]:
# Call detect_outliers function for columns in correlation condition
# Because column_is_outlier is only created in the detect_outliers function
df = detect_outliers(df, "temperature")
df = detect_outliers(df, "vibration")
df = detect_outliers(df, "noise_level")
df = detect_outliers(df, "oil_viscosity")
df = detect_outliers(df, "power_output")
df = detect_outliers(df, "pressure_pct_of_max")
df = detect_outliers(df, "speed_pct_of_max")
df = detect_outliers(df, "voltage")
df = detect_outliers(df, "current")

In [10]:
# Checking for the new columns represent outlier value 
df.show(5)

+------------+--------------------+-----------+----------+---------+----------------+------------+-----------+---------+----------+-------------+----------+-------------+--------------------+---------------+------------+--------------------+-----------------------+---------------------+--------------------+---------+-----------+----------------+--------------------+-------------+--------------+---------+--------------+------------------+--------------------+---------------+---------------+--------------+-----------+------------+--------------------+-------------------+----------------+--------------------+----------------------+------------------+-------------------+------------------+-------------------+-----------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+----------------------+--------------------+----------

In [11]:
# Checking datatype again
df.printSchema()

root
 |-- equipment_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- temperature: double (nullable = true)
 |-- vibration: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- rotational_speed: double (nullable = true)
 |-- power_output: double (nullable = true)
 |-- noise_level: double (nullable = true)
 |-- voltage: double (nullable = true)
 |-- current: double (nullable = true)
 |-- oil_viscosity: double (nullable = true)
 |-- model: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)
 |-- max_temperature: double (nullable = true)
 |-- max_pressure: double (nullable = true)
 |-- max_rotational_speed: double (nullable = true)
 |-- expected_lifetime_years: double (nullable = true)
 |-- warranty_period_years: integer (nullable = true)
 |-- last_major_overhaul: timestamp (nullable = true)
 |-- location: string (nullable = true)
 |-- criticality: string (nullable = true)
 |

In [12]:
def add_equipment_classifications(df):
    # Keep equipment_type logic same
    window_spec = Window.partitionBy("equipment_id")
    df = df.withColumn(
        "avg_temp", F.avg("temperature").over(window_spec)
    ).withColumn(
        "equipment_type",
        F.when(F.col("avg_temp") < 50, "cold").otherwise("hot")
    )

    # Modified temperature warning thresholds
    df = df.withColumn(
        "temp_warning",
        F.when(F.col("temperature_is_outlier") == "Danger", "Danger")
         .when((F.col("equipment_type") == "cold") & (F.col("temperature") > 25), "Danger")
         .when((F.col("equipment_type") == "cold") & (F.col("temperature") > 20), "Warning")
         .when((F.col("equipment_type") == "hot") & (F.col("temperature") > 105), "Danger")
         .when((F.col("equipment_type") == "hot") & (F.col("temperature") < 55), "Warning")
         .otherwise("Normal")
    )
    
    # Adjusted vibration thresholds
    df = df.withColumn(
        "vibration_warning",
        F.when(F.col("vibration_is_outlier") == "Danger", "Danger")
         .when(F.col("vibration") > 0.9, "Danger")
         .when(F.col("vibration") > 0.75, "Warning")
         .otherwise("Normal")
    )

    # Modified pressure thresholds
    df = df.withColumn(
        "pressure_warning",
        F.when(F.col("pressure_pct_of_max_is_outlier") == "Danger", "Danger")
         .when(F.col("pressure_pct_of_max") >= 95, "Danger")
         .when(F.col("pressure_pct_of_max") >= 85, "Warning")
         .otherwise("Normal")
    )

    # Adjusted speed thresholds
    df = df.withColumn(
        "speed_warning",
        F.when(F.col("speed_pct_of_max_is_outlier") == "Danger", "Danger")
         .when(F.col("speed_pct_of_max") >= 95, "Danger")
         .when(F.col("speed_pct_of_max") >= 85, "Warning")
         .otherwise("Normal")
    )

    # Modified power thresholds
    df = df.withColumn(
        "power_warning",
        F.when(F.col("power_output_is_outlier") == "Danger", "Danger")
         .when(F.col("power_output") > 700, "Danger")
         .when(F.col("power_output") > 650, "Warning")
         .otherwise("Normal")
    )

    # Adjusted noise thresholds
    df = df.withColumn(
        "noise_warning",
        F.when(F.col("noise_level_is_outlier") == "Danger", "Danger")
         .when(F.col("noise_level") > 92, "Danger")
         .when(F.col("noise_level") > 85, "Warning")
         .otherwise("Normal")
    )

    # Modified oil viscosity thresholds
    df = df.withColumn(
        "oil_warning",
        F.when(F.col("oil_viscosity_is_outlier") == "Danger", "Danger")
         .when((F.col("oil_viscosity") < 20) | (F.col("oil_viscosity") > 75), "Danger")
         .when((F.col("oil_viscosity") < 35) | (F.col("oil_viscosity") > 65), "Warning")
         .otherwise("Normal")
    )
    
    # Combine all warnings to determine system health (system_warning)
    df = df.withColumn(
        "system_warning",
        F.when((F.col("temp_warning") == "Danger") | 
               (F.col("vibration_warning") == "Danger") | 
               (F.col("pressure_warning") == "Danger") | 
               (F.col("speed_warning") == "Danger") | 
               (F.col("noise_warning") == "Danger") | 
               (F.col("oil_warning") == "Danger"), "Danger")
         .when((F.col("temp_warning") == "Warning") | 
               (F.col("vibration_warning") == "Warning") | 
               (F.col("pressure_warning") == "Warning") | 
               (F.col("speed_warning") == "Warning") | 
               (F.col("power_warning") == "Warning") | 
               (F.col("noise_warning") == "Warning") | 
               (F.col("oil_warning") == "Warning"), "Warning")
        .otherwise("Normal")
    )   
    return df

In [13]:
df_with_warnings = add_equipment_classifications(df)

### Checking the new dataset contain the label for output1 "system_warning"

In [14]:
df_with_warnings.show(5)

+------------+--------------------+-----------+----------+---------+----------------+------------+-----------+---------+---------+-------------+----------+-------------+--------------------+---------------+------------+--------------------+-----------------------+---------------------+--------------------+---------+-----------+----------------+--------------------+-------------+--------------+---------+--------------+------------------+--------------------+---------------+------------------+------------------+-----------+------------+--------------------+-------------------+----------------+--------------------+----------------------+------------------+-------------------+------------------+-------------------+-----------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+----------------------+--------------------+----

In [15]:
# Kiểm tra tất cả các giá trị duy nhất trong cột system_warning
unique_system_warnings = df_with_warnings.select("system_warning").distinct()
unique_system_warnings.show()

+--------------+
+--------------+
|        Danger|
|        Normal|
+--------------+



In [16]:
# Đếm số lượng các giá trị trong cột system_warning
summary = df_with_warnings.groupBy("system_warning").count()

# Tính tổng số bản ghi để tính tỷ lệ phần trăm
total_count = df_with_warnings.count()

# Tính tỷ lệ phần trăm cho mỗi loại
summary = summary.withColumn("percent", (F.col("count") / total_count) * 100)

# Hiển thị kết quả
summary.show()

+--------------+-------+------------------+
+--------------+-------+------------------+
|        Danger| 469405|10.721904979442668|
|        Normal|2350145| 53.68079031521242|
+--------------+-------+------------------+



# MODEL DEVELOPMENT
### Focus on and compare 2 type of Model 
* 1 Gradient Boosting Machines (GBM)
* 2 Random Forest

### Setup

In [17]:
!pip install scikit-learn
!pip install xgboost
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn



In [18]:
# Imports
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import joblib

from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [19]:
df_with_warnings.printSchema()

root
 |-- equipment_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- temperature: double (nullable = true)
 |-- vibration: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- rotational_speed: double (nullable = true)
 |-- power_output: double (nullable = true)
 |-- noise_level: double (nullable = true)
 |-- voltage: double (nullable = true)
 |-- current: double (nullable = true)
 |-- oil_viscosity: double (nullable = true)
 |-- model: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)
 |-- max_temperature: double (nullable = true)
 |-- max_pressure: double (nullable = true)
 |-- max_rotational_speed: double (nullable = true)
 |-- expected_lifetime_years: double (nullable = true)
 |-- warranty_period_years: integer (nullable = true)
 |-- last_major_overhaul: timestamp (nullable = true)
 |-- location: string (nullable = true)
 |-- criticality: string (nullable = true)
 |

In [20]:
# Kiểm tra kiểu dữ liệu
print("Kiểu dữ liệu của các cột:")
for col_name in df_with_warnings.columns:
    print(f"{col_name}: {df_with_warnings.select(col_name).dtypes[0][1]}")

Kiểu dữ liệu của các cột:
equipment_id: int
timestamp: timestamp
temperature: double
vibration: double
pressure: double
rotational_speed: double
power_output: double
noise_level: double
voltage: double
current: double
oil_viscosity: double
model: string
manufacturer: string
installation_date: timestamp
max_temperature: double
max_pressure: double
max_rotational_speed: double
expected_lifetime_years: double
warranty_period_years: int
last_major_overhaul: timestamp
location: string
criticality: string
maintenance_type: string
description: string
technician_id: int
duration_hours: double
cost: double
parts_replaced: string
maintenance_result: string
maintenance_date: timestamp
production_rate: double
operating_hours: double
downtime_hours: double
operator_id: int
product_type: string
raw_material_quality: string
ambient_temperature: double
ambient_humidity: double
operation_date: timestamp
days_since_maintenance: int
equipment_age_days: int
days_since_overhaul: int
temp_pct_of_max: double

In [21]:
# Encode categorical variables
indexer = StringIndexer(inputCol='system_warning', outputCol='system_warning_encoded')
final_df = indexer.fit(df_with_warnings).transform(df_with_warnings)
final_df.show(5)

+------------+--------------------+-----------+----------+---------+----------------+------------+-----------+---------+---------+-------------+----------+-------------+--------------------+---------------+------------+--------------------+-----------------------+---------------------+--------------------+---------+-----------+----------------+--------------------+-------------+--------------+---------+--------------+------------------+--------------------+---------------+------------------+------------------+-----------+------------+--------------------+-------------------+----------------+--------------------+----------------------+------------------+-------------------+------------------+-------------------+-----------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+----------------------+--------------------+----

In [22]:
# Kiểm tra kiểu dữ liệu
print("Kiểu dữ liệu của các cột:")
for col_name in final_df.columns:
    print(f"{col_name}: {final_df.select(col_name).dtypes[0][1]}")

Kiểu dữ liệu của các cột:
equipment_id: int
timestamp: timestamp
temperature: double
vibration: double
pressure: double
rotational_speed: double
power_output: double
noise_level: double
voltage: double
current: double
oil_viscosity: double
model: string
manufacturer: string
installation_date: timestamp
max_temperature: double
max_pressure: double
max_rotational_speed: double
expected_lifetime_years: double
warranty_period_years: int
last_major_overhaul: timestamp
location: string
criticality: string
maintenance_type: string
description: string
technician_id: int
duration_hours: double
cost: double
parts_replaced: string
maintenance_result: string
maintenance_date: timestamp
production_rate: double
operating_hours: double
downtime_hours: double
operator_id: int
product_type: string
raw_material_quality: string
ambient_temperature: double
ambient_humidity: double
operation_date: timestamp
days_since_maintenance: int
equipment_age_days: int
days_since_overhaul: int
temp_pct_of_max: double

In [23]:
# Chọn features và label
feature_columns = [col for col in df_with_warnings.columns if col not in [
    'system_warning', 
    'system_warning_encoded', 
    'equipment_type', 
    'equipment_id', 
    'timestamp', 
    'rotational_speed', 
    'voltage', 
    'current', 
    'model', 
    'manufacturer', 
    'installation_date', 
    'max_temperature', 
    'max_pressure', 
    'max_rotational_speed', 
    'expected_lifetime_years', 
    'warranty_period_years', 
    'last_major_overhaul', 
    'location', 
    'criticality', 
    'maintenance_type', 
    'description', 
    'technician_id', 
    'duration_hours', 
    'cost', 
    'parts_replaced', 
    'maintenance_result', 
    'maintenance_date', 
    'production_rate', 
    'operating_hours', 
    'downtime_hours', 
    'operator_id', 
    'product_type', 
    'raw_material_quality', 
    'ambient_temperature', 
    'ambient_humidity', 
    'operation_date', 
    'days_since_maintenance', 
    'equipment_age_days', 
    'days_since_overhaul', 
    'cumulative_maintenance_cost', 
    'cumulative_operating_hours', 
    'estimated_rul', 
    'criticality_encoded', 
    'maintenance_type_encoded', 
    'maintenance_result_encoded', 
    'product_type_encoded', 
    'raw_material_quality_encoded', 
    'parts_replaced_encoded', 
    'temperature_is_outlier', 
    'vibration_is_outlier', 
    'noise_level_is_outlier', 
    'oil_viscosity_is_outlier', 
    'power_output_is_outlier', 
    'pressure_pct_of_max_is_outlier', 
    'speed_pct_of_max_is_outlier', 
    'voltage_is_outlier', 
    'current_is_outlier', 
    'avg_temp', 
    'temp_warning', 
    'vibration_warning', 
    'pressure_warning', 
    'speed_warning', 
    'power_warning', 
    'noise_warning', 
    'oil_warning']]

In [24]:
# Tạo vector features
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data_model = assembler.transform(final_df)

# Chia train/test (70/30)
train_data, test_data = data_model.randomSplit([0.8, 0.2], seed=42)

In [25]:
train_data.show(5)

+------------+--------------------+-----------+----------+---------+----------------+------------+-----------+---------+---------+-------------+----------+-------------+--------------------+---------------+------------+--------------------+-----------------------+---------------------+--------------------+---------+-----------+----------------+--------------------+-------------+--------------+---------+--------------+------------------+--------------------+---------------+------------------+------------------+-----------+------------+--------------------+-------------------+----------------+--------------------+----------------------+------------------+-------------------+------------------+-------------------+-----------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+----------------------+--------------------+----

In [None]:
# # GBM Model
gbm = GBTClassifier(labelCol="system_warning_encoded", featuresCol="features", maxIter=10)
gbm_model = gbm.fit(train_data)

# Random Forest Model
rf = RandomForestClassifier(labelCol="system_warning_encoded", featuresCol="features", numTrees=100)
rf_model = rf.fit(train_data)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="label")
# 
# # Dự đoán và đánh giá
gbm_predictions = gbm_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)
# 
# # Tính AUC-ROC
gbm_auc = evaluator.evaluate(gbm_predictions)
rf_auc = evaluator.evaluate(rf_predictions)
# 
print(f"GBM AUC: {gbm_auc}")
print(f"Random Forest AUC: {rf_auc}")

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col

# Tạo hàm đánh giá metrics
def evaluate_model(predictions, model_name):
    # Accuracy
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    
    # Precision
    evaluator.setMetricName("weightedPrecision")
    precision = evaluator.evaluate(predictions)
    
    # Recall
    evaluator.setMetricName("weightedRecall")
    recall = evaluator.evaluate(predictions)
    
    # F1 Score
    evaluator.setMetricName("f1")
    f1 = evaluator.evaluate(predictions)
    
    print(f"\nMetrics for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    # Confusion Matrix
    predictions_and_labels = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(predictions_and_labels.rdd.map(lambda x: (float(x[0]), float(x[1]))))
    print("\nConfusion Matrix:")
    print(metrics.confusionMatrix().toArray())
# 
# # Đánh giá cho cả 2 mô hình
evaluate_model(gbm_predictions, "Gradient Boosting Machine")
evaluate_model(rf_predictions, "Random Forest")