### Import libraries

In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd
from datetime import datetime

import pyspark
from pyspark.sql.functions import when, col
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, FloatType, TimestampType
from pyspark.sql.functions import avg, datediff, to_date, when, col
from pyspark.sql import functions as F
from pyspark.sql.functions import col, min, max, lag, datediff, lit, coalesce, to_date, unix_timestamp, from_unixtime

### Setup SparkSession

In [40]:
# Create the Spark session for this notebook, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("MaintenanceTimerSystem")
    .getOrCreate()
)

### Load the two datasets

In [41]:
# Read the maintenance-log CSV (first row is the header, column types are
# inferred) and preview five rows without truncating cell contents.
# NOTE: the path below is absolute and specific to one machine.
df_1 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/home/beboy/Desktop/projects/Predictive-Maintenance-System-using-Apache-Spark/dataset/maintenance_logs.csv")
)
df_1.show(n=5, truncate=False)

+------------+--------------------------+----------------+--------------------------------+-------------+--------------+---------+--------------+------------------+
|equipment_id|date                      |maintenance_type|description                     |technician_id|duration_hours|cost     |parts_replaced|maintenance_result|
+------------+--------------------------+----------------+--------------------------------+-------------+--------------+---------+--------------+------------------+
|1           |2021-12-22 21:45:39.156781|Repair          |Repair maintenance performed    |44           |4.0433426     |422.3207 |None          |Successful        |
|1           |2022-05-02 21:45:39.156781|Inspection      |Inspection maintenance performed|36           |5.6460757     |4588.3525|None          |Successful        |
|1           |2022-08-15 21:45:39.156781|Inspection      |Inspection maintenance performed|27           |3.9407096     |195.60114|Filters       |Successful        |
|1        

In [42]:
# Read the second CSV (df_maintenance.csv) with a header row and inferred
# column types, then preview five untruncated rows.
# NOTE: the path below is absolute and specific to one machine.
df_2 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/home/beboy/Desktop/projects/Predictive-Maintenance-System-using-Apache-Spark/dataset/df_maintenance.csv")
)
df_2.show(n=5, truncate=False)

                                                                                

+------------+-----------------------+-----------+----------+----------+----------------+------------+-----------+---------+---------+-------------+----------+-------------+-----------------------+---------------+------------+--------------------+-----------------------+---------------------+-----------------------+---------+-----------+----------------+---------------------------------+-------------+--------------+---------+--------------+------------------+-----------------------+---------------+------------------+------------------+-----------+------------+--------------------+-------------------+----------------+-----------------------+----------------------+------------------+-------------------+------------------+-------------------+------------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+-----------------+

In [43]:
# Keep only the df_2 rows whose maintenance_needed value is exactly
# "No maintenance required", then preview five of them.
filtered_df = df_2.where(F.col("maintenance_needed") == "No maintenance required")
filtered_df.show(n=5, truncate=False)

+------------+-----------------------+-----------+----------+----------+----------------+------------+-----------+---------+---------+-------------+----------+-------------+-----------------------+---------------+------------+--------------------+-----------------------+---------------------+-----------------------+---------+-----------+----------------+---------------------------------+-------------+--------------+---------+--------------+------------------+-----------------------+---------------+------------------+------------------+-----------+------------+--------------------+-------------------+----------------+-----------------------+----------------------+------------------+-------------------+------------------+-------------------+------------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+-----------------+

### Method 1: Usage-Based

- Formula:
    1. days_between_maintenance = date - prev_maintenance_date
    2. operational_threshold(ngưỡng hoạt động, tính bằng giờ) = avg(days_between_maintenance) × 24
    3. total_current_operating_hours(tổng số giờ vận hành hiện tại) = (current_date hiện tại - ngày bảo dưỡng gần nhất(date)) × 24
    4. remaining_hours(số giờ còn lại) = operational_threshold(ngưỡng hoạt động) - total_current_operating_hours(tổng số giờ vận hành hiện tại)
    5. remaining_days(số ngày còn lại) = remaining_hours(số giờ còn lại) / avg_daily_operating_hours(số giờ vận hành trung bình mỗi ngày)

In [44]:
def remaining_hours(df, as_of_date=None):
    """Estimate, per equipment, the hours left until the next maintenance is due.

    For every ``equipment_id`` the maintenance interval is the mean number of
    days between consecutive maintenance events, expressed in hours
    (``operational_threshold``).  The hours elapsed between the most recent
    maintenance event and the reference date
    (``total_current_operating_hours``) are subtracted from that interval to
    give ``remaining_hours``.  A negative value means the interval has already
    been exceeded.

    Args:
        df: Spark DataFrame holding at least the columns ``equipment_id`` and
            ``date``, one row per maintenance event.
        as_of_date: Optional reference date used as "today" (a
            ``datetime.date`` or a ``'yyyy-MM-dd'`` string).  When ``None``
            (the default) Spark's ``current_date()`` is used, so results
            change from day to day; pass a fixed date for reproducible output.

    Returns:
        A DataFrame with the same rows as ``df`` (``date`` cast to timestamp)
        plus the columns ``prev_maintenance_date``,
        ``days_between_maintenance``, ``operational_threshold``,
        ``last_maintenance_date``, ``total_current_operating_hours`` and
        ``remaining_hours``.  The last four are constant within one
        ``equipment_id``.  Equipment with a single maintenance event has no
        interval to average, so its ``operational_threshold`` and
        ``remaining_hours`` are null.
    """
    # Unordered window: aggregates over it see every row of the equipment.
    per_equipment = Window.partitionBy('equipment_id')
    # Ordered window: needed by lag() to find the previous maintenance event.
    chronological = per_equipment.orderBy('date')

    reference_date = (
        F.current_date() if as_of_date is None else F.to_date(F.lit(as_of_date))
    )

    return (
        df
        # Normalise 'date' first so that ordering and lag() work on real
        # timestamps even if CSV schema inference left the column as a string.
        .withColumn('date', F.col('date').cast('timestamp'))
        .withColumn('prev_maintenance_date', F.lag('date').over(chronological))
        .withColumn(
            'days_between_maintenance',
            F.datediff(F.col('date'), F.col('prev_maintenance_date')),
        )
        # Mean interval in days -> hours; avg() skips the null produced for
        # the first event of each equipment.
        .withColumn(
            'operational_threshold',
            F.avg('days_between_maintenance').over(per_equipment) * 24,
        )
        .withColumn('last_maintenance_date', F.max('date').over(per_equipment))
        # Calendar hours elapsed since the most recent maintenance event.
        .withColumn(
            'total_current_operating_hours',
            F.datediff(reference_date, F.col('last_maintenance_date')) * 24,
        )
        .withColumn(
            'remaining_hours',
            F.col('operational_threshold') - F.col('total_current_operating_hours'),
        )
    )

# Apply the usage-based estimate to the maintenance log loaded as df_1.
result_df1 = remaining_hours(df=df_1)

# Preview up to 70 rows of the result.
result_df1.show(n=70)

+------------+--------------------+----------------+--------------------+-------------+--------------+----------+--------------+------------------+---------------------+------------------------+---------------------+---------------------+-----------------------------+------------------+
|equipment_id|                date|maintenance_type|         description|technician_id|duration_hours|      cost|parts_replaced|maintenance_result|prev_maintenance_date|days_between_maintenance|operational_threshold|last_maintenance_date|total_current_operating_hours|   remaining_hours|
+------------+--------------------+----------------+--------------------+-------------+--------------+----------+--------------+------------------+---------------------+------------------------+---------------------+---------------------+-----------------------------+------------------+
|           1|2024-11-06 21:45:...|          Repair|Repair maintenanc...|           40|      7.937225| 597.30164|          None|        

### Avg_daily_operating_hours from df_2

In [45]:
# Reduce the filtered df_2 rows to a single operating_hours value per
# equipment by averaging them.  Picking one row with dropDuplicates() would
# keep whichever row Spark happens to see first, so the value could differ
# between runs; the mean is deterministic and uses every filtered row.
filtered_df_unique = (
    filtered_df
    .groupBy('equipment_id')
    .agg(F.avg('operating_hours').alias('operating_hours'))
)

# Attach the per-equipment operating_hours to every row of result_df1.
# The left join keeps equipment that has no match in filtered_df_unique;
# operating_hours is null for those rows.
new_df1 = result_df1.join(
    filtered_df_unique.select('equipment_id', 'operating_hours'),
    on='equipment_id',
    how='left'
)

# Preview up to 70 rows of the joined result.
new_df1.show(70)

                                                                                

+------------+--------------------+----------------+--------------------+-------------+--------------+----------+--------------+------------------+---------------------+------------------------+---------------------+---------------------+-----------------------------+------------------+------------------+
|equipment_id|                date|maintenance_type|         description|technician_id|duration_hours|      cost|parts_replaced|maintenance_result|prev_maintenance_date|days_between_maintenance|operational_threshold|last_maintenance_date|total_current_operating_hours|   remaining_hours|   operating_hours|
+------------+--------------------+----------------+--------------------+-------------+--------------+----------+--------------+------------------+---------------------+------------------------+---------------------+---------------------+-----------------------------+------------------+------------------+
|           1|2024-11-06 21:45:...|          Repair|Repair maintenanc...|      

In [46]:
# Add avg_daily_operating_hours: the mean of operating_hours over all rows of
# the same equipment, repeated on each of those rows.
window_spec = Window.partitionBy('equipment_id').orderBy('date')
new_df1 = new_df1.withColumn(
    'avg_daily_operating_hours',
    # An ordered window defaults to a frame that ends at the current row,
    # which would turn avg() into a running average.  The explicit
    # whole-partition frame makes it the per-equipment mean on every row.
    F.avg('operating_hours').over(
        window_spec.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    ),
)

# Preview up to 70 rows of the result.
new_df1.show(70)

                                                                                

+------------+--------------------+----------------+--------------------+-------------+--------------+----------+--------------+------------------+---------------------+------------------------+---------------------+---------------------+-----------------------------+------------------+------------------+-------------------------+
|equipment_id|                date|maintenance_type|         description|technician_id|duration_hours|      cost|parts_replaced|maintenance_result|prev_maintenance_date|days_between_maintenance|operational_threshold|last_maintenance_date|total_current_operating_hours|   remaining_hours|   operating_hours|avg_daily_operating_hours|
+------------+--------------------+----------------+--------------------+-------------+--------------+----------+--------------+------------------+---------------------+------------------------+---------------------+---------------------+-----------------------------+------------------+------------------+-------------------------+
|

### Remaining days method 1

In [58]:
# Add remaining_days_method1: remaining_hours divided by
# avg_daily_operating_hours, divided by 24, rounded to one decimal place.
# NOTE(review): the formula documented for this method is
# remaining_hours / avg_daily_operating_hours; the code additionally divides
# by 24 — confirm which of the two is intended.
final_result1 = new_df1.withColumn(
    colName='remaining_days_method1',
    col=F.round(
        F.col('remaining_hours') / F.col('avg_daily_operating_hours') / 24,
        scale=1,
    ),
)

# Preview up to 70 rows of the result.
final_result1.show(n=70)

                                                                                

+------------+--------------------+----------------+--------------------+-------------+--------------+----------+--------------+------------------+---------------------+------------------------+---------------------+---------------------+-----------------------------+------------------+------------------+-------------------------+----------------------+
|equipment_id|                date|maintenance_type|         description|technician_id|duration_hours|      cost|parts_replaced|maintenance_result|prev_maintenance_date|days_between_maintenance|operational_threshold|last_maintenance_date|total_current_operating_hours|   remaining_hours|   operating_hours|avg_daily_operating_hours|remaining_days_method1|
+------------+--------------------+----------------+--------------------+-------------+--------------+----------+--------------+------------------+---------------------+------------------------+---------------------+---------------------+-----------------------------+------------------+-

In [49]:
# NOTE: this entire cell is disabled draft code for a health-score based
# estimate ("method 3"); nothing in it runs while it stays commented out.
# NOTE(review): if it is re-enabled, a decline_rate_per_day of exactly 0 is
# not null, so the coalesce(..., 0.1) fallback in step 3 does not apply and
# the division is by zero.  The first row of every equipment hits this case,
# because health_score_change is coalesced to 0 there.
# from pyspark.sql import functions as F
# from pyspark.sql.window import Window

# def method3_health_score(spark, df):
#     # Step 1: Calculate Health Score
#     weights = {
#         "temperature": 0.2,
#         "vibration": 0.3,
#         "pressure": 0.25,
#         "rotational_speed": 0.15,
#         "oil_viscosity": 0.1
#     }
#     result_df = df.withColumn(
#         "health_score",
#         100 - (
#             weights["temperature"] * F.col("temperature") +
#             weights["vibration"] * F.col("vibration") +
#             weights["pressure"] * F.col("pressure") +
#             weights["rotational_speed"] * F.col("rotational_speed") +
#             weights["oil_viscosity"] * F.col("oil_viscosity")
#         )
#     )
#     # Apply absolute value to health_score to avoid negative values
#     result_df = result_df.withColumn(
#         "health_score",
#         F.abs(F.col("health_score"))
#     )

#     # Step 2: Track health score trends
#     window_spec = Window.partitionBy("equipment_id").orderBy("timestamp")
#     result_df = result_df.withColumn(
#         "health_score_change",
#         F.coalesce(F.col("health_score") - F.lag("health_score", 1).over(window_spec), F.lit(0))
#     ).withColumn(
#         "days_between_measurements",
#         F.coalesce(F.datediff(F.col("timestamp"), F.lag("timestamp", 1).over(window_spec)), F.lit(1))
#     ).withColumn(
#         "decline_rate_per_day",
#         F.col("health_score_change") / F.col("days_between_measurements")
#     )

#     # Step 3: Predict remaining days
#     threshold_score = 60  # Threshold for maintenance
#     result_df = result_df.withColumn(
#         "remaining_days_method3",
#         (F.col("health_score") - F.lit(threshold_score)) /
#         F.abs(F.coalesce(F.col("decline_rate_per_day"), F.lit(0.1)))
#     )

#     return result_df

In [50]:
# NOTE: this entire cell is disabled draft code; nothing in it runs while it
# stays commented out.
# NOTE(review): the function takes `self` and stores `self.weights`, although
# no enclosing class is visible in this cell — confirm where it is meant to
# live before re-enabling.  The inline labels on the weights ("Method 2/3/4")
# are offset by one from the dictionary keys (method1/2/3).
# from pyspark.sql import functions as F
# from pyspark.sql.functions import lit, col

# def calculate_weighted_average(self, df):
#     """
#     Calculate the weighted average of all three methods
#     """
#     # Define the weights for each method
#     self.weights = {
#         'method1': 0.4,  # Method 2: Usage level (40%)
#         'method2': 0.3,  # Method 3: Maintenance history (30%)
#         'method3': 0.3   # Method 4: Health score (30%)
#     }

#     # Initialize expression with first method (method 1)
#     weighted_avg_expr = col('remaining_days_method1') * lit(self.weights['method1'])

#     # Add other methods (method 2 and method 3)
#     weighted_avg_expr = weighted_avg_expr + \
#         (col('remaining_days_method2') * lit(self.weights['method2'])) + \
#         (col('remaining_days_method3') * lit(self.weights['method3']))

#     # Calculate final remaining days as the weighted average
#     return df.withColumn('final_remaining_days', weighted_avg_expr)