In [6]:
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, TimestampType

In [7]:
# Start the Spark session used by every DataFrame in this notebook
# (getOrCreate reuses an already-running session instead of creating a second one).
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [8]:
# Location of the input CSV. Set the DATA_WITH_ALERT_CSV environment variable to run
# the notebook on another machine; when it is unset the original local path is used.
DATA_WITH_ALERT_CSV = os.environ.get(
    "DATA_WITH_ALERT_CSV",
    "C:\\Users\\admin\\Desktop\\University\\Big Data\\Predictive-Maintenance-System-using-Apache-Spark\\Data Processing & Analysis\\Dataset\\data_with_alert.csv",
)

# header=True takes column names from the first line; inferSchema=True lets Spark
# infer each column's type from the data instead of reading everything as strings.
df_pyspark = spark.read.csv(DATA_WITH_ALERT_CSV, header=True, inferSchema=True)

# Preview the first five rows without truncating wide cell values.
df_pyspark.show(5, truncate=False)

+------------+-----------------------+-----------+----------+---------+----------------+------------+-----------+---------+----------+-------------+----------+-------------+-----------------------+---------------+------------+--------------------+-----------------------+---------------------+-----------------------+---------+-----------+----------------+-----------------------------+-------------+--------------+---------+--------------+------------------+-----------------------+---------------+---------------+--------------+-----------+------------+--------------------+-------------------+----------------+-----------------------+----------------------+------------------+-------------------+------------------+-------------------+-----------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+-----------------+------------

# Building condition 2 #

In [9]:
from pyspark.sql.functions import when, col

# Map each alert level to a numeric score: Normal -> 0, Warning -> 30, Danger -> 50.
# There is no .otherwise() branch, so any other alert value (or a null alert)
# produces a null alert_score.
alert_level = col("alert")
alert_score_expr = (
    when(alert_level == "Normal", 0)
    .when(alert_level == "Warning", 30)
    .when(alert_level == "Danger", 50)
)
output2 = df_pyspark.withColumn("alert_score", alert_score_expr)

In [10]:
# Spot-check the mapping: show each alert level next to the score derived from it.
alert_preview = output2.select("alert", "alert_score")
alert_preview.show()

+-------+-----------+
|  alert|alert_score|
+-------+-----------+
| Normal|          0|
| Normal|          0|
| Normal|          0|
| Normal|          0|
| Danger|         50|
| Normal|          0|
| Danger|         50|
| Normal|          0|
| Normal|          0|
| Normal|          0|
| Danger|         50|
+-------+-----------+
only showing top 20 rows



## Apply the conditions (as in output 1): add 10% or nothing for each column's condition

## Columns: 
##          - equipment_age_days / (expected_lifetime_years * 365) (70-80%)
##          - downtime_hours / (operating_hours + downtime_hours) > 5%
##          - days_since_maintenance / (warranty_period_years * 365) > 50%
##          - ambient_temperature > 40 or ambient_humidity > 60
##          - cumulative_maintenance_cost / warranty_period_years

In [11]:
from pyspark.sql.functions import when, col

# Points added to the operational score for every condition that is met
# (five conditions, so operational_score ranges from 0 to 50).
CONDITION_POINTS = 10
DAYS_PER_YEAR = 365  # converts the *_years columns to days so each ratio compares like units

# Thresholds. The numeric values are unchanged from the original cell; only the
# names are new, so each number is defined (and documented) in exactly one place.
AGE_RATIO_THRESHOLD = 0.65          # equipment age / expected lifetime, met when >= threshold
DOWNTIME_RATIO_THRESHOLD = 0.05     # downtime / (operating + downtime) hours, met when > threshold
MAINTENANCE_RATIO_THRESHOLD = 0.5   # days since maintenance / warranty period, met when > threshold
MAX_AMBIENT_TEMPERATURE = 40        # met when ambient_temperature is above this
MAX_AMBIENT_HUMIDITY = 60           # met when ambient_humidity is above this
annual_cost_threshold = 750000      # cumulative maintenance cost per warranty year, met when > threshold


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a Column, or null when the denominator is not > 0.

    The division sits inside a ``when`` branch so it is only requested for rows with
    a positive denominator. With Spark's legacy (non-ANSI) arithmetic a zero
    denominator already produced null, so results are unchanged there; the guard is
    intended to keep the cell from failing with a divide-by-zero error when ANSI
    mode is enabled.
    """
    return when(denominator > 0, numerator / denominator)


def _condition_score(condition):
    """Return a Column equal to CONDITION_POINTS where ``condition`` is true, otherwise 0.

    A null condition (for example one built from a null ratio) also scores 0.
    """
    return when(condition, CONDITION_POINTS).otherwise(0)


# Keep only rows whose inputs are all non-negative. Rows holding a null in any of
# these columns are dropped too, because a null comparison never passes a filter.
filtered_output2 = output2.filter(
    (col("equipment_age_days") >= 0) &
    (col("operating_hours") >= 0) &
    (col("downtime_hours") >= 0) &
    (col("days_since_maintenance") >= 0) &
    (col("warranty_period_years") >= 0) &
    (col("cumulative_maintenance_cost") >= 0)
)

# Ratios used by the conditions below.
age_ratio = _safe_ratio(
    col("equipment_age_days"),
    col("expected_lifetime_years") * DAYS_PER_YEAR,
)
downtime_ratio = _safe_ratio(
    col("downtime_hours"),
    col("operating_hours") + col("downtime_hours"),
)
maintenance_ratio = _safe_ratio(
    col("days_since_maintenance"),
    col("warranty_period_years") * DAYS_PER_YEAR,
)
annual_maintenance_cost = _safe_ratio(
    col("cumulative_maintenance_cost"),
    col("warranty_period_years"),
)

filtered_output2 = (
    filtered_output2
    # 1. Age: a large share of the expected lifetime has been used.
    .withColumn("age_condition_score", _condition_score(age_ratio >= AGE_RATIO_THRESHOLD))
    # 2. Downtime: share of total hours spent down.
    .withColumn("downtime_condition_score", _condition_score(downtime_ratio > DOWNTIME_RATIO_THRESHOLD))
    # 3. Maintenance: time since the last maintenance relative to the warranty period.
    .withColumn("maintenance_condition_score", _condition_score(maintenance_ratio > MAINTENANCE_RATIO_THRESHOLD))
    # 4. Environment: either ambient reading above its limit is enough.
    .withColumn(
        "environment_condition_score",
        _condition_score(
            (col("ambient_temperature") > MAX_AMBIENT_TEMPERATURE)
            | (col("ambient_humidity") > MAX_AMBIENT_HUMIDITY)
        ),
    )
    # 5. Maintenance cost per warranty year.
    .withColumn(
        "maintenance_cost_condition_score",
        _condition_score(annual_maintenance_cost > annual_cost_threshold),
    )
    # Total operational score: the sum of the five condition scores.
    .withColumn(
        "operational_score",
        col("age_condition_score")
        + col("downtime_condition_score")
        + col("maintenance_condition_score")
        + col("environment_condition_score")
        + col("maintenance_cost_condition_score"),
    )
)

# Show the result
filtered_output2.select("age_condition_score", "downtime_condition_score", "maintenance_condition_score", "environment_condition_score", "maintenance_cost_condition_score", "operational_score").show()

+-------------------+------------------------+---------------------------+---------------------------+--------------------------------+-----------------+
|age_condition_score|downtime_condition_score|maintenance_condition_score|environment_condition_score|maintenance_cost_condition_score|operational_score|
+-------------------+------------------------+---------------------------+---------------------------+--------------------------------+-----------------+
|                  0|                       0|                         10|                          0|                               0|               10|
|                  0|                       0|                         10|                          0|                               0|               10|
|                  0|                      10|                         10|                          0|                               0|               20|
|                  0|                      10|                         10|  

In [12]:
from pyspark.sql import functions as F

# Condition-score columns whose hit rate is reported below.
condition_columns = [
    "age_condition_score",
    "downtime_condition_score",
    "maintenance_condition_score",
    "environment_condition_score",
    "maintenance_cost_condition_score"
]

# Compute the total row count and, per column, the number of rows that scored 10
# in ONE aggregation. The previous version ran two separate count jobs per column
# (and recomputed the unchanged total on every loop iteration).
aggregations = [F.count(F.lit(1)).alias("total_count")] + [
    F.sum(F.when(F.col(column) == 10, 1).otherwise(0)).alias(column)
    for column in condition_columns
]
counts_row = filtered_output2.agg(*aggregations).first()
total_count = counts_row["total_count"]

# Percentage of rows with a score of 10 in each column.
percentages = {}
for column in condition_columns:
    # The sum is null when the frame is empty; treat that as zero hits.
    count_10_percent = counts_row[column] or 0
    # Report 0% for an empty frame instead of raising ZeroDivisionError.
    percentages[column] = (count_10_percent / total_count) * 100 if total_count else 0.0

# Display the results
for column, percentage in percentages.items():
    print(f"Percentage of rows with 10% in {column}: {percentage:.2f}%")

Percentage of rows with 10% in age_condition_score: 3.99%
Percentage of rows with 10% in downtime_condition_score: 75.75%
Percentage of rows with 10% in maintenance_condition_score: 39.07%
Percentage of rows with 10% in environment_condition_score: 25.05%
Percentage of rows with 10% in maintenance_cost_condition_score: 70.94%


## Total score and broken status

In [13]:
# total_score = alert_score + operational_score; a row is flagged broken ("yes")
# when that total reaches 70, otherwise "no".
total_score_expr = col("alert_score") + col("operational_score")
filtered_output2 = (
    filtered_output2
    .withColumn("total_score", total_score_expr)
    .withColumn("broken", when(col("total_score") >= 70, "yes").otherwise("no"))
)

# Preview the two component scores next to the total and the resulting flag.
score_preview = filtered_output2.select(
    "equipment_id",
    "alert_score",
    "operational_score",
    "total_score",
    "broken",
)
score_preview.show(truncate=False)

+------------+-----------+-----------------+-----------+------+
|equipment_id|alert_score|operational_score|total_score|broken|
+------------+-----------+-----------------+-----------+------+
|1           |0          |10               |10         |no    |
|1           |30         |10               |40         |no    |
|1           |30         |20               |50         |no    |
|1           |0          |20               |20         |no    |
|1           |30         |20               |50         |no    |
|1           |30         |20               |50         |no    |
|1           |30         |10               |40         |no    |
|1           |30         |10               |40         |no    |
|1           |0          |10               |10         |no    |
|1           |30         |10               |40         |no    |
|1           |0          |20               |20         |no    |
|1           |50         |20               |70         |yes   |
|1           |0          |20            

In [14]:
# Number of rows for each value of the 'broken' flag.
df_counts = filtered_output2.groupBy("broken").count()

# Overall row count, used as the denominator for the class shares.
total_count = filtered_output2.count()

# Share of each 'broken' value, expressed as a percentage of all rows.
df_percentage = df_counts.withColumn(
    "percentage",
    (df_counts["count"] / total_count) * 100,
)

# Display the class balance.
df_percentage.select("broken", "count", "percentage").show(truncate=False)

+------+-------+------------------+
|broken|count  |percentage        |
+------+-------+------------------+
|no    |3497112|86.75566052012037 |
|yes   |533878 |13.244339479879633|
+------+-------+------------------+



# Train model # 

## XGBoost (overfitting) 

In [15]:
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler

# Binary target: 1 when the row was flagged broken ("yes"), 0 otherwise.
is_broken = col("broken") == "yes"
filtered_output2 = filtered_output2.withColumn("label", when(is_broken, 1).otherwise(0))

# Model inputs.
# NOTE(review): "broken" is defined in this notebook as
# alert_score + operational_score >= 70, so the label is an exact function of these
# two features. Perfect scores from any model trained on them are expected and say
# nothing about predictive power on raw sensor data.
feature_cols = ["alert_score", "operational_score"]  # Add any other relevant features

# Pack the feature columns into the single vector column "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
final_data = assembler.transform(filtered_output2)

In [16]:
# Keep only what the models need, then split 80% / 20% with a fixed seed.
model_input = final_data.select("features", "label")
train_data, test_data = model_input.randomSplit([0.8, 0.2], seed=42)

In [17]:
# Collect both Spark splits to the driver as pandas DataFrames.
train_df = train_data.toPandas()
test_df = test_data.toPandas()

# Features: one array per row, obtained from each row's vector via toArray().
# Labels: the "label" column as an array.
X_train = [vector.toArray() for vector in train_df["features"]]
X_test = [vector.toArray() for vector in test_df["features"]]
y_train = train_df["label"].values
y_test = test_df["label"].values

In [18]:
import xgboost as xgb

# Class-imbalance weight for the positive class: (# negatives) / (# positives).
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError.
    raise ValueError("y_train contains no positive samples (label == 1); cannot compute scale_pos_weight")
scale_pos_weight = n_negative / n_positive

# Booster parameters for xgb.train().
# "n_estimators" was removed: it is a parameter of the scikit-learn wrapper only and
# xgb.train() ignores it (it printed 'Parameters: { "n_estimators" } are not used.').
# With xgb.train() the number of trees is set by num_boost_round / early stopping.
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 4,                # Maximum depth of each tree
    "learning_rate": 0.1,          # Shrinkage applied to each boosting step
    "scale_pos_weight": scale_pos_weight,  # Handle class imbalance
    "subsample": 0.8,              # Use only 80% of data per tree
    "colsample_bytree": 0.8,       # Use 80% of features per tree
    "seed": 42
}


In [19]:
# Wrap features and labels in DMatrix objects, the input type xgb.train() works with.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [20]:
# Boost for at most 200 rounds, stopping once the monitored log-loss has not
# improved for 10 consecutive rounds.
# NOTE(review): the monitored set is the test set itself, so the test metrics
# reported afterwards come from data that also steered early stopping; consider a
# separate validation split for the watchlist.
watchlist = [(dtest, "Test")]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,
    evals=watchlist,
    early_stopping_rounds=10,
)


Parameters: { "n_estimators" } are not used.



[0]	Test-logloss:0.67760
[1]	Test-logloss:0.66549
[2]	Test-logloss:0.59349
[3]	Test-logloss:0.53439
[4]	Test-logloss:0.48520
[5]	Test-logloss:0.47658
[6]	Test-logloss:0.46945
[7]	Test-logloss:0.42804
[8]	Test-logloss:0.42206
[9]	Test-logloss:0.41698
[10]	Test-logloss:0.41269
[11]	Test-logloss:0.40907
[12]	Test-logloss:0.40599
[13]	Test-logloss:0.40337
[14]	Test-logloss:0.36679
[15]	Test-logloss:0.33559
[16]	Test-logloss:0.33303
[17]	Test-logloss:0.30594
[18]	Test-logloss:0.30360
[19]	Test-logloss:0.30160
[20]	Test-logloss:0.27768
[21]	Test-logloss:0.27580
[22]	Test-logloss:0.27416
[23]	Test-logloss:0.25291
[24]	Test-logloss:0.25134
[25]	Test-logloss:0.23265
[26]	Test-logloss:0.21645
[27]	Test-logloss:0.21475
[28]	Test-logloss:0.20036
[29]	Test-logloss:0.18785
[30]	Test-logloss:0.17694
[31]	Test-logloss:0.16742
[32]	Test-logloss:0.15913
[33]	Test-logloss:0.15190
[34]	Test-logloss:0.14909
[35]	Test-logloss:0.14674
[36]	Test-logloss:0.13986
[37]	Test-logloss:0.13766
[38]	Test-logloss:0.13

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted probabilities for every test row.
y_pred_prob = model.predict(dtest)

# Hard 0/1 predictions: positive only when the probability is strictly above 0.5.
y_pred = (y_pred_prob > 0.5).astype(int).tolist()

# Score the hard predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


## Random forest 

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Random Forest settings, collected in one place:
#   n_estimators - number of trees in the forest
#   max_depth    - depth limit for every tree
#   class_weight - "balanced" weights classes inversely to their frequency
#   random_state - fixed seed for reproducibility
rf_settings = dict(
    n_estimators=100,
    max_depth=6,
    class_weight="balanced",
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_settings)

# Fit on the same training split used for XGBoost.
rf_model.fit(X_train, y_train)

In [23]:
# Predict class labels for the test set with the fitted Random Forest.
# This rebinds y_pred, replacing the XGBoost predictions stored under the same name.
y_pred = rf_model.predict(X_test)

In [24]:
# Score the Random Forest predictions against the true test labels, applying the
# four metrics in the same order as the names they are unpacked into.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
