In [1]:
import pyspark
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, TimestampType

In [2]:
from pyspark.sql import SparkSession

In [4]:
spark=SparkSession.builder.appName('Practise').getOrCreate()

In [5]:
def define_schema():
    return StructType([
        StructField("equipment_id", IntegerType(), True),
        StructField("timestamp", TimestampType(), True),
        StructField("temperature", DoubleType(), True),
        StructField("vibration", DoubleType(), True),
        StructField("pressure", DoubleType(), True),
        StructField("rotational_speed", DoubleType(), True),
        StructField("power_output", DoubleType(), True),
        StructField("noise_level", DoubleType(), True),
        StructField("voltage", DoubleType(), True),
        StructField("current", DoubleType(), True),
        StructField("oil_viscosity", DoubleType(), True),
        StructField("model", StringType(), True),
        StructField("manufacturer", StringType(), True),
        StructField("installation_date", TimestampType(), True),
        StructField("max_temperature", DoubleType(), True),
        StructField("max_pressure", DoubleType(), True),
        StructField("max_rotational_speed", DoubleType(), True),
        StructField("expected_lifetime_years", DoubleType(), True),
        StructField("warranty_period_years", IntegerType(), True),
        StructField("last_major_overhaul", TimestampType(), True),
        StructField("location", StringType(), True),
        StructField("criticality", StringType(), True),
        StructField("maintenance_type", StringType(), True),
        StructField("description", StringType(), True),
        StructField("technician_id", IntegerType(), True),
        StructField("duration_hours", DoubleType(), True),
        StructField("cost", DoubleType(), True),
        StructField("parts_replaced", StringType(), True),
        StructField("maintenance_result", StringType(), True),
        StructField("maintenance_date", TimestampType(), True),
        StructField("production_rate", DoubleType(), True),
        StructField("operating_hours", DoubleType(), True),
        StructField("downtime_hours", DoubleType(), True),
        StructField("operator_id", IntegerType(), True),
        StructField("product_type", StringType(), True),
        StructField("raw_material_quality", StringType(), True),
        StructField("ambient_temperature", DoubleType(), True),
        StructField("ambient_humidity", DoubleType(), True),
        StructField("operation_date", TimestampType(), True),
        StructField("days_since_maintenance", IntegerType(), True),
        StructField("equipment_age_days", IntegerType(), True),
        StructField("days_since_overhaul", IntegerType(), True),
        StructField("temp_pct_of_max", DoubleType(), True),
        StructField("pressure_pct_of_max", DoubleType(), True),
        StructField("speed_pct_of_max", DoubleType(), True),
        StructField("cumulative_maintenance_cost", DoubleType(), True),
        StructField("cumulative_operating_hours", DoubleType(), True),
        StructField("estimated_rul", DoubleType(), True),
        StructField("criticality_encoded", DoubleType(), True),
        StructField("maintenance_type_encoded", DoubleType(), True),
        StructField("maintenance_result_encoded", DoubleType(), True),
        StructField("product_type_encoded", DoubleType(), True),
        StructField("raw_material_quality_encoded", DoubleType(), True),
        StructField("parts_replaced_encoded", DoubleType(), True)
    ])
schema = define_schema()
df_pyspark = spark.read.csv("C:\\Users\\admin\\Desktop\\University\\Big Data\\Predictive-Maintenance-System-using-Apache-Spark\\Data Processing & Analysis\\Dataset\\final_data_update.csv",header=True, schema = schema)
df_pyspark.printSchema()

root
 |-- equipment_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- temperature: double (nullable = true)
 |-- vibration: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- rotational_speed: double (nullable = true)
 |-- power_output: double (nullable = true)
 |-- noise_level: double (nullable = true)
 |-- voltage: double (nullable = true)
 |-- current: double (nullable = true)
 |-- oil_viscosity: double (nullable = true)
 |-- model: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)
 |-- max_temperature: double (nullable = true)
 |-- max_pressure: double (nullable = true)
 |-- max_rotational_speed: double (nullable = true)
 |-- expected_lifetime_years: double (nullable = true)
 |-- warranty_period_years: integer (nullable = true)
 |-- last_major_overhaul: timestamp (nullable = true)
 |-- location: string (nullable = true)
 |-- criticality: string (nullable = true)
 |

In [6]:
df_pyspark.show(10)

+------------+--------------------+-----------+----------+---------+----------------+------------+-----------+---------+----------+-------------+----------+-------------+--------------------+---------------+------------+--------------------+-----------------------+---------------------+--------------------+---------+-----------+----------------+--------------------+-------------+--------------+---------+--------------+------------------+--------------------+---------------+---------------+--------------+-----------+------------+--------------------+-------------------+----------------+--------------------+----------------------+------------------+-------------------+------------------+-------------------+-----------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+
|equipment_id|           timestamp|temperature| vibra

# Temperature #

In [46]:
# Calculate the 60th and 90th percentiles for the temperature column
temperature_percentiles = df_pyspark.approxQuantile("temperature", [0.6, 0.9], 0.0)

# Extract the 60th and 90th percentile values
temperature_60th = temperature_percentiles[0]
temperature_90th = temperature_percentiles[1]

# Print the thresholds
print(f"Temperature 60th percentile (Normal to Warning boundary): {temperature_60th}")
print(f"Temperature 90th percentile (Warning to Danger boundary): {temperature_90th}")



In [47]:
from pyspark.sql.functions import when
# Create a new column 'temperature_category' based on the 60th and 90th percentile thresholds
df_pyspark = df_pyspark.withColumn(
    "temperature_category",
    when(df_pyspark["temperature"] <= temperature_60th, "Normal")
    .when((df_pyspark["temperature"] > temperature_60th) & (df_pyspark["temperature"] <= temperature_90th), "Warning")
    .otherwise("Danger")
)

# Show the result for temperature categories
df_pyspark.select("temperature", "temperature_category").show(10)

+-----------+--------------------+
|temperature|temperature_category|
+-----------+--------------------+
|   47.79757|              Normal|
|  53.125355|              Normal|
|  53.964127|              Normal|
|    74.9544|              Danger|
|  59.535942|              Normal|
|  56.412777|              Normal|
+-----------+--------------------+
only showing top 10 rows



In [48]:
# Calculate total rows for temperature
total_rows_temperature = df_pyspark.count()

# Group by temperature_category and calculate counts
category_distribution_temperature = df_pyspark.groupBy("temperature_category").count()

# Calculate percentages for temperature
category_distribution_temperature = category_distribution_temperature.withColumn(
    "percentage", (category_distribution_temperature["count"] / total_rows_temperature) * 100
)

# Show the distribution for temperature categories
category_distribution_temperature.show()

+--------------------+-------+----------+
|temperature_category|  count|percentage|
+--------------------+-------+----------+
|              Danger| 437800|      10.0|
|              Normal|2626800|      60.0|
+--------------------+-------+----------+



# Pressure #

In [19]:
# # Calculate the 60th and 90th percentiles for the pressure column
# pressure_percentiles = df_pyspark.approxQuantile("pressure", [0.6, 0.9], 0.0)
# 
# # Extract the 60th and 90th percentile values
# pressure_60th = pressure_percentiles[0]  # 60th percentile (Normal -> Warning boundary)
# pressure_90th = pressure_percentiles[1]  # 90th percentile (Warning -> Danger boundary)
# 
# # Print the thresholds
# print(f"Pressure 60th percentile (Normal to Warning boundary): {pressure_60th}")
# print(f"Pressure 90th percentile (Warning to Danger boundary): {pressure_90th}")



In [20]:
# from pyspark.sql.functions import when
# 
# # Create a new column 'pressure_category' based on the 60th and 90th percentile thresholds
# df_pyspark = df_pyspark.withColumn(
#     "pressure_category",
#     when(df_pyspark["pressure"] <= pressure_60th, "Normal")
#     .when((df_pyspark["pressure"] > pressure_60th) & (df_pyspark["pressure"] <= pressure_90th), "Warning")
#     .otherwise("Danger")
# )
# 
# # Show the result with the pressure categories
# df_pyspark.select("pressure", "pressure_category").show(10)

+---------+-----------------+
| pressure|pressure_category|
+---------+-----------------+
| 58.56234|           Normal|
| 56.58919|           Normal|
|  98.2447|           Normal|
| 127.6598|           Danger|
|102.39677|           Normal|
+---------+-----------------+
only showing top 10 rows



In [49]:
from pyspark.sql.functions import when

# Define a function to classify the pressure based on custom thresholds
def check_pressure(df, pressure_column="pressure"):
    df = df.withColumn(
        "pressure_category",
        when(df[pressure_column] <= 120, "Normal")
        .when((df[pressure_column] > 120) & (df[pressure_column] <= 140), "Warning")
        .otherwise("Danger")
    )
    return df

# Apply the function to classify pressure based on the custom thresholds
df_pyspark = check_pressure(df_pyspark)

# Show the result
df_pyspark.select("pressure", "pressure_category").show(10)

+---------+-----------------+
| pressure|pressure_category|
+---------+-----------------+
|111.57932|           Normal|
| 58.56234|           Normal|
| 56.58919|           Normal|
|114.01153|           Normal|
|  98.2447|           Normal|
|102.39677|           Normal|
|113.06311|           Normal|
|111.12285|           Normal|
+---------+-----------------+
only showing top 10 rows



In [50]:
# Calculate total rows
total_rows = df_pyspark.count()

# Group by pressure_category and calculate counts
category_distribution = df_pyspark.groupBy("pressure_category").count()

# Calculate percentages
category_distribution = category_distribution.withColumn(
    "percentage", (category_distribution["count"] / total_rows) * 100
)

# Show the distribution
category_distribution.show()

+-----------------+-------+------------------+
|pressure_category|  count|        percentage|
+-----------------+-------+------------------+
|           Danger|  99495| 2.272613065326633|
|           Normal|3683719| 84.14159433531293|
+-----------------+-------+------------------+



In [14]:
# Filter rows where the pressure category is 'Normal'
normal_pressure_rows = df_pyspark.filter(df_pyspark["pressure_category"] == "Normal")

# Find the maximum pressure value in the 'Normal' category
max_normal_pressure = normal_pressure_rows.agg({"pressure": "max"}).collect()[0][0]

# Print the result
print(f"The maximum pressure value for 'Normal' category is: {max_normal_pressure}")

The maximum pressure value for 'Normal' category is: 179.45386


# Rotational speed #

In [22]:
# # Calculate the 60th and 90th percentiles for the rotational_speed column
# rotational_speed_percentiles = df_pyspark.approxQuantile("rotational_speed", [0.6, 0.9], 0.0)
# 
# # Extract the 60th and 90th percentile values
# rotational_speed_60th = rotational_speed_percentiles[0]
# # rotational_speed_90th = rotational_speed_percentiles[1]
# 
# # Print the thresholds
# print(f"Rotational Speed 60th percentile (Normal to Warning boundary): {rotational_speed_60th}")
# print(f"Rotational Speed 90th percentile (Warning to Danger boundary): {rotational_speed_90th}")



In [23]:
# # Create a new column 'rotational_speed_category' based on the 60th and 90th percentile thresholds
# df_pyspark = df_pyspark.withColumn(
#     "rotational_speed_category",
#     when(df_pyspark["rotational_speed"] <= rotational_speed_60th, "Normal")
#     .when((df_pyspark["rotational_speed"] > rotational_speed_60th) & (df_pyspark["rotational_speed"] <= rotational_speed_90th), "Warning")
#     .otherwise("Danger")
# )
# 
# # Show the result for rotational speed categories
# df_pyspark.select("rotational_speed", "rotational_speed_category").show(10)

+----------------+-------------------------+
|rotational_speed|rotational_speed_category|
+----------------+-------------------------+
|        978.9937|                   Normal|
|       746.84045|                   Normal|
|        857.4468|                   Normal|
|         969.122|                   Normal|
|       990.41327|                   Normal|
+----------------+-------------------------+
only showing top 10 rows



In [51]:
from pyspark.sql.functions import when

# Define a function to classify the rotational_speed based on custom thresholds
def check_rotational_speed(df, rotational_speed_column="rotational_speed"):
    df = df.withColumn(
        "rotational_speed_category",
        when(df[rotational_speed_column] <= 1100, "Normal")
        .when((df[rotational_speed_column] > 1100) & (df[rotational_speed_column] <= 1200), "Warning")
        .otherwise("Danger")
    )
    return df

# Apply the function to classify rotational_speed based on the custom thresholds
df_pyspark = check_rotational_speed(df_pyspark)

# Show the result
df_pyspark.select("rotational_speed", "rotational_speed_category").show(10)

+----------------+-------------------------+
|rotational_speed|rotational_speed_category|
+----------------+-------------------------+
|        978.9937|                   Normal|
|       1046.2716|                   Normal|
|       746.84045|                   Normal|
|        857.4468|                   Normal|
|         969.122|                   Normal|
|       1087.7844|                   Normal|
|       990.41327|                   Normal|
+----------------+-------------------------+
only showing top 10 rows



In [52]:
# Calculate total rows for rotational speed
total_rows_rotational_speed = df_pyspark.count()

# Group by rotational_speed_category and calculate counts
category_distribution_rotational_speed = df_pyspark.groupBy("rotational_speed_category").count()

# Calculate percentages for rotational speed
category_distribution_rotational_speed = category_distribution_rotational_speed.withColumn(
    "percentage", (category_distribution_rotational_speed["count"] / total_rows_rotational_speed) * 100
)

# Show the distribution for rotational speed categories
category_distribution_rotational_speed.show()

+-------------------------+-------+-----------------+
|rotational_speed_category|  count|       percentage|
+-------------------------+-------+-----------------+
|                   Danger|  99121|2.264070351758794|
|                   Normal|3683916|84.14609410689813|
+-------------------------+-------+-----------------+



# Noise level #

In [26]:
# # Calculate the 60th and 90th percentiles for the noise_level column
# noise_level_percentiles = df_pyspark.approxQuantile("noise_level", [0.6, 0.9], 0.0)
# 
# # Extract the 60th and 90th percentile values
# noise_level_60th = noise_level_percentiles[0]
# noise_level_90th = noise_level_percentiles[1]
# 
# # Print the thresholds
# print(f"Noise Level 60th percentile (Normal to Warning boundary): {noise_level_60th}")
# print(f"Noise Level 90th percentile (Warning to Danger boundary): {noise_level_90th}")



In [27]:
# # Create a new column 'noise_level_category' based on the 60th and 90th percentile thresholds
# df_pyspark = df_pyspark.withColumn(
#     "noise_level_category",
#     when(df_pyspark["noise_level"] <= noise_level_60th, "Normal")
#     .when((df_pyspark["noise_level"] > noise_level_60th) & (df_pyspark["noise_level"] <= noise_level_90th), "Warning")
#     .otherwise("Danger")
# )
# 
# # Show the result for noise level categories
# df_pyspark.select("noise_level", "noise_level_category").show(10)

+-----------+--------------------+
|noise_level|noise_level_category|
+-----------+--------------------+
|  76.433655|              Danger|
|  61.318645|              Normal|
|   67.25192|              Normal|
|  69.872986|              Normal|
|   66.44574|              Normal|
|  60.069717|              Normal|
|   68.80365|              Normal|
|    70.8878|              Normal|
+-----------+--------------------+
only showing top 10 rows



In [40]:
from pyspark.sql.functions import when

# Define a function to classify the noise_level based on custom thresholds
def check_noise_level(df, noise_level_column="noise_level"):
    df = df.withColumn(
        "noise_level_category",
        when(df[noise_level_column] <= 75, "Normal")
        .when((df[noise_level_column] > 75) & (df[noise_level_column] <= 82), "Warning")
        .otherwise("Danger")
    )
    return df

# Apply the function to classify noise_level based on the custom thresholds
df_pyspark = check_noise_level(df_pyspark)

# Show the result
df_pyspark.select("noise_level", "noise_level_category").show(10)


+-----------+--------------------+
|noise_level|noise_level_category|
+-----------+--------------------+
|  72.580185|              Normal|
|  61.318645|              Normal|
|   74.14918|              Normal|
|   67.25192|              Normal|
|  69.872986|              Normal|
|   66.44574|              Normal|
|  60.069717|              Normal|
|   68.80365|              Normal|
|    70.8878|              Normal|
+-----------+--------------------+
only showing top 10 rows



In [41]:
# Calculate total rows for noise level
total_rows_noise_level = df_pyspark.count()

# Group by noise_level_category and calculate counts
category_distribution_noise_level = df_pyspark.groupBy("noise_level_category").count()

# Calculate percentages for noise level
category_distribution_noise_level = category_distribution_noise_level.withColumn(
    "percentage", (category_distribution_noise_level["count"] / total_rows_noise_level) * 100
)

# Show the distribution for noise level categories
category_distribution_noise_level.show()

+--------------------+-------+------------------+
|noise_level_category|  count|        percentage|
+--------------------+-------+------------------+
|              Danger|  36126|0.8251713111009593|
|              Normal|3682676| 84.11777067153952|
+--------------------+-------+------------------+



# Voltage #

In [29]:
# # Calculate the 60th and 90th percentiles for the voltage column
# voltage_percentiles = df_pyspark.approxQuantile("voltage", [0.6, 0.9], 0.0)
# 
# # Extract the 60th and 90th percentile values
# voltage_60th = voltage_percentiles[0]
# voltage_90th = voltage_percentiles[1]
# 
# # Print the thresholds
# print(f"Voltage 60th percentile (Normal to Warning boundary): {voltage_60th}")
# print(f"Voltage 90th percentile (Warning to Danger boundary): {voltage_90th}")



In [30]:
# # Create a new column 'voltage_category' based on the 60th and 90th percentile thresholds
# df_pyspark = df_pyspark.withColumn(
#     "voltage_category",
#     when(df_pyspark["voltage"] <= voltage_60th, "Normal")
#     .when((df_pyspark["voltage"] > voltage_60th) & (df_pyspark["voltage"] <= voltage_90th), "Warning")
#     .otherwise("Danger")
# )
# 
# # Show the result for voltage categories
# df_pyspark.select("voltage", "voltage_category").show(10)

+---------+----------------+
|  voltage|voltage_category|
+---------+----------------+
| 217.1716|          Normal|
|236.80861|          Danger|
|  214.704|          Normal|
|219.81172|          Normal|
| 235.6383|          Danger|
|211.35019|          Normal|
+---------+----------------+
only showing top 10 rows



In [44]:
from pyspark.sql.functions import when

# Define a function to classify the voltage based on custom thresholds
def check_voltage(df, voltage_column="voltage"):
    df = df.withColumn(
        "voltage_category",
        when(df[voltage_column] <= 225, "Normal")
        .when((df[voltage_column] > 225) & (df[voltage_column] <= 237), "Warning")
        .otherwise("Danger")
    )
    return df

# Apply the function to classify voltage based on the custom thresholds
df_pyspark = check_voltage(df_pyspark)

# Show the result
df_pyspark.select("voltage", "voltage_category").show(10)

+---------+----------------+
|  voltage|voltage_category|
+---------+----------------+
| 217.1716|          Normal|
|224.70499|          Normal|
|  214.704|          Normal|
|219.81172|          Normal|
| 223.2198|          Normal|
|211.35019|          Normal|
+---------+----------------+
only showing top 10 rows



In [45]:
# Calculate total rows for voltage
total_rows_voltage = df_pyspark.count()

# Group by voltage_category and calculate counts
category_distribution_voltage = df_pyspark.groupBy("voltage_category").count()

# Calculate percentages for voltage
category_distribution_voltage = category_distribution_voltage.withColumn(
    "percentage", (category_distribution_voltage["count"] / total_rows_voltage) * 100
)

# Show the distribution for voltage categories
category_distribution_voltage.show()

+----------------+-------+------------------+
|voltage_category|  count|        percentage|
+----------------+-------+------------------+
|          Danger| 195225|4.4592279579716765|
|          Normal|3028107|  69.1664458656921|
+----------------+-------+------------------+



In [53]:
from pyspark.sql.functions import when

# Combine all the conditions into a single "system_warning" column, including the temperature_category
df_pyspark = df_pyspark.withColumn(
    "system_warning",
    when(
        (df_pyspark["pressure_category"] == "Danger") |
        (df_pyspark["rotational_speed_category"] == "Danger") |
        (df_pyspark["noise_level_category"] == "Danger") |
        (df_pyspark["voltage_category"] == "Danger") |
        (df_pyspark["temperature_category"] == "Danger"), "Danger"
    ).when(
        (df_pyspark["pressure_category"] == "Warning") |
        (df_pyspark["rotational_speed_category"] == "Warning") |
        (df_pyspark["noise_level_category"] == "Warning") |
        (df_pyspark["voltage_category"] == "Warning") |
        (df_pyspark["temperature_category"] == "Warning"), "Warning"
    ).otherwise("Normal")
)

# Drop the individual category columns if not needed
df_pyspark = df_pyspark.drop("pressure_category", "rotational_speed_category", "noise_level_category", "voltage_category", "temperature_category")

# Show the result with just the system warning
df_pyspark.select("system_warning").show(10)

+--------------+
+--------------+
|        Normal|
|        Normal|
|        Danger|
|        Normal|
+--------------+
only showing top 10 rows



In [54]:
def check_row_with_columns(df_pyspark, row_index):
    # Use take to retrieve a specific row based on index
    row = df_pyspark.take(row_index + 1)[-1]  # Retrieve only the row we are interested in

    # Print the relevant columns and the system warning
    print(f"Temperature: {row['temperature']}")
    print(f"Rotational Speed: {row['rotational_speed']}")
    print(f"Noise Level: {row['noise_level']}")
    print(f"Voltage: {row['voltage']}")
    print(f"System Warning: {row['system_warning']}")

# Test the function by checking a specific row
check_row_with_columns(df_pyspark, 0)  # Check the row at index 0

Temperature: 63.102325
Rotational Speed: 978.9937
Noise Level: 72.580185
Voltage: 217.1716


In [55]:
def check_multiple_rows(df_pyspark, num_rows):
    # Select the relevant columns
    df_pyspark.select("pressure","temperature", "rotational_speed", "noise_level", "voltage", "system_warning").show(num_rows)

# Show the first 5 rows as an example
check_multiple_rows(df_pyspark, 5)


+---------+-----------+----------------+-----------+---------+--------------+
+---------+-----------+----------------+-----------+---------+--------------+
| 56.58919|   47.79757|       1046.2716|  61.318645|224.70499|        Normal|
|114.01153|  53.125355|       746.84045|   74.14918|  214.704|        Normal|
+---------+-----------+----------------+-----------+---------+--------------+
only showing top 5 rows



# Train model #

In [56]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Step 1: Split the dataset into training and test sets (80% train, 20% test)
train_data, test_data = df_pyspark.randomSplit([0.8, 0.2], seed=42)

# Step 2: Convert the 'system_warning' label to numeric using StringIndexer
indexer = StringIndexer(inputCol="system_warning", outputCol="label")
train_data = indexer.fit(train_data).transform(train_data)
test_data = indexer.fit(test_data).transform(test_data)

# Step 3: Assemble features (you can choose any relevant features for your model)
assembler = VectorAssembler(
    inputCols=[
        "temperature", "rotational_speed", "noise_level", "voltage", 
        "pressure"  # Add other features if needed
    ],
    outputCol="features"
)

train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

# Step 4: Train the model (RandomForestClassifier in this case)
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
model = rf.fit(train_data)

# Step 5: Test the model on the test set
predictions = model.transform(test_data)

# Step 6: Evaluate the model using accuracy metric
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy}")

# Step 7: Show some of the predictions
predictions.select("features", "system_warning", "prediction").show(10)

Test Accuracy = 0.9328097202813982
+--------------------+--------------+----------+
+--------------------+--------------+----------+
|[47.79757,1046.27...|        Normal|       1.0|
|[74.9544,969.122,...|        Danger|       2.0|
|[66.40015,1223.64...|        Danger|       0.0|
|[46.005398,1091.1...|        Normal|       1.0|
+--------------------+--------------+----------+
only showing top 10 rows



In [57]:
# Filter out rows where prediction is incorrect
misclassified = predictions.filter(predictions["system_warning"] != predictions["prediction"])
misclassified.show(10)

+------------+---------+-----------+---------+--------+----------------+------------+-----------+-------+-------+-------------+-----+------------+-----------------+---------------+------------+--------------------+-----------------------+---------------------+-------------------+--------+-----------+----------------+-----------+-------------+--------------+----+--------------+------------------+----------------+---------------+---------------+--------------+-----------+------------+--------------------+-------------------+----------------+--------------+----------------------+------------------+-------------------+---------------+-------------------+----------------+---------------------------+--------------------------+-------------+-------------------+------------------------+--------------------------+--------------------+----------------------------+----------------------+--------------+--------------------+----------------+--------------------+-----------------+-------------------

In [37]:
# Check distinct values of system_warning and their corresponding labels
predictions.select("system_warning", "label").distinct().show()

+--------------+-----+
+--------------+-----+
|        Danger|  1.0|
|        Normal|  2.0|
+--------------+-----+



In [59]:
# Filter rows where the prediction is "Normal" (assuming 2.0 corresponds to "Normal")
normal_predictions = predictions.filter(predictions["prediction"] == 1.0)

# Show the result
normal_predictions.select("features", "system_warning", "prediction").show(10)


+--------------------+--------------+----------+
+--------------------+--------------+----------+
|[47.79757,1046.27...|        Normal|       1.0|
|[46.005398,1091.1...|        Normal|       1.0|
|[48.049965,1067.7...|        Normal|       1.0|
|[43.988087,899.60...|        Normal|       1.0|
|[56.312733,920.68...|        Normal|       1.0|
|[41.80106,932.265...|        Normal|       1.0|
|[61.451977,911.91...|        Normal|       1.0|
|[53.29162,907.601...|        Normal|       1.0|
|[38.411343,953.38...|        Normal|       1.0|
|[53.78314,1090.20...|        Normal|       1.0|
+--------------------+--------------+----------+
only showing top 10 rows



In [61]:
# Calculate the total number of rows in the dataset
total_rows = train_data.count()

# Group by 'system_warning' and calculate the counts
category_distribution = train_data.groupBy("system_warning").count()

# Calculate the percentages for each category
category_distribution = category_distribution.withColumn(
    "percentage", (category_distribution["count"] / total_rows) * 100
)

# Show the result with both count and percentage
category_distribution.show()

+--------------+-------+------------------+
+--------------+-------+------------------+
|        Danger| 649439|18.540572839035296|
|        Normal| 865538| 24.70989628579887|
+--------------+-------+------------------+



# Installation Date and Current Date #