# Cleaning Approach 2

This approach is far more comprehensive than Approach 1 because it uses PySpark and combines partitioned Parquet files to reduce the processing time of the large dataset.

1. [Imports & Configuration](#imports--configuration) 
2. [Define Directories](#define-the-base-directories-to-read-and-save-the-files)
3. [Initialize Spark](#initialize-the-spark-session)
4. [Loading Reference Tables](#loading-the-reference-table-for-quick-view)
5. [Converting CSV to Parquet](#converting-the-real-dataset-from-csv-to-parquet)
6. [Cleaning Missing Values](#cleaning-missing-values)
7. [Split Data](#split-data)
8. [Scale Data](#scale-features)

# Imports & Configuration

In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)
pd.set_option('display.float_format', '{:.2f}'.format)
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import sys
from pyspark.sql import SparkSession, Window, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, count, mean, isnan, expr, lit, countDistinct, round, lag, unix_timestamp, lead, explode, collect_list, struct, rand, row_number
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, ArrayType, MapType
import pyspark.pandas as ps
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array
from sklearn.model_selection import train_test_split
from functools import reduce
import json
from tqdm import tqdm



# Define the Base Directories to read and save the files

In [2]:
# Root folder for every file read or saved by cleaning Approach 2
BASE_DIR = "../Data/Cleaning & Preparation/Approach_2"

# Sub-folders of BASE_DIR; each name below is joined onto the root in order
(
    STAGED_DATA_PATH,
    RESULTS_PATH,
    CLEANED_DATA_PATH,
    TRAIN_TEST_DATA_PATH,
    SCALED_DATA_PATH,
) = (
    os.path.join(BASE_DIR, folder)
    for folder in (
        "Staged Cleaning Data",
        "Results",
        "Clean Data",
        "Train Test Data",
        "Train Test (Scaled) Data",
    )
)

# Initialize the Spark Session

In [3]:
### Initialize Spark Session
# Build (or reuse) a local Spark session named "3W Real Dataset".
# The configuration pairs are applied to the builder in the order listed.
try:
    _session_builder = SparkSession.builder.appName("3W Real Dataset").master("local[*]")
    for _option, _setting in (
        ("spark.driver.memory", "12g"),
        ("spark.executor.memory", "6g"),
        ("spark.sql.shuffle.partitions", "16"),
        ("spark.default.parallelism", "16"),
        ("spark.sql.parquet.filterPushdown", "true"),
        ("spark.sql.parquet.enableVectorizedReader", "true"),
    ):
        _session_builder = _session_builder.config(_option, _setting)
    spark = _session_builder.getOrCreate()
except Exception as e:
    # Report the failure, then re-raise so the notebook stops here
    print(f"Error initializing SparkSession: {e}")
    raise

24/12/11 14:25:33 WARN Utils: Your hostname, Rawans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.22.175.61 instead (on interface en0)
24/12/11 14:25:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/11 14:25:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Loading the reference table for quick view

In [4]:
### Load Reference Table
# Location of the reference table CSV under the EDA results folder
REFERENCE_TABLE_PATH = "../EDA/Results/reference_table_real.csv"

# Read the CSV with its header row and discard the two statistics columns
reference_table_missing_values_real = (
    spark.read
    .csv(REFERENCE_TABLE_PATH, header=True)
    .drop("Unique Values (Real)", "Missing Values (%) (Real)")
)

# Quick view: up to 100 rows, untruncated, ordered by value type
(
    reference_table_missing_values_real
    .sort("Value Type")
    .show(n=100, truncate=False)
)

+-------------+------------------------------------------------------------+--------------------+-----------+
|Tag          |Name                                                        |Unit                |Value Type |
+-------------+------------------------------------------------------------+--------------------+-----------+
|ESTADO-DHSV  |State of the DHSV (downhole safety valve)                   |[0, 0.5, 1]         |Categorical|
|ESTADO-M1    |State of the PMV (production master valve)                  |[0, 0.5, 1]         |Categorical|
|ESTADO-PXO   |State of the PXO (pig-crossover) valve                      |[0, 0.5, 1]         |Categorical|
|ESTADO-M2    |State of the AMV (annulus master valve)                     |[0, 0.5, 1]         |Categorical|
|ESTADO-SDV-GL|State of the gas lift SDV (shutdown valve)                  |[0, 0.5, 1]         |Categorical|
|ESTADO-SDV-P |State of the production SDV (shutdown valve)                |[0, 0.5, 1]         |Categorical|
|ESTADO-W1

### Defining the features used for cleaning, and their types, based on the reference table built during EDA

In [5]:
### Define Features
# Fetch the Continuous/Categorical rows of the reference table with a single
# collect and derive all three lists from that one result, instead of running
# a separate Spark filter + collect for each list.
_typed_reference_rows = (
    reference_table_missing_values_real
    .filter(col("Value Type").isin(["Continuous", "Categorical"]))
    .select("Tag", "Value Type")
    .collect()
)

# Every Continuous or Categorical tag except the "DataType" tag
feature_names = [
    row["Tag"] for row in _typed_reference_rows
    if row["Tag"] != "DataType"
]
# Tags whose value type is Continuous
features_continuous = [
    row["Tag"] for row in _typed_reference_rows
    if row["Value Type"] == "Continuous"
]
# Tags whose value type is Categorical, again without the "DataType" tag
features_categorical = [
    row["Tag"] for row in _typed_reference_rows
    if row["Value Type"] == "Categorical" and row["Tag"] != "DataType"
]

# Converting the Real Dataset from CSV to Parquet

In [6]:
# Convert csv to parquet format
# The CSV is read with its header row; an existing Parquet output is overwritten.
real_instances = spark.read.csv(path="../Data/3W Original/real_instances.csv", header=True)
(
    real_instances.write
    .mode("overwrite")
    .parquet("../Data/3W Original/real_instances.parquet")
)

24/12/11 14:25:39 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

### Reading the Parquet for the real dataset

In [7]:
### Load Original 3W (Real) Dataset
# Same Parquet location that the CSV-to-Parquet conversion writes to
DATA_PATH = "../Data/3W Original/real_instances.parquet"
spark_df_real = (
    spark.read
    .parquet(DATA_PATH)
)

# Cleaning Missing Values

In [None]:
### Fill Missing Values by Instance
def clean_missing_values(input_df: DataFrame, reference_table: DataFrame, output_path: str, incomplete_dirs: set = None):
    """
    Cleans missing values for each instance in the dataset by forward and backward filling.

    For every distinct (Instance, label) pair found in input_df, the rows of that
    instance are ordered by timestamp, columns that are entirely null for the
    instance are dropped, and the remaining Continuous/Categorical feature columns
    are forward-filled and then backward-filled. The result is written to
    <output_path>/Label_<label>/Instance_<instance>.parquet, overwriting any
    previous output. A failure on one instance is printed and the loop continues
    with the next instance.

    Parameters:
        input_df (DataFrame): Input Spark DataFrame containing the raw data.
            Must contain the columns "Instance", "label" and "timestamp".
        reference_table (DataFrame): Reference table specifying feature types
            through its "Tag" and "Value Type" columns. Tags typed "Empty" are
            dropped from input_df before processing.
        output_path (str): Directory path to save cleaned instances.
        incomplete_dirs (set): Optional. Set of output paths (built exactly like
            the parquet path above) that need reprocessing. When given and
            non-empty, every instance whose output path is not in the set is skipped.

    Returns:
        None
    """
    # Collect the reference table once and derive both column lists from it.
    reference_rows = reference_table.select("Tag", "Value Type").distinct().collect()
    relevant_columns = [
        row["Tag"] for row in reference_rows
        if row["Value Type"] in ("Continuous", "Categorical") and row["Tag"] != "DataType"
    ]
    empty_columns = [row["Tag"] for row in reference_rows if row["Value Type"] == "Empty"]
    input_df = input_df.drop(*empty_columns)

    unique_instances = input_df.select("Instance", "label").distinct().collect()
    for row in tqdm(unique_instances, desc="Cleaning Instances"):
        instance = row["Instance"]
        label = row["label"]

        label_dir = os.path.join(output_path, f"Label_{label}")
        output_file = os.path.join(label_dir, f"Instance_{instance}.parquet")
        # Reprocessing mode: only touch the outputs explicitly listed by the caller.
        if incomplete_dirs and output_file not in incomplete_dirs:
            continue

        try:
            # NOTE(review): rows are selected by Instance only; this assumes an
            # instance id belongs to a single label - confirm against the dataset.
            instance_rows = input_df.filter(col("Instance") == instance)

            # Find the columns that are entirely null for this instance with ONE
            # aggregation job. F.count(column) counts non-null values only, so a
            # count of 0 means "every row is null" (the same condition as
            # null_count == total_rows, including for zero rows). This replaces
            # one Spark count job per column per instance.
            column_names = instance_rows.columns
            non_null_counts = instance_rows.agg(
                *[F.count(col(name)) for name in column_names]
            ).first()
            all_null_columns = [
                name for name, non_null in zip(column_names, non_null_counts)
                if non_null == 0
            ]

            instance_df = instance_rows.orderBy("timestamp")
            if all_null_columns:
                instance_df = instance_df.drop(*all_null_columns)

            # The fill itself is done in pandas on the time-ordered rows of this instance.
            instance_pdf = instance_df.toPandas()
            for column in relevant_columns:
                if column in instance_pdf.columns:
                    instance_pdf[column] = instance_pdf[column].ffill().bfill()

            cleaned_sdf = spark.createDataFrame(instance_pdf)
            os.makedirs(label_dir, exist_ok=True)
            cleaned_sdf.write.mode("overwrite").parquet(output_file)
            print(f"Processed: Label - {label}, Instance - {instance}")

        except Exception as e:
            # Best effort: report the failing instance and keep going.
            print(f"Error processing instance: Label - {label}, Instance - {instance}, Error: {type(e).__name__}: {e}")
            continue

    print(f"Cleaning process completed. Data exported to: {output_path}")

# Clean Missing Values
# Run the per-instance fill on the real dataset; outputs go to the "Stage 1" staging folder.
clean_missing_values(
    input_df=spark_df_real,
    reference_table=reference_table_missing_values_real,
    output_path=os.path.join(STAGED_DATA_PATH, "Stage 1"),
)



CodeCache: size=131072Kb used=35110Kb max_used=35164Kb free=95961Kb
 bounds [0x000000010d1f8000, 0x000000010f488000, 0x00000001151f8000]
 total_blobs=13565 nmethods=12533 adapters=943
 compilation: disabled (not enough contiguous free space left)


Cleaning Instances:   0%|          | 1/1119 [00:06<2:06:44,  6.80s/it]          

Processed: Label - 3, Instance - 1078


Cleaning Instances:   0%|          | 2/1119 [00:10<1:28:38,  4.76s/it]

Processed: Label - 2, Instance - 1091


Cleaning Instances:   0%|          | 3/1119 [00:13<1:18:37,  4.23s/it]

Processed: Label - 3, Instance - 1074


Cleaning Instances:   0%|          | 4/1119 [00:17<1:11:44,  3.86s/it]

Processed: Label - 3, Instance - 1079


Cleaning Instances:   0%|          | 5/1119 [00:19<59:05,  3.18s/it]  

Processed: Label - 2, Instance - 1089


Cleaning Instances:   1%|          | 6/1119 [00:21<54:42,  2.95s/it]

Processed: Label - 2, Instance - 1097


Cleaning Instances:   1%|          | 7/1119 [00:25<1:01:52,  3.34s/it]

Processed: Label - 5, Instance - 1108


Cleaning Instances:   1%|          | 8/1119 [00:28<1:00:02,  3.24s/it]

Processed: Label - 3, Instance - 1073


Cleaning Instances:   1%|          | 9/1119 [00:31<59:31,  3.22s/it]  

Processed: Label - 3, Instance - 1071


Cleaning Instances:   1%|          | 10/1119 [00:35<59:49,  3.24s/it]

Processed: Label - 3, Instance - 1075


Cleaning Instances:   1%|          | 11/1119 [00:38<1:00:46,  3.29s/it]

Processed: Label - 3, Instance - 1076


Cleaning Instances:   1%|          | 12/1119 [00:41<56:14,  3.05s/it]  

Processed: Label - 2, Instance - 1096


Cleaning Instances:   1%|          | 13/1119 [00:44<57:45,  3.13s/it]

Processed: Label - 3, Instance - 1082


Cleaning Instances:   1%|▏         | 14/1119 [00:46<54:18,  2.95s/it]

Processed: Label - 2, Instance - 1093


Cleaning Instances:   1%|▏         | 15/1119 [00:48<49:07,  2.67s/it]

Processed: Label - 2, Instance - 1094


Cleaning Instances:   1%|▏         | 16/1119 [00:51<48:31,  2.64s/it]

Processed: Label - 2, Instance - 1092


Cleaning Instances:   2%|▏         | 17/1119 [00:55<55:49,  3.04s/it]

Processed: Label - 3, Instance - 1068


Cleaning Instances:   2%|▏         | 18/1119 [00:58<58:16,  3.18s/it]

Processed: Label - 3, Instance - 1085


Cleaning Instances:   2%|▏         | 19/1119 [01:01<54:18,  2.96s/it]

Processed: Label - 2, Instance - 1087


Cleaning Instances:   2%|▏         | 20/1119 [01:04<52:35,  2.87s/it]

Processed: Label - 2, Instance - 1088


Cleaning Instances:   2%|▏         | 21/1119 [01:07<54:00,  2.95s/it]

Processed: Label - 2, Instance - 1104


Cleaning Instances:   2%|▏         | 22/1119 [01:10<58:27,  3.20s/it]

Processed: Label - 2, Instance - 1106


Cleaning Instances:   2%|▏         | 23/1119 [01:14<57:42,  3.16s/it]

Processed: Label - 2, Instance - 1107


Cleaning Instances:   2%|▏         | 24/1119 [01:18<1:02:16,  3.41s/it]

Processed: Label - 3, Instance - 1083


Cleaning Instances:   2%|▏         | 25/1119 [01:21<1:00:08,  3.30s/it]

Processed: Label - 2, Instance - 1086


Cleaning Instances:   2%|▏         | 26/1119 [01:25<1:04:51,  3.56s/it]

Processed: Label - 3, Instance - 1069


Cleaning Instances:   2%|▏         | 27/1119 [01:29<1:07:49,  3.73s/it]

Processed: Label - 3, Instance - 1070


Cleaning Instances:   3%|▎         | 28/1119 [01:33<1:10:31,  3.88s/it]

Processed: Label - 3, Instance - 1077


Cleaning Instances:   3%|▎         | 29/1119 [01:37<1:11:44,  3.95s/it]

Processed: Label - 3, Instance - 1081


Cleaning Instances:   3%|▎         | 30/1119 [01:40<1:06:27,  3.66s/it]

Processed: Label - 2, Instance - 1090


Cleaning Instances:   3%|▎         | 31/1119 [01:46<1:17:24,  4.27s/it]

Processed: Label - 5, Instance - 1109


Cleaning Instances:   3%|▎         | 32/1119 [01:49<1:12:33,  4.01s/it]

Processed: Label - 5, Instance - 1110


Cleaning Instances:   3%|▎         | 33/1119 [01:55<1:20:45,  4.46s/it]

Processed: Label - 5, Instance - 1111


Cleaning Instances:   3%|▎         | 34/1119 [01:59<1:17:39,  4.29s/it]

Processed: Label - 3, Instance - 1072


Cleaning Instances:   3%|▎         | 35/1119 [02:02<1:10:06,  3.88s/it]

Processed: Label - 2, Instance - 1100


Cleaning Instances:   3%|▎         | 36/1119 [02:05<1:04:59,  3.60s/it]

Processed: Label - 2, Instance - 1101


Cleaning Instances:   3%|▎         | 37/1119 [02:08<1:01:57,  3.44s/it]

Processed: Label - 2, Instance - 1105


Cleaning Instances:   3%|▎         | 38/1119 [02:12<1:06:22,  3.68s/it]

Processed: Label - 3, Instance - 1084


Cleaning Instances:   3%|▎         | 39/1119 [02:15<1:02:45,  3.49s/it]

Processed: Label - 2, Instance - 1098


Cleaning Instances:   4%|▎         | 40/1119 [02:18<1:03:09,  3.51s/it]

Processed: Label - 2, Instance - 1099


Cleaning Instances:   4%|▎         | 41/1119 [02:22<1:04:12,  3.57s/it]

Processed: Label - 2, Instance - 1102


Cleaning Instances:   4%|▍         | 42/1119 [02:25<1:01:36,  3.43s/it]

Processed: Label - 2, Instance - 1103


Cleaning Instances:   4%|▍         | 43/1119 [02:29<1:04:05,  3.57s/it]

Processed: Label - 3, Instance - 1080


Cleaning Instances:   4%|▍         | 44/1119 [02:33<1:03:07,  3.52s/it]

Processed: Label - 2, Instance - 1095


Cleaning Instances:   4%|▍         | 45/1119 [02:39<1:17:43,  4.34s/it]

Processed: Label - 9, Instance - 24


Cleaning Instances:   4%|▍         | 46/1119 [02:43<1:14:23,  4.16s/it]

Processed: Label - 9, Instance - 39


Cleaning Instances:   4%|▍         | 47/1119 [02:46<1:12:42,  4.07s/it]

Processed: Label - 9, Instance - 25


Cleaning Instances:   4%|▍         | 48/1119 [02:52<1:20:35,  4.52s/it]

Processed: Label - 9, Instance - 28


Cleaning Instances:   4%|▍         | 49/1119 [02:56<1:19:11,  4.44s/it]

Processed: Label - 9, Instance - 36


Cleaning Instances:   4%|▍         | 50/1119 [03:02<1:23:35,  4.69s/it]

Processed: Label - 9, Instance - 29


Cleaning Instances:   5%|▍         | 51/1119 [03:07<1:30:10,  5.07s/it]

Processed: Label - 9, Instance - 38


Cleaning Instances:   5%|▍         | 52/1119 [03:11<1:23:24,  4.69s/it]

Processed: Label - 9, Instance - 14


Cleaning Instances:   5%|▍         | 53/1119 [03:18<1:32:42,  5.22s/it]         

Processed: Label - 9, Instance - 40


24/12/11 14:29:52 WARN TaskSetManager: Stage 5524 contains a task of very large size (1460 KiB). The maximum recommended task size is 1000 KiB.
Cleaning Instances:   5%|▍         | 54/1119 [03:33<2:24:31,  8.14s/it]         

Processed: Label - 9, Instance - 43


Cleaning Instances:   5%|▍         | 55/1119 [03:38<2:07:46,  7.21s/it]

Processed: Label - 9, Instance - 26


Cleaning Instances:   5%|▌         | 56/1119 [03:43<1:57:13,  6.62s/it]         

Processed: Label - 9, Instance - 33


Cleaning Instances:   5%|▌         | 57/1119 [03:47<1:43:38,  5.86s/it]

Processed: Label - 9, Instance - 16


Cleaning Instances:   5%|▌         | 58/1119 [03:50<1:25:34,  4.84s/it]

Processed: Label - 9, Instance - 17


Cleaning Instances:   5%|▌         | 59/1119 [03:56<1:33:43,  5.31s/it]

Processed: Label - 9, Instance - 19


Cleaning Instances:   5%|▌         | 60/1119 [04:02<1:36:00,  5.44s/it]         

Processed: Label - 9, Instance - 41


Cleaning Instances:   5%|▌         | 61/1119 [04:07<1:33:09,  5.28s/it]

Processed: Label - 9, Instance - 15


Cleaning Instances:   6%|▌         | 62/1119 [04:11<1:31:03,  5.17s/it]

Processed: Label - 9, Instance - 22


Cleaning Instances:   6%|▌         | 63/1119 [04:15<1:20:42,  4.59s/it]

Processed: Label - 9, Instance - 35


Cleaning Instances:   6%|▌         | 64/1119 [04:17<1:07:59,  3.87s/it]

Processed: Label - 9, Instance - 21


Cleaning Instances:   6%|▌         | 65/1119 [04:19<1:00:31,  3.45s/it]

Processed: Label - 9, Instance - 27


Cleaning Instances:   6%|▌         | 66/1119 [04:21<51:18,  2.92s/it]  

Processed: Label - 9, Instance - 37


Cleaning Instances:   6%|▌         | 67/1119 [04:26<1:02:36,  3.57s/it]

Processed: Label - 9, Instance - 31


Cleaning Instances:   6%|▌         | 68/1119 [04:30<1:01:58,  3.54s/it]

Processed: Label - 9, Instance - 32


Cleaning Instances:   6%|▌         | 69/1119 [04:36<1:14:53,  4.28s/it]

Processed: Label - 9, Instance - 18


Cleaning Instances:   6%|▋         | 70/1119 [04:38<1:03:43,  3.65s/it]

Processed: Label - 9, Instance - 30


Cleaning Instances:   6%|▋         | 71/1119 [04:40<56:10,  3.22s/it]  

Processed: Label - 9, Instance - 42


Cleaning Instances:   6%|▋         | 72/1119 [04:43<54:49,  3.14s/it]

Processed: Label - 9, Instance - 20


Cleaning Instances:   7%|▋         | 73/1119 [04:47<57:03,  3.27s/it]

Processed: Label - 9, Instance - 23


Cleaning Instances:   7%|▋         | 74/1119 [04:49<54:37,  3.14s/it]

Processed: Label - 9, Instance - 34


Cleaning Instances:   7%|▋         | 75/1119 [04:57<1:15:56,  4.36s/it]

Processed: Label - 9, Instance - 0


Cleaning Instances:   7%|▋         | 76/1119 [05:02<1:20:54,  4.65s/it]

Processed: Label - 9, Instance - 8


Cleaning Instances:   7%|▋         | 77/1119 [05:05<1:14:51,  4.31s/it]

Processed: Label - 9, Instance - 10


Cleaning Instances:   7%|▋         | 78/1119 [05:10<1:16:14,  4.39s/it]

Processed: Label - 9, Instance - 6


Cleaning Instances:   7%|▋         | 79/1119 [05:15<1:18:16,  4.52s/it]

Processed: Label - 9, Instance - 4


Cleaning Instances:   7%|▋         | 80/1119 [05:19<1:15:03,  4.33s/it]

Processed: Label - 9, Instance - 5


24/12/11 14:31:46 WARN TaskSetManager: Stage 8278 contains a task of very large size (1095 KiB). The maximum recommended task size is 1000 KiB.
Cleaning Instances:   7%|▋         | 81/1119 [05:26<1:27:58,  5.09s/it]         

Processed: Label - 9, Instance - 3


Cleaning Instances:   7%|▋         | 82/1119 [05:33<1:39:22,  5.75s/it]         

Processed: Label - 9, Instance - 9


Cleaning Instances:   7%|▋         | 83/1119 [05:39<1:39:23,  5.76s/it]

Processed: Label - 9, Instance - 1


Cleaning Instances:   8%|▊         | 84/1119 [05:43<1:29:43,  5.20s/it]

Processed: Label - 9, Instance - 12


Cleaning Instances:   8%|▊         | 85/1119 [05:48<1:32:39,  5.38s/it]

Processed: Label - 9, Instance - 7


Cleaning Instances:   8%|▊         | 86/1119 [05:55<1:40:27,  5.84s/it]

Processed: Label - 9, Instance - 13


24/12/11 14:32:36 WARN TaskSetManager: Stage 8890 contains a task of very large size (2640 KiB). The maximum recommended task size is 1000 KiB.
Cleaning Instances:   8%|▊         | 87/1119 [06:17<3:00:24, 10.49s/it]         

Processed: Label - 9, Instance - 2


Cleaning Instances:   8%|▊         | 88/1119 [06:20<2:23:26,  8.35s/it]

Processed: Label - 9, Instance - 11


24/12/11 14:33:32 WARN TaskSetManager: Stage 9094 contains a task of very large size (8150 KiB). The maximum recommended task size is 1000 KiB.
Cleaning Instances:   8%|▊         | 89/1119 [07:18<6:36:43, 23.11s/it]         

Processed: Label - 8, Instance - 710


Cleaning Instances:   8%|▊         | 90/1119 [07:22<5:00:12, 17.51s/it]

Processed: Label - 4, Instance - 1029


Cleaning Instances:   8%|▊         | 91/1119 [07:25<3:45:07, 13.14s/it]

Processed: Label - 4, Instance - 1048


Cleaning Instances:   8%|▊         | 92/1119 [07:29<2:58:39, 10.44s/it]

Processed: Label - 3, Instance - 1057


24/12/11 14:34:35 WARN TaskSetManager: Stage 9502 contains a task of very large size (7218 KiB). The maximum recommended task size is 1000 KiB.
Cleaning Instances:   8%|▊         | 93/1119 [08:21<6:32:58, 22.98s/it]         

Processed: Label - 8, Instance - 707


Cleaning Instances:   8%|▊         | 94/1119 [08:27<5:03:41, 17.78s/it]

Processed: Label - 4, Instance - 1030


Cleaning Instances:   8%|▊         | 95/1119 [08:30<3:49:52, 13.47s/it]

Processed: Label - 4, Instance - 1046


Cleaning Instances:   9%|▊         | 96/1119 [08:34<3:00:01, 10.56s/it]

Processed: Label - 4, Instance - 1051


Cleaning Instances:   9%|▊         | 97/1119 [08:38<2:27:56,  8.69s/it]

Processed: Label - 4, Instance - 1015


Cleaning Instances:   9%|▉         | 98/1119 [08:43<2:05:30,  7.38s/it]

Processed: Label - 3, Instance - 1056


Cleaning Instances:   9%|▉         | 99/1119 [08:46<1:42:26,  6.03s/it]

Processed: Label - 4, Instance - 1008


Cleaning Instances:   9%|▉         | 100/1119 [08:48<1:25:19,  5.02s/it]

Processed: Label - 4, Instance - 1012


Cleaning Instances:   9%|▉         | 101/1119 [08:51<1:13:37,  4.34s/it]

Processed: Label - 4, Instance - 1020


Cleaning Instances:   9%|▉         | 102/1119 [08:54<1:05:18,  3.85s/it]

Processed: Label - 4, Instance - 1021


Cleaning Instances:   9%|▉         | 103/1119 [08:56<58:12,  3.44s/it]  

Processed: Label - 4, Instance - 1034


Cleaning Instances:   9%|▉         | 104/1119 [08:59<53:38,  3.17s/it]

Processed: Label - 4, Instance - 1053


Cleaning Instances:   9%|▉         | 105/1119 [09:02<53:36,  3.17s/it]

Processed: Label - 3, Instance - 1058


Cleaning Instances:   9%|▉         | 106/1119 [09:06<57:04,  3.38s/it]

Processed: Label - 3, Instance - 1067


24/12/11 14:35:39 WARN TaskSetManager: Stage 10930 contains a task of very large size (1803 KiB). The maximum recommended task size is 1000 KiB.
Cleaning Instances:  10%|▉         | 107/1119 [09:19<1:46:33,  6.32s/it]        

Processed: Label - 8, Instance - 708


Cleaning Instances:  10%|▉         | 108/1119 [09:22<1:30:20,  5.36s/it]

Processed: Label - 4, Instance - 1007


Cleaning Instances:  10%|▉         | 109/1119 [09:25<1:17:06,  4.58s/it]

Processed: Label - 4, Instance - 1016


Cleaning Instances:  10%|▉         | 110/1119 [09:28<1:11:04,  4.23s/it]

Processed: Label - 4, Instance - 1017


Cleaning Instances:  10%|▉         | 111/1119 [09:31<1:04:32,  3.84s/it]

Processed: Label - 4, Instance - 1036


Cleaning Instances:  10%|█         | 112/1119 [09:36<1:11:15,  4.25s/it]        

Processed: Label - 4, Instance - 1047


Cleaning Instances:  10%|█         | 113/1119 [09:41<1:12:00,  4.29s/it]

Processed: Label - 4, Instance - 1040


Cleaning Instances:  10%|█         | 114/1119 [09:45<1:11:53,  4.29s/it]

Processed: Label - 4, Instance - 1042


Cleaning Instances:  10%|█         | 115/1119 [09:52<1:23:18,  4.98s/it]        

Processed: Label - 3, Instance - 1060


Cleaning Instances:  10%|█         | 116/1119 [09:57<1:23:41,  5.01s/it]

Processed: Label - 4, Instance - 1010


Cleaning Instances:  10%|█         | 117/1119 [10:01<1:22:09,  4.92s/it]

Processed: Label - 4, Instance - 1033


Cleaning Instances:  11%|█         | 118/1119 [10:05<1:17:06,  4.62s/it]

Processed: Label - 4, Instance - 1045


Cleaning Instances:  11%|█         | 119/1119 [10:11<1:19:29,  4.77s/it]

Processed: Label - 3, Instance - 1055


Cleaning Instances:  11%|█         | 120/1119 [10:14<1:13:11,  4.40s/it]

Processed: Label - 4, Instance - 1009


In [None]:
### Transform Class Column to Binary Classification
def transform_class_column_in_cleaned_data(base_path, output_path):
    """
    Transforms the 'class' column into a binary classification column ('target').

    Walks base_path/<label_dir>/<instance>.parquet, reads each instance, adds an
    integer 'target' column and writes the result to the same relative location
    under output_path (overwriting any previous output). The mapping is:

        class in {3, 4, 101, 102, 105, 106, 107, 108, 109} -> target = 1
        any other class value, including null              -> target = 0

    Instances without a 'class' column are skipped with a message. A failure on
    one instance is printed and the loop continues with the next instance.

    Parameters:
        base_path (str): Directory containing the cleaned instances.
        output_path (str): Directory path to save transformed instances.

    Returns:
        None
    """
    # Class codes that map to target = 1; everything else falls through to 0.
    positive_classes = (3, 4, 101, 102, 105, 106, 107, 108, 109)

    for label_dir in tqdm(os.listdir(base_path), desc="Transforming Class Column"):
        label_path = os.path.join(base_path, label_dir)
        if not os.path.isdir(label_path):
            continue

        for instance_dir in os.listdir(label_path):
            instance_path = os.path.join(label_path, instance_dir)
            if not instance_path.endswith(".parquet"):
                continue

            try:
                instance_df = spark.read.parquet(instance_path)
                if "class" not in instance_df.columns:
                    print(f"Skipping instance {instance_dir}: 'class' column not found.")
                    continue

                # Compare class codes numerically so that a string-typed column
                # matches whether a code is stored as "3" or "3.0".
                # NOTE(review): assumes 'class' holds numeric codes (possibly as
                # strings) - confirm against the staged data.
                class_code = col("class").cast("double")

                # A single when/otherwise is enough: the former explicit
                # "None, 1, 2, 5, 6, 7, 8, 9 -> 0" branch produced the same 0 as
                # the fallback, and isin(None, ...) cannot match a null class
                # anyway (a comparison with NULL is never true in SQL).
                updated_df = instance_df.withColumn(
                    "target",
                    when(class_code.isin(*positive_classes), 1).otherwise(0),
                )

                output_label_dir = os.path.join(output_path, label_dir)
                os.makedirs(output_label_dir, exist_ok=True)
                output_file = os.path.join(output_label_dir, instance_dir)
                updated_df.write.mode("overwrite").parquet(output_file)
                print(f"Processed: Label - {label_dir}, Instance - {instance_dir}")

            except Exception as e:
                # Best effort: report the failing instance and keep going.
                print(f"Error processing {instance_path}: {e}")
                continue

    print(f"Processed data saved to: {output_path}")

# Transform Class Column
# Read the "Stage 1" staging folder and write the instances with 'target' to "Stage 2".
transform_class_column_in_cleaned_data(
    base_path=os.path.join(STAGED_DATA_PATH, "Stage 1"),
    output_path=os.path.join(STAGED_DATA_PATH, "Stage 2"),
)

Transforming Class Column:   0%|          | 0/10 [00:00<?, ?it/s]

Processed: Label - Label_9, Instance - Instance_11.parquet
Processed: Label - Label_9, Instance - Instance_5.parquet
Processed: Label - Label_9, Instance - Instance_18.parquet
Processed: Label - Label_9, Instance - Instance_45.parquet
Processed: Label - Label_9, Instance - Instance_55.parquet
Processed: Label - Label_9, Instance - Instance_27.parquet
Processed: Label - Label_9, Instance - Instance_37.parquet
Processed: Label - Label_9, Instance - Instance_19.parquet
Processed: Label - Label_9, Instance - Instance_54.parquet
Processed: Label - Label_9, Instance - Instance_44.parquet
Processed: Label - Label_9, Instance - Instance_36.parquet
Processed: Label - Label_9, Instance - Instance_26.parquet


                                                                                

Processed: Label - Label_9, Instance - Instance_10.parquet
Processed: Label - Label_9, Instance - Instance_4.parquet
Processed: Label - Label_9, Instance - Instance_24.parquet
Processed: Label - Label_9, Instance - Instance_34.parquet
Processed: Label - Label_9, Instance - Instance_46.parquet
Processed: Label - Label_9, Instance - Instance_56.parquet
Processed: Label - Label_9, Instance - Instance_6.parquet
Processed: Label - Label_9, Instance - Instance_12.parquet
Processed: Label - Label_9, Instance - Instance_7.parquet
Processed: Label - Label_9, Instance - Instance_13.parquet
Processed: Label - Label_9, Instance - Instance_35.parquet
Processed: Label - Label_9, Instance - Instance_25.parquet
Processed: Label - Label_9, Instance - Instance_47.parquet
Processed: Label - Label_9, Instance - Instance_20.parquet
Processed: Label - Label_9, Instance - Instance_30.parquet
Processed: Label - Label_9, Instance - Instance_42.parquet
Processed: Label - Label_9, Instance - Instance_52.parquet


                                                                                

Processed: Label - Label_9, Instance - Instance_2.parquet
Processed: Label - Label_9, Instance - Instance_16.parquet
Processed: Label - Label_9, Instance - Instance_28.parquet
Processed: Label - Label_9, Instance - Instance_38.parquet
Processed: Label - Label_9, Instance - Instance_3.parquet
Processed: Label - Label_9, Instance - Instance_17.parquet
Processed: Label - Label_9, Instance - Instance_31.parquet
Processed: Label - Label_9, Instance - Instance_21.parquet
Processed: Label - Label_9, Instance - Instance_53.parquet
Processed: Label - Label_9, Instance - Instance_43.parquet
Processed: Label - Label_9, Instance - Instance_48.parquet
Processed: Label - Label_9, Instance - Instance_15.parquet
Processed: Label - Label_9, Instance - Instance_1.parquet
Processed: Label - Label_9, Instance - Instance_41.parquet
Processed: Label - Label_9, Instance - Instance_51.parquet
Processed: Label - Label_9, Instance - Instance_23.parquet
Processed: Label - Label_9, Instance - Instance_33.parquet


Transforming Class Column:  10%|█         | 1/10 [00:37<05:39, 37.70s/it]

Processed: Label - Label_9, Instance - Instance_0.parquet
Processed: Label - Label_7, Instance - Instance_653.parquet
Processed: Label - Label_7, Instance - Instance_675.parquet
Processed: Label - Label_7, Instance - Instance_665.parquet
Processed: Label - Label_7, Instance - Instance_681.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_664.parquet
Processed: Label - Label_7, Instance - Instance_674.parquet
Processed: Label - Label_7, Instance - Instance_680.parquet
Processed: Label - Label_7, Instance - Instance_652.parquet
Processed: Label - Label_7, Instance - Instance_676.parquet
Processed: Label - Label_7, Instance - Instance_666.parquet
Processed: Label - Label_7, Instance - Instance_682.parquet
Processed: Label - Label_7, Instance - Instance_659.parquet
Processed: Label - Label_7, Instance - Instance_651.parquet
Processed: Label - Label_7, Instance - Instance_667.parquet
Processed: Label - Label_7, Instance - Instance_677.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_683.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_658.parquet
Processed: Label - Label_7, Instance - Instance_686.parquet
Processed: Label - Label_7, Instance - Instance_672.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_662.parquet
Processed: Label - Label_7, Instance - Instance_654.parquet
Processed: Label - Label_7, Instance - Instance_655.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_663.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_673.parquet
Processed: Label - Label_7, Instance - Instance_657.parquet
Processed: Label - Label_7, Instance - Instance_668.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_678.parquet
Processed: Label - Label_7, Instance - Instance_685.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_671.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_661.parquet
Processed: Label - Label_7, Instance - Instance_684.parquet
Processed: Label - Label_7, Instance - Instance_660.parquet
Processed: Label - Label_7, Instance - Instance_670.parquet


                                                                                

Processed: Label - Label_7, Instance - Instance_656.parquet
Processed: Label - Label_7, Instance - Instance_679.parquet


Transforming Class Column:  20%|██        | 2/10 [01:06<04:21, 32.73s/it]

Processed: Label - Label_7, Instance - Instance_669.parquet
Processed: Label - Label_0, Instance - Instance_245.parquet
Processed: Label - Label_0, Instance - Instance_255.parquet
Processed: Label - Label_0, Instance - Instance_410.parquet
Processed: Label - Label_0, Instance - Instance_568.parquet
Processed: Label - Label_0, Instance - Instance_400.parquet
Processed: Label - Label_0, Instance - Instance_578.parquet
Processed: Label - Label_0, Instance - Instance_123.parquet
Processed: Label - Label_0, Instance - Instance_133.parquet
Processed: Label - Label_0, Instance - Instance_384.parquet
Processed: Label - Label_0, Instance - Instance_394.parquet
Processed: Label - Label_0, Instance - Instance_141.parquet
Processed: Label - Label_0, Instance - Instance_151.parquet
Processed: Label - Label_0, Instance - Instance_63.parquet
Processed: Label - Label_0, Instance - Instance_73.parquet
Processed: Label - Label_0, Instance - Instance_472.parquet
Processed: Label - Label_0, Instance - Ins

                                                                                

Processed: Label - Label_0, Instance - Instance_454.parquet
Processed: Label - Label_0, Instance - Instance_444.parquet
Processed: Label - Label_0, Instance - Instance_201.parquet
Processed: Label - Label_0, Instance - Instance_379.parquet
Processed: Label - Label_0, Instance - Instance_211.parquet
Processed: Label - Label_0, Instance - Instance_369.parquet
Processed: Label - Label_0, Instance - Instance_167.parquet
Processed: Label - Label_0, Instance - Instance_177.parquet
Processed: Label - Label_0, Instance - Instance_595.parquet
Processed: Label - Label_0, Instance - Instance_585.parquet
Processed: Label - Label_0, Instance - Instance_628.parquet
Processed: Label - Label_0, Instance - Instance_638.parquet
Processed: Label - Label_0, Instance - Instance_105.parquet
Processed: Label - Label_0, Instance - Instance_115.parquet
Processed: Label - Label_0, Instance - Instance_263.parquet
Processed: Label - Label_0, Instance - Instance_273.parquet
Processed: Label - Label_0, Instance - I

                                                                                

Processed: Label - Label_0, Instance - Instance_570.parquet
Processed: Label - Label_0, Instance - Instance_408.parquet
Processed: Label - Label_0, Instance - Instance_335.parquet
Processed: Label - Label_0, Instance - Instance_325.parquet
Processed: Label - Label_0, Instance - Instance_606.parquet
Processed: Label - Label_0, Instance - Instance_616.parquet
Processed: Label - Label_0, Instance - Instance_584.parquet
Processed: Label - Label_0, Instance - Instance_594.parquet
Processed: Label - Label_0, Instance - Instance_368.parquet
Processed: Label - Label_0, Instance - Instance_210.parquet
Processed: Label - Label_0, Instance - Instance_378.parquet
Processed: Label - Label_0, Instance - Instance_200.parquet
Processed: Label - Label_0, Instance - Instance_445.parquet
Processed: Label - Label_0, Instance - Instance_455.parquet
Processed: Label - Label_0, Instance - Instance_176.parquet
Processed: Label - Label_0, Instance - Instance_166.parquet
Processed: Label - Label_0, Instance - I

                                                                                

Processed: Label - Label_0, Instance - Instance_642.parquet
Processed: Label - Label_0, Instance - Instance_180.parquet
Processed: Label - Label_0, Instance - Instance_190.parquet
Processed: Label - Label_0, Instance - Instance_614.parquet
Processed: Label - Label_0, Instance - Instance_604.parquet
Processed: Label - Label_0, Instance - Instance_139.parquet
Processed: Label - Label_0, Instance - Instance_129.parquet
Processed: Label - Label_0, Instance - Instance_327.parquet
Processed: Label - Label_0, Instance - Instance_337.parquet
Processed: Label - Label_0, Instance - Instance_572.parquet


                                                                                

Processed: Label - Label_0, Instance - Instance_562.parquet
Processed: Label - Label_0, Instance - Instance_510.parquet
Processed: Label - Label_0, Instance - Instance_468.parquet
Processed: Label - Label_0, Instance - Instance_500.parquet
Processed: Label - Label_0, Instance - Instance_478.parquet
Processed: Label - Label_0, Instance - Instance_345.parquet
Processed: Label - Label_0, Instance - Instance_355.parquet
Processed: Label - Label_0, Instance - Instance_79.parquet
Processed: Label - Label_0, Instance - Instance_69.parquet


                                                                                

Processed: Label - Label_0, Instance - Instance_284.parquet
Processed: Label - Label_0, Instance - Instance_294.parquet
Processed: Label - Label_0, Instance - Instance_318.parquet
Processed: Label - Label_0, Instance - Instance_260.parquet
Processed: Label - Label_0, Instance - Instance_308.parquet
Processed: Label - Label_0, Instance - Instance_270.parquet
Processed: Label - Label_0, Instance - Instance_435.parquet
Processed: Label - Label_0, Instance - Instance_425.parquet
Processed: Label - Label_0, Instance - Instance_106.parquet
Processed: Label - Label_0, Instance - Instance_116.parquet
Processed: Label - Label_0, Instance - Instance_596.parquet
Processed: Label - Label_0, Instance - Instance_586.parquet
Processed: Label - Label_0, Instance - Instance_164.parquet
Processed: Label - Label_0, Instance - Instance_174.parquet
Processed: Label - Label_0, Instance - Instance_649.parquet
Processed: Label - Label_0, Instance - Instance_457.parquet
Processed: Label - Label_0, Instance - I

                                                                                

Processed: Label - Label_0, Instance - Instance_212.parquet
Processed: Label - Label_0, Instance - Instance_509.parquet
Processed: Label - Label_0, Instance - Instance_471.parquet
Processed: Label - Label_0, Instance - Instance_519.parquet
Processed: Label - Label_0, Instance - Instance_461.parquet
Processed: Label - Label_0, Instance - Instance_224.parquet
Processed: Label - Label_0, Instance - Instance_234.parquet
Processed: Label - Label_0, Instance - Instance_142.parquet
Processed: Label - Label_0, Instance - Instance_152.parquet
Processed: Label - Label_0, Instance - Instance_60.parquet
Processed: Label - Label_0, Instance - Instance_70.parquet
Processed: Label - Label_0, Instance - Instance_387.parquet


                                                                                

Processed: Label - Label_0, Instance - Instance_397.parquet
Processed: Label - Label_0, Instance - Instance_199.parquet
Processed: Label - Label_0, Instance - Instance_189.parquet
Processed: Label - Label_0, Instance - Instance_120.parquet
Processed: Label - Label_0, Instance - Instance_130.parquet
Processed: Label - Label_0, Instance - Instance_246.parquet
Processed: Label - Label_0, Instance - Instance_256.parquet
Processed: Label - Label_0, Instance - Instance_413.parquet
Processed: Label - Label_0, Instance - Instance_403.parquet
Processed: Label - Label_0, Instance - Instance_650.parquet


                                                                                

Processed: Label - Label_0, Instance - Instance_640.parquet
Processed: Label - Label_0, Instance - Instance_536.parquet
Processed: Label - Label_0, Instance - Instance_526.parquet
Processed: Label - Label_0, Instance - Instance_363.parquet
Processed: Label - Label_0, Instance - Instance_373.parquet
Processed: Label - Label_0, Instance - Instance_301.parquet
Processed: Label - Label_0, Instance - Instance_279.parquet
Processed: Label - Label_0, Instance - Instance_311.parquet
Processed: Label - Label_0, Instance - Instance_269.parquet
Processed: Label - Label_0, Instance - Instance_554.parquet


                                                                                

Processed: Label - Label_0, Instance - Instance_544.parquet
Processed: Label - Label_0, Instance - Instance_632.parquet
Processed: Label - Label_0, Instance - Instance_622.parquet
Processed: Label - Label_0, Instance - Instance_84.parquet
Processed: Label - Label_0, Instance - Instance_94.parquet
Processed: Label - Label_0, Instance - Instance_495.parquet
Processed: Label - Label_0, Instance - Instance_485.parquet
Processed: Label - Label_0, Instance - Instance_235.parquet
Processed: Label - Label_0, Instance - Instance_225.parquet
Processed: Label - Label_0, Instance - Instance_460.parquet


                                                                                

Processed: Label - Label_0, Instance - Instance_518.parquet
Processed: Label - Label_0, Instance - Instance_470.parquet
Processed: Label - Label_0, Instance - Instance_508.parquet
Processed: Label - Label_0, Instance - Instance_71.parquet
Processed: Label - Label_0, Instance - Instance_61.parquet
Processed: Label - Label_0, Instance - Instance_153.parquet
Processed: Label - Label_0, Instance - Instance_143.parquet
Processed: Label - Label_0, Instance - Instance_131.parquet
Processed: Label - Label_0, Instance - Instance_121.parquet


                                                                                

Processed: Label - Label_0, Instance - Instance_402.parquet
Processed: Label - Label_0, Instance - Instance_412.parquet
Processed: Label - Label_0, Instance - Instance_257.parquet
Processed: Label - Label_0, Instance - Instance_247.parquet
Processed: Label - Label_0, Instance - Instance_396.parquet
Processed: Label - Label_0, Instance - Instance_386.parquet
Processed: Label - Label_0, Instance - Instance_188.parquet
Processed: Label - Label_0, Instance - Instance_198.parquet
Processed: Label - Label_0, Instance - Instance_641.parquet
Processed: Label - Label_0, Instance - Instance_372.parquet
Processed: Label - Label_0, Instance - Instance_362.parquet
Processed: Label - Label_0, Instance - Instance_527.parquet
Processed: Label - Label_0, Instance - Instance_537.parquet
Processed: Label - Label_0, Instance - Instance_95.parquet
Processed: Label - Label_0, Instance - Instance_85.parquet
Processed: Label - Label_0, Instance - Instance_484.parquet
Processed: Label - Label_0, Instance - Ins

                                                                                

Processed: Label - Label_0, Instance - Instance_310.parquet
Processed: Label - Label_0, Instance - Instance_278.parquet
Processed: Label - Label_0, Instance - Instance_300.parquet
Processed: Label - Label_0, Instance - Instance_623.parquet
Processed: Label - Label_0, Instance - Instance_633.parquet
Processed: Label - Label_0, Instance - Instance_128.parquet
Processed: Label - Label_0, Instance - Instance_138.parquet
Processed: Label - Label_0, Instance - Instance_605.parquet
Processed: Label - Label_0, Instance - Instance_615.parquet
Processed: Label - Label_0, Instance - Instance_563.parquet
Processed: Label - Label_0, Instance - Instance_573.parquet
Processed: Label - Label_0, Instance - Instance_336.parquet
Processed: Label - Label_0, Instance - Instance_326.parquet
Processed: Label - Label_0, Instance - Instance_191.parquet
Processed: Label - Label_0, Instance - Instance_181.parquet
Processed: Label - Label_0, Instance - Instance_295.parquet
Processed: Label - Label_0, Instance - I

                                                                                

Processed: Label - Label_0, Instance - Instance_469.parquet
Processed: Label - Label_0, Instance - Instance_511.parquet
Processed: Label - Label_0, Instance - Instance_68.parquet
Processed: Label - Label_0, Instance - Instance_78.parquet
Processed: Label - Label_0, Instance - Instance_424.parquet
Processed: Label - Label_0, Instance - Instance_434.parquet
Processed: Label - Label_0, Instance - Instance_271.parquet
Processed: Label - Label_0, Instance - Instance_309.parquet
Processed: Label - Label_0, Instance - Instance_261.parquet
Processed: Label - Label_0, Instance - Instance_319.parquet
Processed: Label - Label_0, Instance - Instance_117.parquet
Processed: Label - Label_0, Instance - Instance_107.parquet
Processed: Label - Label_0, Instance - Instance_57.parquet
Processed: Label - Label_0, Instance - Instance_648.parquet
Processed: Label - Label_0, Instance - Instance_175.parquet
Processed: Label - Label_0, Instance - Instance_165.parquet
Processed: Label - Label_0, Instance - Inst

                                                                                

Processed: Label - Label_0, Instance - Instance_366.parquet
Processed: Label - Label_0, Instance - Instance_168.parquet
Processed: Label - Label_0, Instance - Instance_178.parquet
Processed: Label - Label_0, Instance - Instance_645.parquet
Processed: Label - Label_0, Instance - Instance_627.parquet
Processed: Label - Label_0, Instance - Instance_637.parquet
Processed: Label - Label_0, Instance - Instance_314.parquet
Processed: Label - Label_0, Instance - Instance_304.parquet
Processed: Label - Label_0, Instance - Instance_541.parquet
Processed: Label - Label_0, Instance - Instance_439.parquet
Processed: Label - Label_0, Instance - Instance_551.parquet
Processed: Label - Label_0, Instance - Instance_429.parquet
Processed: Label - Label_0, Instance - Instance_480.parquet
Processed: Label - Label_0, Instance - Instance_490.parquet
Processed: Label - Label_0, Instance - Instance_91.parquet
Processed: Label - Label_0, Instance - Instance_81.parquet
Processed: Label - Label_0, Instance - Ins

                                                                                

Processed: Label - Label_0, Instance - Instance_552.parquet
Processed: Label - Label_0, Instance - Instance_624.parquet
Processed: Label - Label_0, Instance - Instance_634.parquet
Processed: Label - Label_0, Instance - Instance_109.parquet
Processed: Label - Label_0, Instance - Instance_119.parquet
Processed: Label - Label_0, Instance - Instance_646.parquet
Processed: Label - Label_0, Instance - Instance_59.parquet
Processed: Label - Label_0, Instance - Instance_458.parquet
Processed: Label - Label_0, Instance - Instance_520.parquet
Processed: Label - Label_0, Instance - Instance_448.parquet
Processed: Label - Label_0, Instance - Instance_530.parquet
Processed: Label - Label_0, Instance - Instance_375.parquet
Processed: Label - Label_0, Instance - Instance_365.parquet
Processed: Label - Label_0, Instance - Instance_599.parquet
Processed: Label - Label_0, Instance - Instance_589.parquet
Processed: Label - Label_0, Instance - Instance_136.parquet
Processed: Label - Label_0, Instance - In

Transforming Class Column:  30%|███       | 3/10 [07:13<21:37, 185.33s/it]

Processed: Label - Label_0, Instance - Instance_66.parquet
Processed: Label - Label_1, Instance - Instance_693.parquet
Processed: Label - Label_1, Instance - Instance_696.parquet
Processed: Label - Label_1, Instance - Instance_695.parquet


Transforming Class Column:  40%|████      | 4/10 [07:16<11:18, 113.11s/it]

Processed: Label - Label_1, Instance - Instance_694.parquet
Processed: Label - Label_6, Instance - Instance_688.parquet
Processed: Label - Label_6, Instance - Instance_691.parquet
Processed: Label - Label_6, Instance - Instance_690.parquet
Processed: Label - Label_6, Instance - Instance_689.parquet
Processed: Label - Label_6, Instance - Instance_692.parquet


Transforming Class Column:  50%|█████     | 5/10 [07:19<06:07, 73.54s/it] 

Processed: Label - Label_6, Instance - Instance_687.parquet


                                                                                

Processed: Label - Label_8, Instance - Instance_704.parquet
Processed: Label - Label_8, Instance - Instance_698.parquet
Processed: Label - Label_8, Instance - Instance_705.parquet
Processed: Label - Label_8, Instance - Instance_699.parquet


                                                                                

Processed: Label - Label_8, Instance - Instance_707.parquet


                                                                                

Processed: Label - Label_8, Instance - Instance_706.parquet


                                                                                

Processed: Label - Label_8, Instance - Instance_703.parquet
Processed: Label - Label_8, Instance - Instance_702.parquet


                                                                                

Processed: Label - Label_8, Instance - Instance_697.parquet


                                                                                

Processed: Label - Label_8, Instance - Instance_710.parquet
Processed: Label - Label_8, Instance - Instance_700.parquet
Processed: Label - Label_8, Instance - Instance_709.parquet
Processed: Label - Label_8, Instance - Instance_708.parquet


Transforming Class Column:  60%|██████    | 6/10 [07:31<03:30, 52.68s/it]       

Processed: Label - Label_8, Instance - Instance_701.parquet
Processed: Label - Label_3, Instance - Instance_1085.parquet
Processed: Label - Label_3, Instance - Instance_1071.parquet
Processed: Label - Label_3, Instance - Instance_1061.parquet
Processed: Label - Label_3, Instance - Instance_1057.parquet
Processed: Label - Label_3, Instance - Instance_1068.parquet
Processed: Label - Label_3, Instance - Instance_1078.parquet
Processed: Label - Label_3, Instance - Instance_1056.parquet
Processed: Label - Label_3, Instance - Instance_1079.parquet
Processed: Label - Label_3, Instance - Instance_1069.parquet
Processed: Label - Label_3, Instance - Instance_1084.parquet
Processed: Label - Label_3, Instance - Instance_1060.parquet
Processed: Label - Label_3, Instance - Instance_1070.parquet
Processed: Label - Label_3, Instance - Instance_1054.parquet
Processed: Label - Label_3, Instance - Instance_1072.parquet
Processed: Label - Label_3, Instance - Instance_1062.parquet
Processed: Label - Label_

Transforming Class Column:  70%|███████   | 7/10 [07:50<02:05, 41.67s/it]

Processed: Label - Label_3, Instance - Instance_1080.parquet
Processed: Label - Label_4, Instance - Instance_776.parquet
Processed: Label - Label_4, Instance - Instance_766.parquet
Processed: Label - Label_4, Instance - Instance_803.parquet
Processed: Label - Label_4, Instance - Instance_813.parquet
Processed: Label - Label_4, Instance - Instance_861.parquet
Processed: Label - Label_4, Instance - Instance_919.parquet
Processed: Label - Label_4, Instance - Instance_871.parquet
Processed: Label - Label_4, Instance - Instance_909.parquet
Processed: Label - Label_4, Instance - Instance_714.parquet
Processed: Label - Label_4, Instance - Instance_944.parquet
Processed: Label - Label_4, Instance - Instance_954.parquet
Processed: Label - Label_4, Instance - Instance_749.parquet
Processed: Label - Label_4, Instance - Instance_759.parquet
Processed: Label - Label_4, Instance - Instance_885.parquet
Processed: Label - Label_4, Instance - Instance_895.parquet
Processed: Label - Label_4, Instance - 

                                                                                

Processed: Label - Label_4, Instance - Instance_760.parquet
Processed: Label - Label_4, Instance - Instance_770.parquet
Processed: Label - Label_4, Instance - Instance_1048.parquet
Processed: Label - Label_4, Instance - Instance_746.parquet
Processed: Label - Label_4, Instance - Instance_756.parquet
Processed: Label - Label_4, Instance - Instance_833.parquet
Processed: Label - Label_4, Instance - Instance_823.parquet
Processed: Label - Label_4, Instance - Instance_929.parquet
Processed: Label - Label_4, Instance - Instance_851.parquet
Processed: Label - Label_4, Instance - Instance_939.parquet
Processed: Label - Label_4, Instance - Instance_841.parquet
Processed: Label - Label_4, Instance - Instance_990.parquet
Processed: Label - Label_4, Instance - Instance_980.parquet
Processed: Label - Label_4, Instance - Instance_724.parquet
Processed: Label - Label_4, Instance - Instance_734.parquet
Processed: Label - Label_4, Instance - Instance_974.parquet
Processed: Label - Label_4, Instance - 

                                                                                

Processed: Label - Label_4, Instance - Instance_976.parquet
Processed: Label - Label_4, Instance - Instance_1053.parquet
Processed: Label - Label_4, Instance - Instance_1043.parquet
Processed: Label - Label_4, Instance - Instance_993.parquet
Processed: Label - Label_4, Instance - Instance_983.parquet
Processed: Label - Label_4, Instance - Instance_727.parquet
Processed: Label - Label_4, Instance - Instance_737.parquet
Processed: Label - Label_4, Instance - Instance_852.parquet
Processed: Label - Label_4, Instance - Instance_842.parquet
Processed: Label - Label_4, Instance - Instance_830.parquet
Processed: Label - Label_4, Instance - Instance_948.parquet
Processed: Label - Label_4, Instance - Instance_820.parquet
Processed: Label - Label_4, Instance - Instance_958.parquet
Processed: Label - Label_4, Instance - Instance_745.parquet
Processed: Label - Label_4, Instance - Instance_755.parquet
Processed: Label - Label_4, Instance - Instance_889.parquet
Processed: Label - Label_4, Instance -

Transforming Class Column:  80%|████████  | 8/10 [11:04<03:00, 90.00s/it]

Processed: Label - Label_4, Instance - Instance_1029.parquet
Processed: Label - Label_5, Instance - Instance_1109.parquet
Processed: Label - Label_5, Instance - Instance_1110.parquet
Processed: Label - Label_5, Instance - Instance_1111.parquet
Processed: Label - Label_5, Instance - Instance_1118.parquet
Processed: Label - Label_5, Instance - Instance_1108.parquet
Processed: Label - Label_5, Instance - Instance_1113.parquet
Processed: Label - Label_5, Instance - Instance_1112.parquet
Processed: Label - Label_5, Instance - Instance_1117.parquet
Processed: Label - Label_5, Instance - Instance_1116.parquet
Processed: Label - Label_5, Instance - Instance_1114.parquet


Transforming Class Column:  90%|█████████ | 9/10 [11:10<01:03, 63.92s/it]

Processed: Label - Label_5, Instance - Instance_1115.parquet
Processed: Label - Label_2, Instance - Instance_1095.parquet
Processed: Label - Label_2, Instance - Instance_1100.parquet


                                                                                

Processed: Label - Label_2, Instance - Instance_1101.parquet
Processed: Label - Label_2, Instance - Instance_1094.parquet
Processed: Label - Label_2, Instance - Instance_1103.parquet
Processed: Label - Label_2, Instance - Instance_1096.parquet
Processed: Label - Label_2, Instance - Instance_1086.parquet
Processed: Label - Label_2, Instance - Instance_1087.parquet
Processed: Label - Label_2, Instance - Instance_1097.parquet
Processed: Label - Label_2, Instance - Instance_1102.parquet
Processed: Label - Label_2, Instance - Instance_1107.parquet
Processed: Label - Label_2, Instance - Instance_1092.parquet
Processed: Label - Label_2, Instance - Instance_1093.parquet
Processed: Label - Label_2, Instance - Instance_1106.parquet
Processed: Label - Label_2, Instance - Instance_1091.parquet
Processed: Label - Label_2, Instance - Instance_1104.parquet
Processed: Label - Label_2, Instance - Instance_1088.parquet
Processed: Label - Label_2, Instance - Instance_1098.parquet
Processed: Label - Label

Transforming Class Column: 100%|██████████| 10/10 [11:24<00:00, 68.45s/it]

Processed: Label - Label_2, Instance - Instance_1090.parquet
Processed data saved to: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2





In [None]:
### Combine Partitioned Data
def combine_partitioned_data_with_aligned_schema(base_path: str, original_df: "DataFrame", output_file: str):
    """
    Combines partitioned Parquet files into a single dataset while aligning schemas.

    The function walks ``base_path/<sub-directory>/<file>.parquet``, reads every
    Parquet entry with the notebook-level ``spark`` session, pads each one with
    null columns so that it exposes every column of ``original_df`` plus
    ``target``, and unions the results. Columns whose type is still ``void``
    after all unions are dropped before the result is written to ``output_file``
    (overwriting any previous output).

    Parameters:
        base_path (str): Directory containing partitioned Parquet files.
        original_df (DataFrame): Original DataFrame to infer the schema.
        output_file (str): Path to save the combined dataset.

    Returns:
        DataFrame: Combined Spark DataFrame, or None when no Parquet file was
        found or when an error interrupted the traversal or the write (the
        error is printed, not raised).
    """
    try:
        # Ordered, de-duplicated column list: original schema order followed by
        # 'target'. A list (rather than a set) keeps the output column order
        # identical from one interpreter run to the next.
        ordered_columns = list(dict.fromkeys(
            [field.name for field in original_df.schema.fields] + ["target"]
        ))

        combined_df = None
        print(f"Traversing base path: {base_path}")

        for label_dir in os.listdir(base_path):
            label_path = os.path.join(base_path, label_dir)
            if not os.path.isdir(label_path):
                continue

            for instance_file in os.listdir(label_path):
                instance_path = os.path.join(label_path, instance_file)
                if not instance_path.endswith(".parquet"):
                    continue

                try:
                    instance_df = spark.read.parquet(instance_path)
                    print(f"Reading instance file: {instance_path}")

                    # Align schema: add missing columns as untyped null placeholders
                    present_columns = set(instance_df.columns)
                    for col_name in ordered_columns:
                        if col_name not in present_columns:
                            instance_df = instance_df.withColumn(col_name, lit(None))

                    # Same column order for every file, so the positional union lines up
                    instance_df = instance_df.select(*ordered_columns)

                    # Combine with the existing DataFrame
                    if combined_df is None:
                        combined_df = instance_df
                    else:
                        combined_df = combined_df.union(instance_df)

                except Exception as e:
                    # Best effort: report the unreadable/incompatible file and keep going
                    print(f"Error reading {instance_path}: {type(e).__name__}: {e}")
                    continue

        if combined_df is None:
            print("No data to combine. Ensure the directory structure is correct and contains Parquet files.")
            return None

        # Drop VOID columns (placeholders that never received a real type)
        void_columns = [col_name for col_name, dtype in combined_df.dtypes if dtype == "void"]
        if void_columns:
            print(f"Dropping VOID columns: {void_columns}")
            combined_df = combined_df.drop(*void_columns)

        # Save the combined dataset
        print(f"Saving combined dataset to: {output_file}")
        output_dir = os.path.dirname(output_file)
        if output_dir:
            # dirname is '' for a bare file name; os.makedirs('') would raise
            os.makedirs(output_dir, exist_ok=True)
        combined_df.write.mode("overwrite").parquet(output_file)
        print(f"Combined dataset saved successfully at: {output_file}")

        return combined_df

    except Exception as e:
        print(f"Error combining partitioned data: {type(e).__name__}: {e}")
        return None

# Merge every Stage 2 instance file into the single Stage 3 Parquet dataset
combined_df_real = combine_partitioned_data_with_aligned_schema(
    base_path=os.path.join(STAGED_DATA_PATH, "Stage 2"),
    original_df=spark_df_real,
    output_file=os.path.join(STAGED_DATA_PATH, "Stage 3", "real_instances_clean.parquet"),
)

Traversing base path: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2
Reading instance file: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2/Label_9/Instance_11.parquet
Reading instance file: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2/Label_9/Instance_5.parquet
Reading instance file: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2/Label_9/Instance_18.parquet
Reading instance file: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2/Label_9/Instance_45.parquet
Reading instance file: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2/Label_9/Instance_55.parquet
Reading instance file: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2/Label_9/Instance_27.parquet
Reading instance file: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2/Label_9/Instance_37.parquet
Reading instance file: ../Cleaning & Preparation/Staged Cleaning Data/Stage 2/Label_9/Instance_19.parquet
Reading instance file: ../Cleaning & Preparation/Staged Cleaning Data/Stage 

24/11/27 02:44:35 WARN DAGScheduler: Broadcasting large task binary with size 19.6 MiB
                                                                                

Combined dataset saved successfully at: ../Cleaning & Preparation/Staged Cleaning Data/Stage 3/real_instances_clean.parquet


In [None]:
# Load the Stage 3 dataset (the per-instance files combined into one Parquet dataset)
# back from disk as the starting point for the missing-value handling below.
df_clean = spark.read.parquet(os.path.join(STAGED_DATA_PATH, "Stage 3", "real_instances_clean.parquet"))

24/11/27 02:58:24 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.


In [None]:
# Add Missingness Indicators
def create_missingness_columns(df, feature_names):
    """
    Adds missingness indicator columns for each feature in the dataset.

    For every feature ``f`` a column ``f_missing`` is produced holding 1 where
    ``f`` is null and 0 otherwise. New indicators are appended in a single
    projection instead of one ``withColumn`` call per feature, which keeps the
    query plan small when there are many features.

    Parameters:
        df (DataFrame): Input Spark DataFrame.
        feature_names (list): List of feature columns to process.

    Returns:
        DataFrame: Spark DataFrame with missingness indicator columns added.
        The input is returned unchanged when ``feature_names`` is empty.
    """
    # Keep the first occurrence of each name so a repeated feature cannot
    # produce two indicator columns with the same name.
    features = list(dict.fromkeys(feature_names))
    if not features:
        return df

    # Compared case-insensitively so any possible name clash is routed through
    # withColumn, which replaces the existing column as the per-feature loop did.
    existing_columns = {name.lower() for name in df.columns}

    new_indicators = []
    for feature in features:
        indicator_name = f"{feature}_missing"
        indicator = when(col(feature).isNull(), lit(1)).otherwise(lit(0))
        if indicator_name.lower() in existing_columns:
            # Re-running on a frame that already has the indicator: overwrite in place
            df = df.withColumn(indicator_name, indicator)
        else:
            new_indicators.append(indicator.alias(indicator_name))

    if new_indicators:
        df = df.select("*", *new_indicators)
    return df

# Flag null entries of every feature before any value is imputed
df_with_missingness = create_missingness_columns(
    df=df_clean,
    feature_names=feature_names,
)

In [None]:
# Impute Continuous Features
def impute_continuous_features(df, features_continuous, strategy="mean"):
    """
    Imputes missing values in continuous features using the specified strategy.

    Null entries of each listed column are replaced by that column's mean or
    approximate median (relative error 0.01). A column for which no statistic
    can be computed (for example, it holds no non-null value) is left as is.

    Parameters:
        df (DataFrame): Input Spark DataFrame.
        features_continuous (list): List of continuous feature columns to impute.
        strategy (str): Imputation strategy ("mean" or "median").

    Returns:
        DataFrame: Spark DataFrame with imputed continuous features.

    Raises:
        ValueError: If ``strategy`` is neither "mean" nor "median". The check is
        made up front, so it also applies when ``features_continuous`` is empty.
    """
    if strategy not in ("mean", "median"):
        raise ValueError("Unsupported strategy. Use 'mean' or 'median'.")

    features = list(features_continuous)
    if not features:
        return df

    if strategy == "mean":
        # One aggregation job for all columns instead of one job per column.
        # The result row is read by position, so it lines up with `features`.
        stats_row = df.agg(*[mean(col(feature)) for feature in features]).first()
        fill_values = dict(zip(features, stats_row))
    else:
        fill_values = {}
        for feature in features:
            # approxQuantile skips nulls and yields an empty list when nothing
            # is left, so no separate count() job is needed to detect that case.
            quantiles = df.approxQuantile(feature, [0.5], 0.01)
            fill_values[feature] = quantiles[0] if quantiles else None

    for feature, imputed_value in fill_values.items():
        if imputed_value is None:
            continue
        df = df.withColumn(
            feature, when(col(feature).isNull(), lit(imputed_value)).otherwise(col(feature))
        )
    return df

# Fill nulls of the continuous features with the per-column mean
df_imputed_continuous = impute_continuous_features(
    df=df_with_missingness,
    features_continuous=features_continuous,
    strategy="mean",
)

                                                                                

In [None]:
# Impute Categorical Features
def impute_categorical_features(df, features_categorical):
    """
    Imputes missing values in categorical features using the mode (most frequent value).

    The mode is computed over the non-null values of each column. When several
    values share the highest frequency, the smallest one (ascending order of the
    column) is used so repeated runs pick the same value. A column with no
    non-null value has no mode and is left unchanged.

    Parameters:
        df (DataFrame): Input Spark DataFrame.
        features_categorical (list): List of categorical feature columns to impute.

    Returns:
        DataFrame: Spark DataFrame with imputed categorical features.
    """
    for feature in features_categorical:
        top_row = (
            df.filter(col(feature).isNotNull())
              .groupBy(feature)
              .agg(count("*").alias("freq"))
              .orderBy(col("freq").desc(), col(feature).asc())
              .select(feature)
              .first()
        )
        # first() returns None when the column holds only nulls; indexing it
        # would raise a TypeError, so skip the column instead.
        if top_row is None:
            continue

        mode_value = top_row[0]
        df = df.withColumn(
            feature, when(col(feature).isNull(), lit(mode_value)).otherwise(col(feature))
        )
    return df

# Fill nulls of the categorical features with the per-column mode
df_clean = impute_categorical_features(
    df=df_imputed_continuous,
    features_categorical=features_categorical,
)

                                                                                

In [None]:
# Persist the fully imputed dataset, replacing any previous output
cleaned_dataset_path = os.path.join(CLEANED_DATA_PATH, "cleaned_data.parquet")
df_clean.write.mode("overwrite").parquet(cleaned_dataset_path)

                                                                                

# Split Data

In [None]:
### Data Splitting into Train and Test Sets
def split_data(df, train_ratio=0.8):
    """
    Splits the DataFrame into train and test sets based on 'Instance', ensuring no overlap.

    Instances are shuffled with a fixed seed and split separately within each
    'label':
      - labels with more than 4 instances: instances whose shuffled position is
        at most ``total_instances * train_ratio`` go to train, the rest to test;
      - labels with 4 or fewer instances: every instance except the last one
        goes to train and the last one goes to test (a label with a single
        instance therefore ends up only in the test set).

    An Instance that appears under more than one label receives one assignment
    per label. If those assignments disagree, the Instance is kept in train
    only, so the two returned sets never share an Instance and no row of `df`
    is duplicated by the join.

    Parameters:
        df (DataFrame): Input Spark DataFrame with 'Instance' and 'label' columns.
        train_ratio (float): Target share of instances per label placed in the
            train set; must be between 0 and 1 (inclusive).

    Returns:
        tuple: (train_df, test_df), the rows of `df` belonging to the train and
        test instances respectively.

    Raises:
        ValueError: If train_ratio is not a number between 0 and 1.
    """
    # Reject unusable ratios up front instead of silently producing an
    # all-train or all-test split.
    try:
        ratio_is_valid = 0 <= train_ratio <= 1
    except TypeError:
        ratio_is_valid = False
    if not ratio_is_valid:
        raise ValueError(f"train_ratio must be a number between 0 and 1, got {train_ratio!r}.")

    # Get distinct instances
    instances_df = df.select("Instance", "label").distinct()

    # Assign a random number to each instance
    # NOTE(review): the seed is fixed, but Spark documents rand() as
    # non-deterministic in general; confirm reproducibility if the exact split matters.
    instances_df = instances_df.withColumn("rand", rand(seed=42))

    # Rank instances within each label and record the label's size so that
    # labels with very few instances can be handled separately
    window = Window.partitionBy("label").orderBy("rand")
    instances_df = instances_df.withColumn("row_number", row_number().over(window))
    instances_df = instances_df.withColumn("total_instances", F.count("*").over(Window.partitionBy("label")))

    # Define train and test indices
    instances_df = instances_df.withColumn(
        "split",
        when(
            (col("total_instances") > 4) & (col("row_number") <= col("total_instances") * train_ratio),
            "train"
        ).when(
            (col("total_instances") > 4),
            "test"
        ).when(
            # Small labels: hold out exactly one instance for testing
            (col("total_instances") <= 4) & (col("row_number") < col("total_instances")),
            "train"
        ).otherwise("test")
    )

    # Extract train and test instances. De-duplicate so the joins below cannot
    # multiply rows, and drop from test anything already assigned to train so
    # the two sets are disjoint even when an Instance carries several labels.
    train_instances = instances_df.filter(col("split") == "train").select("Instance").distinct()
    test_instances = (
        instances_df.filter(col("split") == "test")
        .select("Instance")
        .distinct()
        .join(train_instances, on="Instance", how="left_anti")
    )

    # Join back to the main DataFrame
    train_df = df.join(train_instances, on="Instance", how="inner")
    test_df = df.join(test_instances, on="Instance", how="inner")

    return train_df, test_df

# Reload the cleaned dataset from its parquet file (rebinding df_clean) and
# split it into train and test sets with the default train_ratio.
df_clean = spark.read.parquet(os.path.join(CLEANED_DATA_PATH, "cleaned_data.parquet"))
train_df, test_df = split_data(df_clean)

In [None]:
# Write both splits to parquet, replacing any previous output
train_data_path = os.path.join(TRAIN_TEST_DATA_PATH, "train_data.parquet")
test_data_path = os.path.join(TRAIN_TEST_DATA_PATH, "test_data.parquet")

train_df.write.mode("overwrite").parquet(train_data_path)
test_df.write.mode("overwrite").parquet(test_data_path)

24/11/27 08:03:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [None]:
### Verify Splits
def verify_splits(train_df, test_df):
    """
    Prints a sanity report for a train/test split.

    Reports how many distinct 'Instance' values occur in both DataFrames, then
    shows the number of distinct instances per 'label' for each split.

    Parameters:
        train_df (DataFrame): Training split with 'Instance' and 'label' columns.
        test_df (DataFrame): Test split with 'Instance' and 'label' columns.
    """
    # Instances that occur in both splits
    shared_instances = (
        train_df.select("Instance").distinct()
        .intersect(test_df.select("Instance").distinct())
    )
    overlap_count = shared_instances.count()
    if overlap_count <= 0:
        print("No overlap in Instances between train and test sets.")
    else:
        print(f"Overlap detected in Instances: {overlap_count}")

    # Distinct instances per label, train first and then test
    reports = (
        ("Train set counts per label:", train_df),
        ("Test set counts per label:", test_df),
    )
    for heading, split_df in reports:
        print(heading)
        split_df.groupBy("label").agg(countDistinct("Instance").alias("instance_count")).show()

# Re-read both splits from the parquet files (rebinding train_df / test_df)
# and run the overlap and per-label checks on what was actually written.
train_df = spark.read.parquet(train_data_path)
test_df = spark.read.parquet(test_data_path)
verify_splits(train_df, test_df)

                                                                                

No overlap in Instances between train and test sets.
Train set counts per label:
+-----+--------------+
|label|instance_count|
+-----+--------------+
|    9|            45|
|    3|            25|
|    4|           274|
|    6|             4|
|    2|            17|
|    7|            28|
|    8|            11|
|    5|             8|
|    0|           475|
|    1|             3|
+-----+--------------+

Test set counts per label:
+-----+--------------+
|label|instance_count|
+-----+--------------+
|    9|            12|
|    3|             7|
|    4|            69|
|    6|             2|
|    2|             5|
|    7|             8|
|    5|             3|
|    8|             3|
|    0|           119|
|    1|             1|
+-----+--------------+



# Scale Features

In [None]:
### Apply Standard Scaler to Train and Test Data
def scale_features(train_df, test_df, feature_names):
    """
    Standardizes the given feature columns of the train and test DataFrames.

    A StandardScaler (centering and unit-variance scaling) is fitted on the
    training data only and then applied to both splits. The scaled values
    replace the original feature columns; helper vector/array columns are not
    returned.

    Rows for which the VectorAssembler finds invalid values are dropped
    (handleInvalid="skip").

    Parameters:
        train_df (DataFrame): Training split; the scaler is fitted on it.
        test_df (DataFrame): Test split; scaled with the statistics of train_df.
        feature_names (list): Names of the numeric columns to scale.

    Returns:
        tuple: (train_final_scaled, test_final_scaled, scaler_model). Both
        DataFrames contain the non-feature columns of train_df (in train_df's
        order) followed by the scaled features in the order of feature_names.

    Raises:
        ValueError: If feature_names is empty.
    """
    feature_names = list(feature_names)
    if not feature_names:
        raise ValueError("feature_names must contain at least one column to scale.")

    # Assemble features
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features_vector", handleInvalid="skip")

    # Fit scaler on training data
    train_vectorized = assembler.transform(train_df)
    scaler = StandardScaler(inputCol="features_vector", outputCol="scaled_features", withStd=True, withMean=True)
    scaler_model = scaler.fit(train_vectorized)
    train_scaled = scaler_model.transform(train_vectorized)

    # Apply scaler to test data
    test_vectorized = assembler.transform(test_df)
    test_scaled = scaler_model.transform(test_vectorized)

    # Columns carried through untouched. Derived from the input itself rather
    # than from a notebook-level variable so the function is self-contained.
    feature_set = set(feature_names)
    passthrough_columns = [col_name for col_name in train_df.columns if col_name not in feature_set]

    # Convert scaled features to array and rebuild the feature columns
    def extract_scaled_features(df):
        df = df.withColumn("scaled_features_array", vector_to_array(col("scaled_features")))
        # One select instead of a withColumn per feature: repeated withColumn
        # calls each add a projection and can produce very large query plans.
        scaled_columns = [
            col("scaled_features_array")[idx].alias(feature)
            for idx, feature in enumerate(feature_names)
        ]
        return df.select(*passthrough_columns, *scaled_columns)

    train_final_scaled = extract_scaled_features(train_scaled)
    test_final_scaled = extract_scaled_features(test_scaled)

    return train_final_scaled, test_final_scaled, scaler_model

# Scale the feature columns of both splits and keep the fitted scaler model
# alongside the two scaled DataFrames.
train_final_scaled, test_final_scaled, scaler_model = scale_features(train_df, test_df, feature_names)

                                                                                

In [None]:
# Save the fitted scaler model under BASE_DIR, overwriting any previous save
scaler_model_path = os.path.join(BASE_DIR, "scaler_model")
scaler_model.write().overwrite().save(scaler_model_path)

In [None]:
### Save Scaled Data
# Write the scaled train and test sets to parquet under SCALED_DATA_PATH,
# replacing any previous output.
scaled_train_path = os.path.join(SCALED_DATA_PATH, "scaled_train_data.parquet")
scaled_test_path = os.path.join(SCALED_DATA_PATH, "scaled_test_data.parquet")

train_final_scaled.write.mode("overwrite").parquet(scaled_train_path)
test_final_scaled.write.mode("overwrite").parquet(scaled_test_path)

                                                                                

In [None]:
### End Spark Session
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()