# Scania Failure Model - Data Cleaning & Preprocessing Pipeline

## 0. Imports and configuration 
Importing necessary libraries for data manipulation and pipelines.

In [105]:
import sys
import os
import pandas as pd

In [106]:
from src.data.preprocess import (
    load_aps_data, 
    encode_target, 
    get_cols_with_missing_threshold, 
    get_imputation_pipeline,
    get_scaling_pipeline,
    check_split_integrity
)
from src.feature_engineering import (
    drop_high_correlated_features, 
    drop_low_variance_features
)

# Relative paths of datasets
TRAIN_PATH = 'data/raw/aps_failure_training_set.csv'
TEST_PATH = 'data/raw/aps_failure_test_set.csv'

## 1. Data Loading and Manipulation

In [107]:
# Load datasets
df_train = load_aps_data(TRAIN_PATH)
df_test = load_aps_data(TEST_PATH)

# Define target value and seperate features (X) from target (Y) 
target_col = 'class'

X_train_raw = df_train.drop(columns=[target_col])
y_train_raw = df_train[target_col]

X_test_raw = df_test.drop(columns=[target_col])
y_test_raw = df_test[target_col]

# Verify raw data dimensions
print(f"Wymiary surowe Train: {X_train_raw.shape}")
print(f"Wymiary surowe Test:  {X_test_raw.shape}")

Wymiary surowe Train: (60000, 170)
Wymiary surowe Test:  (16000, 170)


In [108]:
# Mapping the text 'neg'/'pos' to 0/1
y_train = encode_target(y_train_raw)
y_test = encode_target(y_test_raw)

# Print info
print("Target zakodowany (0/1).")
print(y_train.value_counts())

Target zakodowany (0/1).
class
0    59000
1     1000
Name: count, dtype: int64


# 2. Handling missing values 
Dropping columns with more than 50% of missing values, imputating values using median strategy.

In [109]:
# Decide which columns to drop from training dataset
cols_to_drop = get_cols_with_missing_threshold(X_train_raw, threshold=0.5)

# Print info
print(f"Liczba kolumn do usunięcia (>50% NaN): {len(cols_to_drop)}")
print(f"Przykładowe usuwane kolumny: {cols_to_drop[:10]}")

# Drop columns from training dataset
X_train_dropped = X_train_raw.drop(columns=cols_to_drop)
# The model should have the same inputs, so the columns from test dataset must also be dropped. 
X_test_dropped = X_test_raw.drop(columns=cols_to_drop)

print(f"Wymiary po usunięciu kolumn: {X_train_dropped.shape}")

Liczba kolumn do usunięcia (>50% NaN): 8
Przykładowe usuwane kolumny: ['ab_000', 'bm_000', 'bn_000', 'bo_000', 'bp_000', 'bq_000', 'br_000', 'cr_000']
Wymiary po usunięciu kolumn: (60000, 162)


In [110]:
# Initalize the imputation pipeline
imputer_pipeline = get_imputation_pipeline(strategy='median')

# Fit the imputer on training data to calculate the median for each feature
imputer_pipeline.fit(X_train_dropped)

# Fill missing values in both datasets using the medians
X_train_imputed = imputer_pipeline.transform(X_train_dropped)
X_test_imputed = imputer_pipeline.transform(X_test_dropped)

# Reconstruct DataFrames to restore column names lost during transformation
X_train_to_filter = pd.DataFrame(X_train_imputed, columns=X_train_dropped.columns)
X_test_to_filter = pd.DataFrame(X_test_imputed, columns=X_test_dropped.columns)


## 3. Feature Selection and Dimensionality Reduction

### Removing low-variance, highly correlated features

In [120]:
# Remove low variance features
X_train_filtered=drop_low_variance_features(X_train_to_filter, threshold=0.01)
X_test_filtered=X_test_to_filter[X_train_filtered.columns]
# Remove highly correlated features 
X_train_reduced=drop_high_correlated_features(X_train_filtered, threshold=0.9)
X_test_reduced=X_test_filtered[X_train_reduced.columns]

# Print info
print(f"Kształt przed wyborem: {X_train_to_filter.shape[1]}")
print(f"Kształt po wyborze: {X_train_reduced.shape[1]}")

Kształt przed wyborem: 162
Kształt po wyborze: 123


### PCA

In [121]:
# Initialize the standarization pipeline
scaler_pipeline = get_scaling_pipeline()

# Fit the scaler on training data. This must be performed after handling missing values.
scaler_pipeline.fit(X_train_imputed) 

# Transform both training and test sets 
X_train_scaled = scaler_pipeline.transform(X_train_imputed)
X_test_scaled = scaler_pipeline.transform(X_test_imputed)

# Reconstruct DataFrames to restore column names lost during transformation
X_train_final = pd.DataFrame(X_train_scaled, columns=X_train_dropped.columns)
X_test_final = pd.DataFrame(X_test_scaled, columns=X_test_dropped.columns)


# Validation: Verify that scaling resulted in mean ~ 0, std ~ 1. Print info
for i in range(X_train_final.shape[1]):
    if abs(X_train_final.iloc[:, i].mean()) > 0.0001 or abs(X_train_final.iloc[:, i].std() - 1) > 0.0001:
        print(f"    UWAGA: w {X_train_final.columns[i]}")
        print(f"    Średnia wyszła: {X_train_final.iloc[:, i].mean():.6f} (oczekiwano ~0)")
        print(f"    Odchylenie wyszło: {X_train_final.iloc[:, i].std():.6f} (oczekiwano ~1)")
    else:
        continue

    UWAGA: w cd_000
    Średnia wyszła: 0.000000 (oczekiwano ~0)
    Odchylenie wyszło: 0.000000 (oczekiwano ~1)


In [114]:
# Check train/test split integrality
check_split_integrity(X_train_final, X_test_final, y_train, y_test)

Kształt X_train: (60000, 162)
Kształt X_test:  (16000, 162)
Kolumny w obu zbiorach są zgodne.
Procent klasy pozytywnej (awaria) w Train: 1.67%
Procent klasy pozytywnej (awaria) w Test:  2.34%


## Saving datasets

In [115]:
# os.makedirs('data/processed', exist_ok=True)

# # Saving training dataset
# X_train_final.to_csv('data/processed/X_train.csv', index=False)
# y_train.to_csv('data/processed/y_train.csv', index=False)

# # Saving test dataset
# X_test_final.to_csv('data/processed/X_test.csv', index=False)
# y_test.to_csv('data/processed/y_test.csv', index=False)