# Scania Failure Model - Data Cleaning & Preprocessing Pipeline

## 0. Imports and configuration 
Importing necessary libraries for data manipulation and pipelines.

In [3]:
import sys
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [4]:
from src.data.preprocess import (
    load_aps_data, 
    encode_target, 
    get_cols_with_missing_threshold, 
    get_imputation_pipeline,
    get_scaling_pipeline,
    check_split_integrity
)
from src.feature_engineering import (
    drop_high_correlated_features, 
    drop_low_variance_features,
    select_features_lasso,
    apply_pca,
    select_features_mutual_info,
    select_features_rfe,
    benchmark_selection
)

# Relative paths of the raw training and test datasets (passed to load_aps_data below).
# NOTE(review): presumably resolved against the notebook's working directory,
# i.e. the notebook is expected to run from the project root — confirm.
TRAIN_PATH = 'data/raw/aps_failure_training_set.csv'
TEST_PATH = 'data/raw/aps_failure_test_set.csv'

## 1. Data Loading and Manipulation

In [5]:
# Name of the label column; every other column is treated as a feature
target_col = 'class'

# Read the raw training and test sets
df_train = load_aps_data(TRAIN_PATH)
df_test = load_aps_data(TEST_PATH)

# Split each set into its feature matrix (X) and label vector (y)
X_train_raw, y_train_raw = df_train.drop(target_col, axis="columns"), df_train.loc[:, target_col]
X_test_raw, y_test_raw = df_test.drop(target_col, axis="columns"), df_test.loc[:, target_col]

# Sanity check: report the (rows, features) shape of both raw matrices
print(f"Wymiary surowe Train: {X_train_raw.shape}")
print(f"Wymiary surowe Test:  {X_test_raw.shape}")

Wymiary surowe Train: (60000, 170)
Wymiary surowe Test:  (16000, 170)


In [6]:
# Convert the textual labels ('neg'/'pos') of both sets into numeric 0/1
y_train, y_test = encode_target(y_train_raw), encode_target(y_test_raw)

# Confirm the encoding and show the class balance of the training labels
print("Target zakodowany (0/1).")
print(y_train.value_counts())

Target zakodowany (0/1).
class
0    59000
1     1000
Name: count, dtype: int64


## 2. Handling missing values
Dropping columns with more than 50% missing values, then imputing the remaining missing values using the median strategy.

In [7]:
# The columns to discard are chosen from the training set only (threshold=0.5)
cols_to_drop = get_cols_with_missing_threshold(X_train_raw, threshold=0.5)

# Report how many columns were selected and preview up to ten of them
print(f"Liczba kolumn do usunięcia (>50% NaN): {len(cols_to_drop)}")
print(f"Przykładowe usuwane kolumny: {cols_to_drop[:10]}")

# Remove the same columns from both sets so the model sees identical inputs
X_train_dropped = X_train_raw.drop(cols_to_drop, axis="columns")
X_test_dropped = X_test_raw.drop(cols_to_drop, axis="columns")

print(f"Wymiary po usunięciu kolumn: {X_train_dropped.shape}")

Liczba kolumn do usunięcia (>50% NaN): 8
Przykładowe usuwane kolumny: ['ab_000', 'bm_000', 'bn_000', 'bo_000', 'bp_000', 'bq_000', 'br_000', 'cr_000']
Wymiary po usunięciu kolumn: (60000, 162)


In [8]:
# Build the imputation pipeline (median strategy) and fit it on the training
# data only, so no statistics are taken from the test set.
imputer_pipeline = get_imputation_pipeline(strategy='median')
imputer_pipeline.fit(X_train_dropped)

# Training set: fill the gaps, then wrap the result in a DataFrame again
# to re-attach the column labels.
X_train_imputed = imputer_pipeline.transform(X_train_dropped)
X_train_to_filter = pd.DataFrame(data=X_train_imputed, columns=X_train_dropped.columns)

# Test set: same fitted pipeline, same re-labelling.
X_test_imputed = imputer_pipeline.transform(X_test_dropped)
X_test_to_filter = pd.DataFrame(data=X_test_imputed, columns=X_test_dropped.columns)


## 3. Feature Selection and Dimensionality Reduction

### Removing low-variance, highly correlated features

In [9]:
# Step 1: drop low-variance features (threshold=0.01), decided on the training
# set; the test set is then restricted to the surviving columns.
X_train_filtered = drop_low_variance_features(X_train_to_filter, threshold=0.01)
X_test_filtered = X_test_to_filter.loc[:, X_train_filtered.columns]

# Step 2: drop highly correlated features (threshold=0.9), again decided on
# the training set and mirrored onto the test set.
X_train_reduced = drop_high_correlated_features(X_train_filtered, threshold=0.9)
X_test_reduced = X_test_filtered.loc[:, X_train_reduced.columns]

# Report the feature count before and after both filters
print(f"Kształt przed wyborem: {X_train_to_filter.shape[1]}")
print(f"Kształt po wyborze: {X_train_reduced.shape[1]}")

Kształt przed wyborem: 162
Kształt po wyborze: 123


### Data standardization

In [10]:
# Build the standardization pipeline and fit it on the training data only.
# This has to happen after the missing values were handled.
scaler_pipeline = get_scaling_pipeline()
scaler_pipeline.fit(X_train_reduced)

# Training set: scale, then wrap the result in a DataFrame again
# to re-attach the column labels.
X_train_scaled = scaler_pipeline.transform(X_train_reduced)
X_train_final = pd.DataFrame(data=X_train_scaled, columns=X_train_reduced.columns)

# Test set: same fitted scaler, same re-labelling.
X_test_scaled = scaler_pipeline.transform(X_test_reduced)
X_test_final = pd.DataFrame(data=X_test_scaled, columns=X_test_reduced.columns)


# Validation: after standardization every training feature should have
# mean ~ 0 and std ~ 1. Each offending column is reported; the success
# message is printed only when no column at all is out of tolerance.
#
# The flag starts as False and is only ever raised. Resetting it inside the
# loop would let the last column alone decide the outcome and hide problems
# in earlier columns; it would also be undefined for a frame with no columns.
#
# NOTE(review): pandas' .std() defaults to the sample estimate (ddof=1); if the
# scaler normalizes with the population std, a tiny offset from 1 is expected
# — confirm the tolerance below is meant to absorb it.
tolerance = 0.0001
error_found = False
for i in range(X_train_final.shape[1]):
    # Compute each statistic once per column and reuse it for check and report
    column_mean = X_train_final.iloc[:, i].mean()
    column_std = X_train_final.iloc[:, i].std()
    if abs(column_mean) > tolerance or abs(column_std - 1) > tolerance:
        print(f"    UWAGA: w {X_train_final.columns[i]}")
        print(f"    Średnia wyszła: {column_mean:.6f} (oczekiwano ~0)")
        print(f"    Odchylenie wyszło: {column_std:.6f} (oczekiwano ~1)")
        error_found = True
if not error_found:
    print("Standaryzacja przebiegła pomyślnie - wszystkie features mają średnią bliską 0 i odchylenie standardowe bliskie 1.")

Standaryzacja przebiegła pomyślnie - wszystkie features mają średnią bliską 0 i odchylenie standardowe bliskie 1.


### Feature selection methods 
Zastosowałam po jednej metodzie z każdego z trzech typów: filter (mutual information), wrapper (RFE) i embedded (regularyzacja L1 – Lasso). Tworzę podstawowy model, aby sprawdzić, która z metod sprawdza się najlepiej.

Uzyskałam następujące wyniki:

|Method | Feature Count | F1 Score (Mean) | F1 Score (Std) |
| ---  | --- | --- | --- | 
|Lasso Selection (embedded technique) |48|0.80019| 0.014573 |
|RFE (wrapped technique)    |         50     |    0.793696   | 0.012675  |
|All Scaled Features        |    123    |     0.782085   | 0.009122|
|Mutual Information (filter technique)        |     50   |      0.763118    |  0.010872|

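Z tego powodu jako metodę selekcji cech (feature selection) wybrano Lasso. Uwaga: różnica względem RFE mieści się w granicach jednego odchylenia standardowego, więc przewaga Lasso nie jest jednoznaczna.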





In [11]:
# Embedded feature selection via L1 regularization (Lasso): the subset is
# chosen using the training data and labels, then the test set is restricted
# to the same columns.
X_train_lasso = select_features_lasso(X_train_final, y_train)
X_test_lasso = X_test_final.loc[:, X_train_lasso.columns]


In [12]:
# # Uncomment for benchmark - you will have to wait about 7 minutes

# X_train_RFE=select_features_rfe(X_train_final, y_train)


In [13]:
# X_train_mi=select_features_mutual_info(X_train_final, y_train)


In [14]:
## TO USE BENCHMARK UNCOMMENT TWO PREVIOUS CELLS 
# # Define baseline model
# base_rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# # Run benchmarks for different models
# results = []
# results.append(benchmark_selection(base_rf, X_train_final, y_train, "All Scaled Features"))
# results.append(benchmark_selection(base_rf, X_train_mi, y_train, "Mutual Information (filter technique)"))
# results.append(benchmark_selection(base_rf, X_train_RFE, y_train, "RFE (wrapped technique)"))
# results.append(benchmark_selection(base_rf, X_train_lasso, y_train, "Lasso Selection (embedded technique)"))

# # Comparison table
# comparison_df = pd.DataFrame(results).sort_values(by="F1 Score (Mean)", ascending=False)
# print(comparison_df)

### PCA

In [15]:
# Run PCA on the Lasso-selected features of both sets.
# n_components=0.95: retain 95% of the variance.
pca_result = apply_pca(X_train_lasso, X_test_lasso, n_components=0.95)
X_train_final_pca, X_test_final_pca, pca_model = pca_result

# Report the dimensionality before/after PCA and the variance actually explained
print(f"Features after Lasso: {X_train_lasso.shape[1]}")
print(f"Features after PCA:   {X_train_final_pca.shape[1]}")
print(f"Explained variance:   {pca_model.explained_variance_ratio_.sum():.2%}")


Features after Lasso: 48
Features after PCA:   36
Explained variance:   95.74%


In [16]:
# Final sanity check of the processed train/test split before saving.
# Per the cell output it reports both shapes, whether the columns of the two
# sets agree, and the share of the positive class in each set.
check_split_integrity(X_train_final_pca, X_test_final_pca, y_train, y_test)

Kształt X_train: (60000, 36)
Kształt X_test:  (16000, 36)
Kolumny w obu zbiorach są zgodne.
Procent klasy pozytywnej (awaria) w Train: 1.67%
Procent klasy pozytywnej (awaria) w Test:  2.34%


## 4. Saving datasets

In [None]:
# Make sure the output directory exists before anything is written
os.makedirs('data/processed', exist_ok=True)

# (object, destination) pairs: the two feature matrices first (train, test),
# then the two label vectors (train, test). All are written without the index.
processed_outputs = [
    (X_train_final_pca, 'data/processed/X_train.csv'),
    (X_test_final_pca, 'data/processed/X_test.csv'),
    (y_train, 'data/processed/y_train.csv'),
    (y_test, 'data/processed/y_test.csv'),
]
for dataset, destination in processed_outputs:
    dataset.to_csv(destination, index=False)