In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [21]:

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from boruta import BorutaPy

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress every ConvergenceWarning raised from here on.
# NOTE(review): this also hides solvers that stop before converging (e.g. on
# unscaled features) — confirm that silencing them is intended.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## Data Loading


In [22]:
# Read the SECOM data file, splitting on single spaces. The file is read
# without a header row, so the columns keep pandas' default integer labels.
X = pd.read_csv('../source_data/secom/secom.data', header=None, sep=' ')
# One readable name per column of X: feature1, feature2, ...
feature_names = ['feature' + str(col_no) for col_no in range(1, X.shape[1] + 1)]
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [23]:
# Second read of the same file, this time splitting on any run of whitespace,
# with the columns renamed to feature1..featureN.
X_read = pd.read_csv('../source_data/secom/secom.data', header=None, sep='\\s+')
X_read.columns = ['feature' + str(col_no) for col_no in range(1, X_read.shape[1] + 1)]

In [24]:
# Dimensions of the raw feature table as (rows, columns).
X.shape

(1567, 590)

In [28]:
# Read the labels file (no header row) and name its two columns.
y_read = pd.read_csv(
    '../source_data/secom/secom_labels.data',
    header=None,
    sep='\\s+',
)
y_read.columns = ['label', 'date_time']
y_read.head()

Unnamed: 0,label,date_time
0,-1,19/07/2008 11:55:00
1,-1,19/07/2008 12:32:00
2,1,19/07/2008 13:17:00
3,-1,19/07/2008 14:43:00
4,-1,19/07/2008 15:22:00


In [29]:
# Dimensions of the label table as (rows, columns).
y_read.shape

(1567, 2)

In [27]:
# Column-wise join of the raw feature table with the label column so both can
# be inspected side by side (aligned on the row index).
label_frame = y_read[['label']]
df = pd.concat([X, label_frame], axis=1)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,581,582,583,584,585,586,587,588,589,label
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,0.5005,0.0118,0.0035,2.363,,,,,0
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,0
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,0
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,0


## Train-Test Split

In [30]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_read,
    y_read['label'],
    stratify=y_read['label'],
    test_size=0.2,
    random_state=42,
)

## Data Preprocessing


In [32]:
# Recode the labels from {-1, 1} to {0, 1}: only -1 has to change, 1 stays 1.
# Re-running this cell is harmless because no -1 remains afterwards.
y_train = y_train.replace(-1, 0)
y_test = y_test.replace(-1, 0)

In [33]:
def drop_zero_variance_columns(df):
    """Return ``df`` without the columns that carry no variation.

    A column is kept only when it holds at least two distinct non-missing
    values. Counting distinct values instead of testing ``df.var() != 0``
    also removes columns whose variance is NaN (all values missing, or a
    single observation) and constant float columns whose computed variance
    is a tiny non-zero number because of rounding.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The same rows, restricted to the informative columns.
    """
    # Missing values are excluded from the count.
    distinct_counts = df.nunique(dropna=True)
    return df.loc[:, distinct_counts > 1]

def drop_feature_with_missing_values(df, threshold=0.45):
    """Remove the columns whose share of missing values exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    threshold : float, default 0.45
        Largest tolerated fraction of missing values. A column is dropped
        only when its fraction is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the overly sparse columns.
    """
    null_share = df.isnull().mean()
    too_sparse = null_share.index[null_share > threshold]
    return df.drop(columns=too_sparse)

def cap_with_3s(df, threshold=3):
    """Clip each column to its own mean ± ``threshold`` standard deviations.

    The mean and standard deviation are computed per column from ``df``
    itself, so the bounds reflect whatever data is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to cap.
    threshold : float, default 3
        Number of standard deviations allowed on either side of the mean.

    Returns
    -------
    pandas.DataFrame
        A frame of the same shape with out-of-range values pulled to the bound.
    """
    def _clip_column(column):
        centre = column.mean()
        half_width = threshold * column.std()
        return np.clip(column, centre - half_width, centre + half_width)

    return df.apply(_clip_column)

def impute_missing_values(df, strategy='median'):
    """Fill the missing values of ``df`` with a freshly fitted imputer.

    The imputer is fitted on ``df`` itself, so the fill values come from the
    data that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute.
    strategy : {'median', 'knn', 'mice'}, default 'median'
        'median' uses ``SimpleImputer(strategy='median')``, 'knn' uses
        ``KNNImputer()`` and 'mice' uses ``IterativeImputer()``.

    Returns
    -------
    pandas.DataFrame
        Imputed values with the same column labels and the same row index as
        ``df``.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names.
    """
    if strategy == 'median':
        imputer = SimpleImputer(strategy='median')
    elif strategy == 'knn':
        imputer = KNNImputer()
    elif strategy == 'mice':
        imputer = IterativeImputer()
    else:
        raise ValueError("Invalid imputation strategy.")

    filled = imputer.fit_transform(df)
    # Keep the caller's row index. A fresh RangeIndex would no longer line up
    # with label Series that still carry the shuffled index of a split.
    return pd.DataFrame(filled, columns=df.columns, index=df.index)

def scale_features(df, scaler='minmax'):
    """Rescale every column of ``df`` with a freshly fitted scaler.

    The scaler is fitted on ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to scale.
    scaler : {'standard', 'minmax', 'robust'}, default 'minmax'
        'standard' uses ``StandardScaler()``, 'minmax' uses ``MinMaxScaler()``
        and 'robust' uses ``RobustScaler()``.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and the same row index as
        ``df``.

    Raises
    ------
    ValueError
        If ``scaler`` is not one of the supported names.
    """
    # A separate local keeps the ``scaler`` argument (a name) distinct from
    # the estimator object built from it.
    if scaler == 'standard':
        transformer = StandardScaler()
    elif scaler == 'minmax':
        transformer = MinMaxScaler()
    elif scaler == 'robust':
        transformer = RobustScaler()
    else:
        raise ValueError("Invalid scaler.")

    scaled = transformer.fit_transform(df)
    # Keep the caller's row index so the result stays aligned with its labels.
    return pd.DataFrame(scaled, columns=df.columns, index=df.index)

def feature_selection(X, y, method='boruta', random_state=42):
    """Return the subset of ``X``'s columns chosen by the requested selector.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. The selectors are fitted on it directly, so it must
        not contain values they cannot handle.
    y : pandas.Series
        Target values, one per row of ``X``.
    method : {'boruta', 'rfe', 'lasso'}, default 'boruta'
        'boruta' runs BorutaPy around a 100-tree random forest. 'rfe' runs
        recursive feature elimination with logistic regression and keeps
        ``min(40, n_columns // 2)`` features. 'lasso' keeps the columns whose
        ``LassoCV`` coefficient is non-zero.
    random_state : int, default 42
        Seed handed to every estimator that is built here.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the selected columns.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'boruta':
        forest = RandomForestClassifier(n_estimators=100, random_state=random_state)
        selector = BorutaPy(estimator=forest, n_estimators='auto', random_state=random_state)
        # BorutaPy is given plain arrays, not pandas objects.
        selector.fit(X.values, y.values)
        return X.loc[:, selector.support_]

    if method == 'rfe':
        keep_count = min(40, X.shape[1] // 2)
        selector = RFE(
            estimator=LogisticRegression(max_iter=1000, random_state=random_state),
            n_features_to_select=keep_count,
        )
        selector.fit(X, y)
        return X.loc[:, selector.support_]

    if method == 'lasso':
        lasso_model = LassoCV(cv=5, random_state=random_state)
        lasso_model.fit(X, y)
        nonzero_mask = lasso_model.coef_ != 0
        return X[X.columns[nonzero_mask]]

    raise ValueError("Invalid feature selection method. Choose from 'boruta', 'rfe', or 'lasso'.")

def oversampling(X, y, strategy='smote', random_state=42):
    """Resample ``(X, y)`` with the requested over-sampler.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature table.
    y : array-like or pandas.Series
        Class labels, one per row of ``X``.
    strategy : {'smote', 'adasyn', 'rose'}, default 'smote'
        'smote' uses ``SMOTE``, 'adasyn' uses ``ADASYN`` and 'rose' uses
        ``RandomOverSampler``.
    random_state : int or None, default 42
        Seed handed to the sampler so repeated runs give the same resampled
        set. Pass ``None`` for an unseeded sampler.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names.
    """
    if strategy == 'smote':
        sampler = SMOTE(random_state=random_state)
    elif strategy == 'adasyn':
        sampler = ADASYN(random_state=random_state)
    elif strategy == 'rose':
        sampler = RandomOverSampler(random_state=random_state)
    else:
        raise ValueError("Invalid oversampling strategy.")
    return sampler.fit_resample(X, y)

In [34]:
# Define parameters
imputation_strategies = ['median', 'knn', 'mice']
feature_selection_methods = ['rfe', 'boruta', 'lasso']
oversampling_strategies = ['smote', 'adasyn', 'rose']
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'RandomForest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'SVM': SVC(class_weight='balanced', probability=True, random_state=42),
    'NaiveBayes': GaussianNB(),
    'LightGBM': LGBMClassifier(class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Outlier cap: values are clipped to mean ± CAP_SIGMAS * std of the training data.
CAP_SIGMAS = 3
# Fixed column layout so results_df is usable even when no pipeline completes.
RESULT_COLUMNS = [
    'Pipeline', 'F1', 'Precision', 'Recall',
    'Num Features Selected', 'Execution Time (s)',
]


def _make_imputer(strategy):
    """Return an unfitted imputer for 'median', 'knn' or 'mice'."""
    if strategy == 'median':
        return SimpleImputer(strategy='median')
    if strategy == 'knn':
        return KNNImputer()
    if strategy == 'mice':
        return IterativeImputer()
    raise ValueError("Invalid imputation strategy.")


results = []

# NOTE(review): scale_features is defined in this notebook but never applied,
# so LogisticRegression, SVC and the RFE selector see unscaled features —
# confirm whether that is intended.

# Each stage runs once per combination of the settings it depends on, not
# once per model. Its wall-clock time is added to every pipeline that reuses
# it, so 'Execution Time (s)' still approximates the end-to-end cost.

# Stage 0: column filters look only at the training data.
base_start = time.time()
X_train_base = drop_feature_with_missing_values(drop_zero_variance_columns(X_train))
X_test_base = X_test[X_train_base.columns]
base_time = time.time() - base_start

for impute_method in imputation_strategies:
    # Stage 1: imputation and capping depend only on the imputation strategy.
    # Both are fitted on the training rows and then applied unchanged to the
    # test rows, so the test set never influences its own preprocessing.
    impute_start = time.time()
    imputer = _make_imputer(impute_method)
    X_train_imp = pd.DataFrame(
        imputer.fit_transform(X_train_base),
        columns=X_train_base.columns,
        index=X_train_base.index,
    )
    X_test_imp = pd.DataFrame(
        imputer.transform(X_test_base),
        columns=X_test_base.columns,
        index=X_test_base.index,
    )

    train_centre = X_train_imp.mean()
    train_spread = CAP_SIGMAS * X_train_imp.std()
    cap_lower = train_centre - train_spread
    cap_upper = train_centre + train_spread
    # axis=1 aligns the per-column bounds with the frame's columns.
    X_train_p = X_train_imp.clip(lower=cap_lower, upper=cap_upper, axis=1)
    X_test_p = X_test_imp.clip(lower=cap_lower, upper=cap_upper, axis=1)
    impute_time = time.time() - impute_start

    for fs_method in feature_selection_methods:
        # Stage 2: feature selection depends on (imputation, selector).
        fs_start = time.time()
        try:
            X_train_sel = feature_selection(X_train_p, y_train, method=fs_method)
        except Exception as e:
            print(f"Feature selection failed for method {fs_method}: {e}")
            continue
        fs_time = time.time() - fs_start

        if X_train_sel.shape[1] == 0:
            # Nothing to train on; say so explicitly rather than failing later.
            print(f"Feature selection with {fs_method} kept no features; skipping.")
            continue

        X_test_sel = X_test_p[X_train_sel.columns]

        for oversample_method in oversampling_strategies:
            # Stage 3: one resampled training set shared by all models, so
            # they are compared on identical data.
            os_start = time.time()
            try:
                X_res, y_res = oversampling(X_train_sel, y_train, strategy=oversample_method)
            except Exception as e:
                print(f"Oversampling failed for method {oversample_method}: {e}")
                continue
            os_time = time.time() - os_start

            shared_time = base_time + impute_time + fs_time + os_time

            for model_name, model in models.items():
                # Stage 4: fit and score a single model.
                model_start = time.time()
                try:
                    model.fit(X_res, y_res)
                except Exception as e:
                    print(f"Model training failed for {model_name}: {e}")
                    continue

                # Predict & evaluate
                preds = model.predict(X_test_sel)

                f1 = f1_score(y_test, preds, pos_label=1, zero_division=0)
                precision = precision_score(y_test, preds, pos_label=1, zero_division=0)
                recall = recall_score(y_test, preds, pos_label=1, zero_division=0)
                exec_time = shared_time + (time.time() - model_start)

                pipeline_desc = f"{impute_method}_{fs_method}_{oversample_method}_{model_name}"

                results.append({
                    'Pipeline': pipeline_desc,
                    'F1': f1,
                    'Precision': precision,
                    'Recall': recall,
                    'Num Features Selected': X_train_sel.shape[1],
                    'Execution Time (s)': exec_time
                })
                print(f"Completed: {pipeline_desc} | F1: {f1:.4f} | Precision: {precision:.4f} | Time: {exec_time:.2f}s")

# Create results DataFrame
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

KeyboardInterrupt: 

In [None]:
# Rank the pipelines by F1 (best first) and keep the ten highest-scoring rows.
top_f1 = (
    results_df
    .sort_values(by='F1', ascending=False)
    .head(10)
    .reset_index(drop=True)
)
print("Top 10 Pipelines by F1 Score:")
print(top_f1)


In [None]:
# Top 10 by Precision: rank the pipelines by precision (best first) and keep
# the ten highest-scoring rows.
top_precision = results_df.sort_values(by='Precision', ascending=False).head(10).reset_index(drop=True)
# A real newline escape, so a blank line is printed before the heading
# instead of a literal backslash-n.
print("\nTop 10 Pipelines by Precision Score:")
print(top_precision)