In [None]:
import pandas as pd

# 1.1 Read the training-set descriptor table from its CSV file
TRAIN_CSV = "DIA_trainingset_RDKit_descriptors.csv"
training_set = pd.read_csv(TRAIN_CSV)
training_set  # last expression -> rendered as the cell output

In [None]:
# Split the training table by position: column 0 is used as the label,
# columns 2 onward as features (column 1 is not used here).
Ytrain = training_set.iloc[:, 0]
Xtrain = training_set.iloc[:, 2:]

# Show how many samples carry each label value
train_label_counts = Ytrain.value_counts()
print(train_label_counts)

In [None]:
# 1.2 Read the test-set descriptor table from its CSV file
TEST_CSV = "DIA_testset_RDKit_descriptors.csv"
test_set = pd.read_csv(TEST_CSV)
test_set  # last expression -> rendered as the cell output

In [5]:
# Split the test table by position: column 0 is used as the label,
# columns 2 onward as features (column 1 is not used here).
Ytest = test_set.iloc[:, 0]
Xtest = test_set.iloc[:, 2:]

# Show how many samples carry each label value
test_label_counts = Ytest.value_counts()
print(test_label_counts)

Label
0    90
1    30
Name: count, dtype: int64


In [6]:
# 2. Feature preprocessing pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import numpy as np
import joblib

def preprocess_features(
    X_train,
    X_test,
    corr_threshold=0.9,
    scaler_path='scaler.pkl',
    train_output_path="X_train_processed2.csv",
    test_output_path="X_test_processed2.csv",
):
    """
    Preprocess features: standardization, zero-variance removal, correlation filtering.

    Every statistic (scaling parameters, variances, correlations) is computed on
    the training set only and then applied unchanged to the test set.

    Args:
        X_train (pd.DataFrame): Training features
        X_test (pd.DataFrame): Test features. Must contain every training column;
            columns are re-ordered to match X_train and extra columns are ignored.
        corr_threshold (float): A feature is dropped when its absolute Pearson
            correlation with any earlier-listed feature exceeds this value.
        scaler_path (str or None): Where the fitted StandardScaler is saved with
            joblib; None skips saving.
        train_output_path (str or None): CSV path for the processed training
            features; None skips saving.
        test_output_path (str or None): CSV path for the processed test
            features; None skips saving.

    Returns:
        X_train_processed, X_test_processed (pd.DataFrame)

    Raises:
        ValueError: If corr_threshold is outside [0, 1] or X_test lacks one or
            more training columns.
    """
    if not 0 <= corr_threshold <= 1:
        raise ValueError(
            f"corr_threshold must be within [0, 1], got {corr_threshold!r}"
        )

    # The scaler is fitted on the training column layout, so the test frame has
    # to present the same columns in the same order before it is transformed.
    if not X_test.columns.equals(X_train.columns):
        missing = X_train.columns.difference(X_test.columns)
        if len(missing) > 0:
            raise ValueError(
                f"X_test is missing {len(missing)} training feature(s), "
                f"e.g. {list(missing[:5])}"
            )
        X_test = X_test[X_train.columns]

    # 2.1. Check missing values (reported only; nothing is imputed here)
    print("\nMissing values:")
    print("Training set:", X_train.isnull().sum().sum())
    print("Test set:", X_test.isnull().sum().sum())

    # 2.2. Standardization (fit on train, apply to both)
    scaler = StandardScaler()
    X_train_std = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_test_std = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )
    if scaler_path is not None:
        joblib.dump(scaler, scaler_path)  # Save the scaler for reproducibility

    # 2.3. Remove zero-variance features
    selector = VarianceThreshold()
    selector.fit(X_train_std)
    # get_support() is the selector's own mask of features above the threshold
    keep_vars = X_train.columns[selector.get_support()].tolist()

    X_train_var = X_train_std[keep_vars]
    X_test_var = X_test_std[keep_vars]

    # 2.4. Remove highly correlated features
    corr_matrix = X_train_var.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and a
    # feature is never compared with itself; of a correlated pair, the feature
    # that appears later in column order is the one dropped.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    too_correlated = (upper > corr_threshold).any().to_numpy()
    drop_features = upper.columns[too_correlated].tolist()

    X_train_processed = X_train_var.drop(columns=drop_features)
    X_test_processed = X_test_var.drop(columns=drop_features)

    print(f"\nFeatures reduced from {X_train.shape[1]} to {X_train_processed.shape[1]}")

    # Save processed features
    if train_output_path is not None:
        X_train_processed.to_csv(train_output_path, index=False)
    if test_output_path is not None:
        X_test_processed.to_csv(test_output_path, index=False)

    return X_train_processed, X_test_processed


# Run the preprocessing pipeline on the raw train/test feature matrices
X_train_processed, X_test_processed = preprocess_features(
    X_train=Xtrain,
    X_test=Xtest,
)


Missing values:
Training set: 0
Test set: 0

Features reduced from 196 to 140
