<div style="text-align: center; margin: 30px 0;">
  <span style="
    font-family: 'Playfair Display', serif;
    background-color:rgb(242, 64, 10);
    color: white;
    font-size: 1.5em;
    font-weight: 600;
    font-style: italic;
    padding: 12px 24px;
    border-radius: 12px;
    display: inline-block;
    box-shadow: 0 4px 12px rgba(18, 17, 17, 0.15);
    transition: transform 0.2s ease, box-shadow 0.2s ease;
    cursor: default;
  " 
  onmouseover="this.style.transform='scale(1.03)'; this.style.boxShadow='0 6px 16px rgba(0,0,0,0.2)'"
  onmouseout="this.style.transform='scale(1)'; this.style.boxShadow='0 4px 12px rgba(0,0,0,0.15)'"
  >
   Feature Engineering 
  </span>
</div>
<a id="import-data"></a>


In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy  as sc
import math
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [45]:
# Load the cleaned training split; the path is relative to the notebook's working directory.
df = pd.read_csv(r"../data/cleaned_train.csv")

In [46]:
# Split the loaded frame into the feature matrix (every column except 'Class')
# and the 'Class' target series.
X_train = df.drop(columns=['Class'])
y_train = df['Class']

In [47]:
# =============================================================================
# ANOMALY DETECTION FEATURES
# =============================================================================

In [48]:
def fit_anomaly_detector(X_train, contamination=0.1, random_state=42):
    """
    Train an Isolation Forest on the numeric columns of the training frame.

    Fitting is best-effort: any exception is reported through a printed
    warning and the returned dict carries ``'model': None`` instead of raising.

    Args:
        X_train: Training data (pandas DataFrame)
        contamination: Expected proportion of anomalies, forwarded to
            IsolationForest (default: 0.1)
        random_state: Random seed forwarded to IsolationForest

    Returns:
        dict: ``'model'`` (the fitted IsolationForest, or None on failure) and
        ``'numerical_cols'`` (names of the numeric columns used for fitting)
    """
    numerical_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

    # Start from the failure-shaped result and fill in the model on success.
    outcome = {'model': None, 'numerical_cols': numerical_cols}

    try:
        detector = IsolationForest(
            n_jobs=-1,
            random_state=random_state,
            contamination=contamination,
        )
        detector.fit(X_train[numerical_cols])
    except Exception as e:
        print(f"Warning in anomaly detector fitting: {e}")
        return outcome

    outcome['model'] = detector
    return outcome



In [49]:
def create_anomaly_features(X, anomaly_model_dict):
    """
    Derive anomaly features from a previously fitted Isolation Forest.

    Features generated:
    - isolation_forest_score: value of the model's decision_function
      (lower = more anomalous)
    - is_anomaly: 1 where the model predicts -1 (anomaly), otherwise 0

    If the dict holds no model, or anything fails while scoring, a warning is
    printed (failure case only) and a frame with no columns is returned.

    Args:
        X: Input data (pandas DataFrame)
        anomaly_model_dict: Dictionary from fit_anomaly_detector()

    Returns:
        pandas DataFrame: Anomaly features, indexed like X
    """
    features = pd.DataFrame(index=X.index)

    try:
        detector = anomaly_model_dict['model']
        if detector is None:
            # Fitting failed upstream; nothing to score with.
            return features

        columns_used = anomaly_model_dict['numerical_cols']

        # Score and classify before touching `features`, so a failure in
        # either call leaves the output without columns.
        raw_scores = detector.decision_function(X[columns_used])
        raw_labels = detector.predict(X[columns_used])

        features['isolation_forest_score'] = raw_scores
        features['is_anomaly'] = (raw_labels == -1).astype(int)

    except Exception as e:
        print(f"Warning in anomaly feature creation: {e}")

    return features


In [50]:

# =============================================================================
# CLUSTERING FEATURES  
# =============================================================================

In [51]:
def fit_clustering_model(X_train, n_clusters=8, random_state=42):
    """
    Standardize the numeric training columns and fit K-Means on them.

    Fitting is best-effort: any exception is reported through a printed
    warning and the returned dict carries None for both fitted objects.

    Args:
        X_train: Training data (pandas DataFrame)
        n_clusters: Number of clusters (default: 8)
        random_state: Random seed forwarded to KMeans

    Returns:
        dict: ``'scaler'`` (fitted StandardScaler or None), ``'kmeans'``
        (fitted KMeans or None), ``'numerical_cols'`` (numeric column names
        used for fitting) and ``'n_clusters'``
    """
    numerical_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

    # Failure-shaped result; the fitted objects are filled in only on success.
    bundle = {
        'scaler': None,
        'kmeans': None,
        'numerical_cols': numerical_cols,
        'n_clusters': n_clusters,
    }

    try:
        standardizer = StandardScaler()
        standardized = standardizer.fit_transform(X_train[numerical_cols])

        clusterer = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
        clusterer.fit(standardized)
    except Exception as e:
        print(f"Warning in clustering model fitting: {e}")
    else:
        bundle.update(scaler=standardizer, kmeans=clusterer)

    return bundle


In [52]:
def create_clustering_features(X, clustering_model_dict):
    """
    Derive cluster-membership and distance features from a fitted K-Means.

    Features generated:
    - cluster_id: Cluster predicted by the model for each row
    - dist_to_cluster_i: Distance (scipy ``cdist`` default metric) from the
      scaled row to cluster center i, for i in 0..n_clusters-1
    - min_cluster_distance: Smallest of those distances

    If the scaler or the K-Means model is None, a frame with no columns is
    returned. Any exception is reported through a printed warning and whatever
    columns were assigned before the failure are returned.

    Args:
        X: Input data (pandas DataFrame)
        clustering_model_dict: Dictionary from fit_clustering_model()

    Returns:
        pandas DataFrame: Clustering features, indexed like X
    """
    features = pd.DataFrame(index=X.index)

    try:
        # Guard clauses: both fitted objects are required.
        if clustering_model_dict['scaler'] is None:
            return features
        if clustering_model_dict['kmeans'] is None:
            return features

        columns_used = clustering_model_dict['numerical_cols']
        standardizer = clustering_model_dict['scaler']
        clusterer = clustering_model_dict['kmeans']
        center_count = clustering_model_dict['n_clusters']

        # Apply the training-time scaling before assigning / measuring.
        standardized = standardizer.transform(X[columns_used])
        assigned = clusterer.predict(standardized)
        center_distances = cdist(standardized, clusterer.cluster_centers_)

        features['cluster_id'] = assigned

        for center_idx in range(center_count):
            column_name = f'dist_to_cluster_{center_idx}'
            features[column_name] = center_distances[:, center_idx]

        features['min_cluster_distance'] = center_distances.min(axis=1)

    except Exception as e:
        print(f"Warning in clustering feature creation: {e}")

    return features

In [53]:
# =============================================================================
# STATISTICAL FEATURES
# =============================================================================

In [54]:
def create_statistical_features(X):
    """
    Creates statistical features computed across numerical columns for each row.

    Features generated:
    - row_mean: Mean value across numerical columns
    - row_std: Standard deviation across numerical columns (numpy default, ddof=0)
    - row_min: Minimum value across numerical columns
    - row_max: Maximum value across numerical columns
    - row_median: Median value across numerical columns
    - row_cv: Coefficient of variation (std/mean), with the mean pushed away
      from zero by 1e-8 in the direction of its own sign
    - row_skew: Skewness across numerical columns (scipy.stats.skew defaults)
    - row_kurtosis: Kurtosis across numerical columns (scipy.stats.kurtosis defaults)

    The feature set is built atomically: if any statistic fails to compute, a
    warning is printed and a frame with no columns is returned, so callers
    never receive a partial set of columns.

    Args:
        X: Input data (pandas DataFrame)

    Returns:
        pandas DataFrame: Statistical features, indexed like X
    """
    numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    features = pd.DataFrame(index=X.index)

    try:
        data_matrix = X[numerical_cols].values

        # Mean and std feed both their own columns and row_cv; compute once.
        row_means = np.mean(data_matrix, axis=1)
        row_stds = np.std(data_matrix, axis=1)

        # Sign-aware stabiliser: adding +eps to a negative mean moves it
        # *towards* zero (a mean of -1e-8 would become exactly 0), so the
        # epsilon is applied in the direction of the mean's sign instead.
        # Non-negative means keep the original `mean + 1e-8` denominator.
        eps = 1e-8
        cv_denominator = np.where(row_means < 0, row_means - eps, row_means + eps)

        computed = {
            # Basic row-wise statistics
            'row_mean': row_means,
            'row_std': row_stds,
            'row_min': np.min(data_matrix, axis=1),
            'row_max': np.max(data_matrix, axis=1),
            'row_median': np.median(data_matrix, axis=1),
            # Coefficient of variation
            'row_cv': row_stds / cv_denominator,
            # Shape statistics
            'row_skew': stats.skew(data_matrix, axis=1),
            'row_kurtosis': stats.kurtosis(data_matrix, axis=1),
        }

        # Materialise only after every statistic succeeded (all-or-nothing).
        features = pd.DataFrame(computed, index=X.index)

    except Exception as e:
        print(f"Warning in statistical feature creation: {e}")

    return features



In [55]:
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================


In [56]:

def clean_features(df):
    """
    Return a copy of the frame with infinities / missing values zeroed and
    every column coerced to a numeric dtype.

    Args:
        df: Input DataFrame (not modified)

    Returns:
        pandas DataFrame: Cleaned DataFrame
    """
    # +/-inf become NaN first so that a single fillna zeroes both cases.
    without_inf = df.replace([np.inf, -np.inf], np.nan)
    df_clean = without_inf.fillna(0)

    for name in df_clean.columns:
        if pd.api.types.is_numeric_dtype(df_clean[name]):
            continue
        # Non-numeric column: unparseable entries turn into NaN, then 0.
        coerced = pd.to_numeric(df_clean[name], errors='coerce')
        df_clean[name] = coerced.fillna(0)

    return df_clean

In [57]:

# =============================================================================
# COMPREHENSIVE PIPELINE FUNCTIONS
# =============================================================================


In [58]:

def fit_all_feature_extractors(X_train, 
                              anomaly_params=None, 
                              clustering_params=None):
    """
    Fit every trainable feature extractor on the training data.

    Args:
        X_train: Training data (pandas DataFrame)
        anomaly_params: Keyword arguments for fit_anomaly_detector (dict);
            None or an empty dict means "use that function's defaults"
        clustering_params: Keyword arguments for fit_clustering_model (dict);
            None or an empty dict means "use that function's defaults"

    Returns:
        dict: ``'anomaly'`` -> result of fit_anomaly_detector,
        ``'clustering'`` -> result of fit_clustering_model
    """
    anomaly_kwargs = anomaly_params if anomaly_params else {}
    clustering_kwargs = clustering_params if clustering_params else {}

    # The dict literal evaluates in order: anomaly detector first, then clustering.
    return {
        'anomaly': fit_anomaly_detector(X_train, **anomaly_kwargs),
        'clustering': fit_clustering_model(X_train, **clustering_kwargs),
    }


In [59]:

def transform_with_all_features(X, fitted_models):
    """
    Append every generated feature group to the input data and clean the result.

    Args:
        X: Input data (pandas DataFrame)
        fitted_models: Dictionary from fit_all_feature_extractors()

    Returns:
        pandas DataFrame: Original data + anomaly, clustering and statistical
        features, passed through clean_features()
    """
    # Each generator returns a frame indexed like X, so a column-wise concat
    # lines the groups up next to the original columns.
    generated_groups = [
        create_anomaly_features(X, fitted_models['anomaly']),
        create_clustering_features(X, fitted_models['clustering']),
        create_statistical_features(X),
    ]

    combined = pd.concat([X.copy(), *generated_groups], axis=1)

    return clean_features(combined)

In [60]:


def fit_transform_features(X_train, **kwargs):
    """
    Fit all feature extractors on the training data and transform that same
    data in a single call.

    Args:
        X_train: Training data (pandas DataFrame)
        **kwargs: Forwarded unchanged to fit_all_feature_extractors()

    Returns:
        tuple: (transformed_features, fitted_models)
    """
    extractors = fit_all_feature_extractors(X_train, **kwargs)
    enhanced = transform_with_all_features(X_train, extractors)
    return enhanced, extractors

In [61]:


def create_enhanced_dataset(df, target_column='Class', **kwargs):
    """
    Complete workflow: takes full DataFrame, returns enhanced training DataFrame with target.

    Args:
        df: Complete DataFrame with target column (pandas DataFrame)
        target_column: Name of target column (default: 'Class')
        **kwargs: Parameters for feature extractors, forwarded to
            fit_transform_features()

    Returns:
        tuple: (enhanced_train_df_with_target, fitted_models, transform_test_data)
        - enhanced_train_df_with_target: Complete DataFrame with original + new features + target
        - fitted_models: Fitted models for transforming test data
        - transform_test_data: Function to apply same transformations to test data

    Raises:
        KeyError: If target_column is not a column of df.
    """
    # Separate features and target
    X_train = df.drop(columns=[target_column])
    y_train = df[target_column]

    # Fit and transform features
    X_train_enhanced, fitted_models = fit_transform_features(X_train, **kwargs)

    # Combine enhanced features with target (assignment aligns on the index)
    enhanced_train_df = X_train_enhanced.copy()
    enhanced_train_df[target_column] = y_train

    # Closure over the fitted models so test data goes through exactly the
    # same transformations without refitting.
    def transform_test_data(X_test):
        """Transform test data using fitted models"""
        return transform_with_all_features(X_test, fitted_models)

    # Fixed: previously returned the undefined name `transform_test_dat`,
    # which raised NameError on every call.
    return enhanced_train_df, fitted_models, transform_test_data

In [65]:

print("Method 1: Separate fit and transform")

# Fit on training data
fitted_models = fit_all_feature_extractors(X_train)

# Transform training data
X_train_enhanced = transform_with_all_features(X_train, fitted_models)

print(f"Original training shape: {X_train.shape}")
print(f"Enhanced training shape: {X_train_enhanced.shape}")

# Per-group shape report. Reuse the extractors fitted above instead of fitting
# IsolationForest and KMeans a second time: fit_all_feature_extractors() called
# the same fit functions with the same defaults (including random_state), so
# refitting only repeats the expensive work.
anomaly_model = fitted_models['anomaly']
clustering_model = fitted_models['clustering']

anomaly_features = create_anomaly_features(X_train, anomaly_model)
clustering_features = create_clustering_features(X_train, clustering_model)
statistical_features = create_statistical_features(X_train)
print(f"Anomaly features: {anomaly_features.shape}")
print(f"Clustering features: {clustering_features.shape}")
print(f"Statistical features: {statistical_features.shape}")


Method 1: Separate fit and transform
Original training shape: (170436, 30)
Enhanced training shape: (170436, 50)
Anomaly features: (170436, 2)
Clustering features: (170436, 10)
Statistical features: (170436, 8)


In [63]:
# Re-attach the target to the enhanced feature matrix; the column-wise concat
# aligns rows on the index.
df_eda = pd.concat([X_train_enhanced, y_train], axis=1)

In [64]:
# Persist the combined frame without the index column.
# NOTE(review): presumably consumed by a later EDA notebook — confirm.
df_eda.to_csv(r"../data/df_eda.csv", index=False)