# USG Failure Prediction - Feature Engineering

**Objective:** Transform raw data into predictive features

**Key Techniques:**
- Batch quality indicators
- Interaction features
- Supplier-based aggregates
- Anomaly scores
- Time-series features

In [None]:
# Import libraries
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import preprocessing pipeline
from preprocessing import USGPreprocessingPipeline

# Configure
# Seed NumPy's global random generator so NumPy-based randomness in this notebook is repeatable.
np.random.seed(42)
# 'seaborn-v0_8-*' style names exist in Matplotlib 3.6+; older releases use 'seaborn-darkgrid'.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic: draw figures inline in the notebook output (not valid in a plain .py script).
%matplotlib inline

# Timestamp the start of the run; the summary cell prints the completion time.
print(f"Feature Engineering started: {datetime.now()}")

## 1. Load Data

In [None]:
# Load raw data
DATA_PATH = '../data/raw/USG_Data_cleared.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Leave X and y as None: the later cells guard on them and skip their work.
    print(f"⚠ Data file not found at {DATA_PATH}")
    X = y = None
else:
    print(f"✓ Data loaded: {df.shape}")

    # Split the frame into the feature matrix (X) and the target column (y)
    if 'Warranty_Claim' not in df.columns:
        # Without a target, keep every column as a feature and mark y as absent
        print("⚠ Warning: Warranty_Claim column not found")
        X, y = df.copy(), None
    else:
        y = df['Warranty_Claim']
        X = df.drop(columns='Warranty_Claim')
        print(f"✓ Features: {X.shape[1]}, Target distribution:")
        print(y.value_counts())

## 2. Initialize Preprocessing Pipeline

In [None]:
if X is not None:
    # Build the project's preprocessing pipeline with a fixed seed
    preprocessor = USGPreprocessingPipeline(seed=42)

    # Static description of the pipeline stages; this text is hard-coded here,
    # it is not read back from the preprocessor object.
    banner = "="*60
    pipeline_components = (
        "  1. Data leakage removal",
        "  2. Batch feature engineering",
        "  3. Interaction feature creation",
        "  4. Supplier-based encoding",
        "  5. Time-series features",
        "  6. Categorical encoding",
        "  7. Anomaly score generation",
        "  8. Missing value imputation",
        "  9. Feature scaling",
    )

    print("Preprocessing Pipeline Initialized")
    print(banner)
    print("Pipeline Components:")
    for component in pipeline_components:
        print(component)
    print(banner)

## 3. Apply Feature Engineering

In [None]:
if X is not None and y is not None:
    # Fit the pipeline on the labelled data and transform it in a single call
    print("Applying feature engineering pipeline...\n")
    X_transformed = preprocessor.fit_transform(X, y)

    # Column counts before and after the pipeline
    n_original = X.shape[1]
    n_engineered = X_transformed.shape[1]

    print("\n✓ Transformation complete")
    print(f"  Original features: {n_original}")
    print(f"  Engineered features: {n_engineered}")
    # Plain difference of column counts (negative if more columns were dropped than added)
    print(f"  New features created: {n_engineered - n_original}")

    # Preview the first rows of the transformed frame
    print("\nSample of transformed data:")
    display(X_transformed.head())

## 4. Feature Analysis

In [None]:
if X is not None and 'X_transformed' in locals():
    # Column names as reported by the fitted preprocessor
    feature_names = preprocessor.get_feature_names()

    # A feature counts as "engineered" when its name contains any of these substrings
    engineered_markers = (
        '_x_', '_div_', 'Batch_Age', 'Batch_Failure_Rate',
        'Batch_Size', 'Failure_Rate', 'Anomaly', 'Serial_Position',
    )

    engineered_features = []
    for feature_name in feature_names:
        for marker in engineered_markers:
            if marker in feature_name:
                engineered_features.append(feature_name)
                break  # one matching marker is enough

    print(f"\nEngineered Features ({len(engineered_features)}):")
    print("-" * 60)
    for feature_name in engineered_features:
        print(f"  - {feature_name}")

In [None]:
if 'X_transformed' in locals() and 'engineered_features' in locals():
    import os

    # Visualize engineered features distribution on a 3x3 grid (at most 9 histograms).
    # Filter to names that are real columns *before* taking the first nine, so a
    # missing column never uses up a grid slot and leaves an empty framed axis.
    plot_features = [
        feat for feat in engineered_features if feat in X_transformed.columns
    ][:9]
    n_features = len(plot_features)

    if n_features > 0:
        fig, axes = plt.subplots(3, 3, figsize=(18, 12))
        axes = axes.ravel()

        for ax, feat in zip(axes, plot_features):
            # Drop NaNs so the histogram only bins observed values
            ax.hist(X_transformed[feat].dropna(), bins=50,
                    color='steelblue', edgecolor='black')
            ax.set_title(feat, fontweight='bold')
            ax.set_xlabel('Value')
            ax.set_ylabel('Frequency')

        # Hide the grid slots that received no histogram
        for ax in axes[n_features:]:
            ax.axis('off')

        plt.tight_layout()

        # savefig does not create missing directories; make sure the target exists
        output_path = '../reports/visualizations/engineered_features.png'
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.show()

## 5. Feature Correlation with Target

In [None]:
import os


def rank_features_by_target_correlation(features, target):
    """Return the absolute Pearson correlation of each feature with the target.

    Only numeric and boolean columns of ``features`` are considered; other
    columns are skipped instead of raising. Each column is correlated with
    ``target`` directly, so no feature-by-feature correlation matrix is built.

    Args:
        features: DataFrame of candidate feature columns.
        target: Numeric Series; rows are matched to ``features`` by index label.

    Returns:
        Series indexed by feature name holding ``|r|``, sorted in descending
        order (NaN correlations, e.g. for constant columns, sort last).
    """
    numeric_features = features.select_dtypes(include=['number', 'bool'])
    if numeric_features.shape[1] == 0:
        # Nothing to correlate; return an empty result of the expected type
        return pd.Series(dtype=float)
    return numeric_features.corrwith(target).abs().sort_values(ascending=False)


if 'X_transformed' in locals() and y is not None:
    # Encode the target as 0/1 for the correlation.
    # NOTE(review): assumes the positive class is labelled 'Yes'; any other
    # encoding makes y_binary all zeros and every correlation NaN. Confirm
    # against the raw data.
    y_binary = (y == 'Yes').astype(int)

    # Calculate correlations of every feature with the target
    correlations = rank_features_by_target_correlation(X_transformed, y_binary)

    print("Top 20 Features by Correlation with Target:")
    print("="*60)
    display(correlations.head(20))

    # Visualization
    plt.figure(figsize=(12, 8))
    correlations.head(20).plot(kind='barh', color='teal')
    plt.title('Top 20 Features by Correlation with Warranty Claim',
              fontsize=14, fontweight='bold')
    plt.xlabel('Absolute Correlation')
    plt.ylabel('Feature')
    plt.tight_layout()

    # savefig does not create missing directories; make sure the target exists
    output_path = '../reports/visualizations/feature_target_correlation.png'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()

## 6. Save Processed Data

In [None]:
if 'X_transformed' in locals() and y is not None:
    import os

    features_path = '../data/processed/X_processed.csv'
    target_path = '../data/processed/y_target.csv'
    preprocessor_path = '../models/preprocessor.pkl'

    # Neither to_csv nor joblib.dump creates missing parent directories,
    # so make sure every output folder exists before writing.
    for output_path in (features_path, target_path, preprocessor_path):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save processed data (index=False: the row index is not written)
    X_transformed.to_csv(features_path, index=False)
    y.to_csv(target_path, index=False)

    print("✓ Processed data saved to data/processed/")
    print(f"  - X_processed.csv: {X_transformed.shape}")
    print(f"  - y_target.csv: {y.shape}")

    # Save the fitted preprocessor object.
    # joblib is imported only here, after the CSVs are written, so a missing
    # joblib install does not prevent the data files from being saved.
    import joblib
    joblib.dump(preprocessor, preprocessor_path)
    print("\n✓ Preprocessor saved to models/preprocessor.pkl")

## 7. Summary

In [None]:
if 'X_transformed' in locals():
    # Final report: feature counts, the techniques applied and the follow-up work
    heavy_rule = "="*80
    n_original = X.shape[1]
    n_engineered = X_transformed.shape[1]

    techniques_applied = (
        "  ✓ Batch quality indicators (age, failure rate, size)",
        "  ✓ Interaction features (multiplicative & ratio)",
        "  ✓ Supplier failure rate encoding",
        "  ✓ Anomaly detection scores",
        "  ✓ Time-series features from serial numbers",
        "  ✓ Label encoding for categoricals",
        "  ✓ Missing value imputation",
        "  ✓ Feature scaling (standardization)",
    )
    next_steps = (
        "  → Model Training with XGBoost (Notebook 03)",
        "  → Hyperparameter optimization with Optuna",
        "  → Ensemble methods",
    )

    print(heavy_rule)
    print("FEATURE ENGINEERING SUMMARY")
    print(heavy_rule)
    print(f"\nOriginal Features: {n_original}")
    print(f"Engineered Features: {n_engineered}")
    print(f"Total New Features: {n_engineered - n_original}")

    print("\nFeature Engineering Techniques Applied:")
    for technique in techniques_applied:
        print(technique)

    print("\nNext Steps:")
    for step in next_steps:
        print(step)

    print("\n" + heavy_rule)
    print(f"Feature engineering completed: {datetime.now()}")
    print(heavy_rule)