# 02 - Data Preprocessing

This notebook demonstrates data preprocessing techniques for software effort estimation, using the COCOMO'81 dataset as the running example.

## Contents
1. Data loading
2. Handling missing values
3. Feature scaling
4. Feature engineering
5. Train-test split

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Add the parent directory to the import path so the `src` imports below resolve.
sys.path.append('..')

from src.data.data_loader import DataLoader
from src.data.feature_engineering import FeatureEngineer, engineer_features
from src.data.preprocessor import DataPreprocessor

%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')

## 1. Load Data

In [None]:
# Dataset identifier handed to DataLoader as its only argument.
DATASET_NAME = 'cocomo81'

loader = DataLoader(DATASET_NAME)
df = loader.load_raw_data()

# Report the dimensions of the raw table.
raw_shape = df.shape
print(f"Original data shape: {raw_shape}")

# First rows of the raw table, rendered as the cell output.
df.head()

## 2. Handling Missing Values

In [None]:
# Count the null entries in each column of the raw frame.
missing_per_column = df.isnull().sum()

print("Missing values per column:")
print(missing_per_column)

In [None]:
# Demonstrate missing value handling on a copy, so the raw frame `df` stays untouched.
preprocessor = DataPreprocessor()

# Inject NaNs at two fixed (row, column) positions purely for the demonstration.
df_with_missing = df.copy()
for row_pos, col_pos in [(0, 0), (5, 3)]:
    df_with_missing.iloc[row_pos, col_pos] = np.nan

# Total number of null cells before imputation.
n_missing_before = df_with_missing.isnull().sum().sum()
print("Before handling:")
print(n_missing_before, "missing values")

# Fill the gaps using the preprocessor's 'mean' strategy.
df_clean = preprocessor.handle_missing_values(df_with_missing, strategy='mean')

# Total number of null cells after imputation.
n_missing_after = df_clean.isnull().sum().sum()
print("\nAfter handling:")
print(n_missing_after, "missing values")

## 3. Feature Scaling

In [None]:
# Split the raw frame positionally: every column except the last is a feature,
# the last column is the target.
feature_frame = df.iloc[:, :-1]
target_column = df.iloc[:, -1]
X = feature_frame.values
y = target_column.values

# Statistics pooled over the whole feature matrix (all columns together).
x_mean, x_std = X.mean(), X.std()
x_min, x_max = X.min(), X.max()

print(f"Original X statistics:")
print(f"  Mean: {x_mean:.4f}")
print(f"  Std: {x_std:.4f}")
print(f"  Min: {x_min:.4f}")
print(f"  Max: {x_max:.4f}")

In [None]:
# Standard scaling, using a fresh DataPreprocessor instance.
preprocessor = DataPreprocessor()
X_standard = preprocessor.scale_features(X, method='standard')

# Statistics pooled over the whole scaled matrix.
standard_mean, standard_std = X_standard.mean(), X_standard.std()
standard_min, standard_max = X_standard.min(), X_standard.max()

print(f"\nAfter Standard Scaling:")
print(f"  Mean: {standard_mean:.4f} (should be ~0)")
print(f"  Std: {standard_std:.4f} (should be ~1)")
print(f"  Min: {standard_min:.4f}")
print(f"  Max: {standard_max:.4f}")

In [None]:
# MinMax scaling, using a separate DataPreprocessor instance from the standard-scaling cell.
preprocessor2 = DataPreprocessor()
X_minmax = preprocessor2.scale_features(X, method='minmax')

# Extremes of the whole scaled matrix.
minmax_low, minmax_high = X_minmax.min(), X_minmax.max()

print(f"\nAfter MinMax Scaling:")
print(f"  Min: {minmax_low:.4f} (should be 0)")
print(f"  Max: {minmax_high:.4f} (should be 1)")

In [None]:
# Visualize scaling effect: one boxplot panel per variant (original, standard,
# min-max), each showing the first five feature columns.

# NOTE(review): these labels assume the first five columns of X are in exactly
# this order - confirm against the loader's column order.
shown_features = ['rely', 'data', 'cplx', 'time', 'stor']
n_shown = len(shown_features)

# (panel title, feature matrix) pairs, drawn left to right.
panels = [
    ('Original Features (first 5)', X),
    ('Standard Scaled Features (first 5)', X_standard),
    ('MinMax Scaled Features (first 5)', X_minmax),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 4))
for ax, (panel_title, matrix) in zip(axes, panels):
    ax.boxplot(matrix[:, :n_shown])
    ax.set_title(panel_title)
    ax.set_xticklabels(shown_features)

fig.tight_layout()

# savefig does not create missing directories; make sure the target folder
# exists so the cell also runs on a fresh checkout.
figure_path = Path('../reports/figures/scaling_comparison.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=150)
plt.show()

## 4. Feature Engineering

In [None]:
# Shared FeatureEngineer instance; the cells below reuse it.
engineer = FeatureEngineer()

# Derive the EAF feature from the raw feature matrix.
# NOTE(review): EAF presumably stands for COCOMO's Effort Adjustment Factor - confirm in FeatureEngineer.
X_eaf = engineer.create_eaf_feature(X)

eaf_shape = X_eaf.shape
print(f"After adding EAF feature: {eaf_shape}")

In [None]:
# Derive the size-based features from the raw feature matrix.
X_size = engineer.create_size_derived_features(X)

size_shape = X_size.shape
print(f"After adding size features: {size_shape}")
print("Added: log_loc, sqrt_loc, loc_squared")

In [None]:
# Score every feature against the target; `importance` is reused by the plotting cell below.
feature_names = loader.get_feature_names()
importance = engineer.get_feature_importance(X, y, feature_names)

# Columns shown in the printed report, in display order.
report_columns = ['Feature', 'F_Score', 'MI_Score', 'Correlation', 'Avg_Rank']

print("\nFeature Importance (Top 10):")
top_ten = importance[report_columns].head(10)
print(top_ten)

In [None]:
# Visualize feature importance as a horizontal bar chart of F-scores.
fig, ax = plt.subplots(figsize=(12, 6))

# Sort ascending so the highest-scoring feature ends up at the top of the chart.
importance_sorted = importance.sort_values('F_Score', ascending=True)
ax.barh(importance_sorted['Feature'], importance_sorted['F_Score'])
ax.set_xlabel('F-Score')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance (F-Score)')
fig.tight_layout()

# savefig does not create missing directories; make sure the target folder
# exists so the cell also runs on a fresh checkout.
figure_path = Path('../reports/figures/feature_importance.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=150)
plt.show()

## 5. Train-Test Split

In [None]:
# Complete preprocessing pipeline on the raw frame.
# The result is bound to its own names so the raw X / y created in the
# feature-scaling section are not overwritten: re-running the scaling or
# feature-engineering cells after this one still operates on the raw arrays.
preprocessor = DataPreprocessor()
X_processed, y_processed = preprocessor.preprocess_pipeline(df, scale=True)

# NOTE(review): with scale=True the scaler is presumably fitted on the full
# dataset before the split below, which would leak test-set statistics into
# training - confirm in DataPreprocessor and, if so, fit on the training
# portion only.

# Split data: 80% train / 20% test, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, test_size=0.2, random_state=42
)

print("Data Split:")
print(f"  Training set: {X_train.shape[0]} samples")
print(f"  Test set: {X_test.shape[0]} samples")
print(f"\nFeatures: {X_train.shape[1]}")

In [None]:
# Compare the target distribution of the two splits side by side.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_train, ax_test = axes

# Left panel: training targets.
ax_train.hist(y_train, bins=15, edgecolor='black', alpha=0.7)
ax_train.set(
    title=f'Training Set (n={len(y_train)})',
    xlabel='Effort',
    ylabel='Frequency',
)

# Right panel: test targets, drawn in orange to tell the panels apart.
ax_test.hist(y_test, bins=15, edgecolor='black', alpha=0.7, color='orange')
ax_test.set(title=f'Test Set (n={len(y_test)})', xlabel='Effort')

plt.tight_layout()
plt.show()

## Summary

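### Preprocessing Steps Covered:
1. **Missing Value Handling**: Mean imputation (demonstrated on artificially injected missing values)
2. **Feature Scaling**: StandardScaler (zero mean, unit variance), compared against min-max scaling (range 0 to 1)
3. **Feature Engineering**: EAF calculation, size-derived features, and feature-importance ranking
4. **Train-Test Split**: 80-20 split with a fixed random state for reproducibility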