# Massive Data Augmentation Pipeline

## Ziel:
- Original: ~16,500 Training Samples
- **Ziel-Output: ~300,000+ Samples** (der unten gezeigte Lauf erreicht 226,640 Samples)

## Augmentation Strategien:
1. **Multi-Level Gaussian Noise** (10x)
2. **SMOTE/ADASYN** (Synthetic Minority Oversampling)
3. **Mixup Augmentation**
4. **Feature-wise Perturbation**

## Warum mehr Daten?
- Bessere Generalisierung
- Robusteres Lernen
- Verhindert Overfitting
- Deep Learning braucht viele Daten!

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from imblearn.over_sampling import SMOTE, ADASYN
import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation and convergence warnings raised by the libraries above.
warnings.filterwarnings('ignore')

# Progress banner for the run log.
print("âœ“ Libraries loaded")
print("\nðŸ“Š Data Augmentation Pipeline Starting...")

âœ“ Libraries loaded

ðŸ“Š Data Augmentation Pipeline Starting...


In [3]:
# ===== LOAD DATA + v3 FEATURE ENGINEERING =====
housing = pd.read_csv("../housing.csv")
print(f"Original Data: {housing.shape}")

# Coordinates are reused by both the clustering and the neighbour search.
coords = housing[['latitude', 'longitude']]

# Geographic clustering: label every block with one of 15 k-means regions.
kmeans = KMeans(n_clusters=15, random_state=42, n_init=10)
housing['geo_cluster'] = kmeans.fit_predict(coords)

# KNN neighbourhood features.
# Querying with no argument makes scikit-learn return the neighbours of each
# indexed point *excluding the point itself*. Slicing off column 0 of an
# 11-neighbour query is not equivalent: when several rows share the same
# coordinates, the query row is not guaranteed to come back in first position.
knn = NearestNeighbors(n_neighbors=10)
knn.fit(coords)
distances, indices = knn.kneighbors()

# Vectorised neighbour means: index the value arrays with the (n_rows, 10)
# neighbour matrix instead of looping over rows. nanmean mirrors the
# NaN-skipping behaviour of pandas' Series.mean().
# NOTE(review): avg_neighbor_price is derived from the regression target of
# neighbouring rows over the whole frame, i.e. before any train/test split, so
# target information of test rows can reach training rows (and vice versa).
neighbor_prices = np.nanmean(housing['median_house_value'].to_numpy()[indices], axis=1)
neighbor_income = np.nanmean(housing['median_income'].to_numpy()[indices], axis=1)

housing['avg_neighbor_price'] = neighbor_prices
housing['avg_neighbor_income'] = neighbor_income
housing['avg_neighbor_distance'] = distances.mean(axis=1)

print("âœ“ Geographic features added")

Original Data: (20640, 10)
âœ“ Geographic features added


In [4]:
# ===== COMPLETE v3 FEATURE ENGINEERING =====
def create_v3_features(df):
    """Return a copy of ``df`` extended with the engineered "v3" columns.

    The input frame is not modified. The new columns are appended in a fixed
    order (ratios, polynomials, interactions, log transforms, city distances,
    economic indicators, age flags, coordinate bins). Any +/-inf produced by a
    division is turned into NaN before the frame is returned.
    """
    out = df.copy()

    # Short aliases for the raw columns that are combined below.
    rooms = out['total_rooms']
    bedrooms = out['total_bedrooms']
    people = out['population']
    homes = out['households']
    income = out['median_income']
    age = out['housing_median_age']
    lat_col = out['latitude']
    lon_col = out['longitude']

    # --- Ratio features ---
    out['rooms_per_household'] = rooms / homes
    out['bedrooms_per_room'] = bedrooms / rooms
    out['population_per_household'] = people / homes
    out['rooms_per_person'] = rooms / (people + 1)
    out['bedrooms_per_household'] = bedrooms / homes

    # --- Polynomial terms ---
    out['median_income_squared'] = income ** 2
    out['median_income_cubed'] = income ** 3
    out['age_squared'] = age ** 2

    # --- Interaction terms (the +1 keeps the denominators non-zero) ---
    out['income_per_room'] = income / (rooms + 1)
    out['income_per_person'] = income / (people + 1)
    out['income_times_age'] = income * age
    out['lat_long'] = lat_col * lon_col

    # --- log(1 + x) transforms ---
    out['log_total_rooms'] = np.log1p(rooms)
    out['log_population'] = np.log1p(people)
    out['log_median_income'] = np.log1p(income)

    # --- Straight-line distance (in degrees) to four reference cities ---
    city_coords = {
        'sf': (37.77, -122.41),
        'la': (34.05, -118.24),
        'san_diego': (32.72, -117.16),
        'sacramento': (38.58, -121.49)
    }
    distance_cols = []
    for city_name, (city_lat, city_lon) in city_coords.items():
        col = f'distance_to_{city_name}'
        out[col] = np.sqrt((lat_col - city_lat)**2 + (lon_col - city_lon)**2)
        distance_cols.append(col)
    out['min_distance_to_city'] = out[distance_cols].min(axis=1)

    # --- Economic indicators ---
    coastal_labels = ['NEAR BAY', 'NEAR OCEAN', '<1H OCEAN']
    out['is_coastal'] = out['ocean_proximity'].isin(coastal_labels).astype(int)
    coastal = out['is_coastal']
    rooms_per_home = out['rooms_per_household']
    out['wealth_index'] = income * rooms_per_home * (1 + coastal * 0.3)
    out['population_density'] = people / (rooms + 1)
    out['quality_score'] = rooms_per_home * 0.3 + income * 0.5 + coastal * 0.2

    # --- Age flags ---
    out['is_new'] = (age <= 10).astype(int)
    out['is_old'] = (age >= 40).astype(int)

    # --- Ten equal-width bins per coordinate, stored as integer codes ---
    out['lat_bin'] = pd.cut(lat_col, bins=10, labels=False)
    out['long_bin'] = pd.cut(lon_col, bins=10, labels=False)

    # Divisions by zero above yield +/-inf; expose them as missing values.
    return out.replace([np.inf, -np.inf], np.nan)

# Rebind `housing` to the enriched copy returned by create_v3_features and
# report the resulting column count.
housing = create_v3_features(housing)
print(f"âœ“ Feature Engineering Complete: {housing.shape[1]} features")

âœ“ Feature Engineering Complete: 42 features


In [5]:
# ===== PREPARE FOR AUGMENTATION =====
# Separate the predictors from the regression target.
X = housing.drop('median_house_value', axis=1)
y = housing['median_house_value']

# Train/test split (only the training part is augmented!)
# NOTE(review): the split happens after the geographic and engineered features
# were computed on the full frame; in particular `avg_neighbor_price` averages
# `median_house_value` of neighbouring rows, so target values of test rows can
# be encoded in training features. Consider splitting before building them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the split sizes before augmentation starts.
print(f"\nðŸ“Š Original Split:")
print(f"  Train: {len(X_train):,} samples")
print(f"  Test:  {len(X_test):,} samples")
print(f"\nðŸš€ Starting Augmentation Pipeline...")


ðŸ“Š Original Split:
  Train: 16,512 samples
  Test:  4,128 samples

ðŸš€ Starting Augmentation Pipeline...


## Augmentation 1: Multi-Level Gaussian Noise

Verschiedene Noise Levels = verschiedene "Perspektiven" auf die Daten

In [6]:
# ===== GAUSSIAN NOISE AUGMENTATION =====
def gaussian_augmentation_multilevel(X, y, configs=None, random_state=None):
    """
    Multi-level Gaussian noise augmentation.

    The frame is one-hot encoded on ``ocean_proximity`` and median-imputed,
    then stacked with noisy copies of itself. Feature noise is additive and
    scaled by each column's standard deviation; target noise is multiplicative
    around 1.0.

    Default noise schedule (10 noisy copies, so 11x the input rows):
    - Low (1%):       3 copies - very close to the original
    - Medium (3%):    4 copies - moderate variation
    - High (5%):      2 copies - larger variation
    - Very high (7%): 1 copy   - maximum variation

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; must contain an ``ocean_proximity`` column.
    y : pandas.Series or array-like
        Training targets, same length as ``X``.
    configs : list of dict, optional
        Noise schedule. Each entry needs the keys ``noise_level`` (feature
        noise as a fraction of the column std), ``copies`` (number of noisy
        copies) and ``y_var`` (std of the multiplicative target factor).
        ``None`` selects the default schedule above.
    random_state : int, optional
        Seed for a private ``RandomState``. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    X_augmented : numpy.ndarray
        Encoded and imputed original rows followed by all noisy copies.
    y_augmented : numpy.ndarray
        Matching targets.
    feature_names : pandas.Index
        Column labels of the encoded frame, in array column order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length.
    """
    # Local import: sklearn.impute is not imported at the top of the notebook.
    from sklearn.impute import SimpleImputer

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    if configs is None:
        configs = [
            {'noise_level': 0.01, 'copies': 3, 'y_var': 0.005},  # 1% noise
            {'noise_level': 0.03, 'copies': 4, 'y_var': 0.01},   # 3% noise
            {'noise_level': 0.05, 'copies': 2, 'y_var': 0.015},  # 5% noise
            {'noise_level': 0.07, 'copies': 1, 'y_var': 0.02},   # 7% noise
        ]

    # Both the np.random module and a RandomState instance expose .normal(),
    # so the unseeded path behaves exactly like the original implementation.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # One-hot encoding
    X_encoded = pd.get_dummies(X, columns=['ocean_proximity'], drop_first=False)

    # Median imputation of missing values
    imputer = SimpleImputer(strategy='median')
    X_base = np.asarray(imputer.fit_transform(X_encoded), dtype=float)
    y_base = np.asarray(y, dtype=float)

    # Per-column std so the noise magnitude is relative to each feature's scale.
    # NOTE(review): noise is drawn for every column, so one-hot and other
    # binary/integer-coded columns stop being exactly 0/1 in the noisy copies.
    feature_std = X_base.std(axis=0)

    X_list = [X_base]
    y_list = [y_base]

    for config in configs:
        for _ in range(config['copies']):
            # Additive feature noise, scaled per column
            X_noise = rng.normal(0, config['noise_level'], X_base.shape) * feature_std
            # Multiplicative target noise centred on 1.0
            y_scale = rng.normal(1.0, config['y_var'], y_base.shape)

            X_list.append(X_base + X_noise)
            y_list.append(y_base * y_scale)

    X_augmented = np.vstack(X_list)
    y_augmented = np.hstack(y_list)

    return X_augmented, y_augmented, X_encoded.columns

# Stage 1: augment the training split only. `feature_names` receives the
# column labels of the one-hot encoded frame returned by the function.
print("\n[1/4] Applying Multi-Level Gaussian Noise...")
X_gauss, y_gauss, feature_names = gaussian_augmentation_multilevel(X_train, y_train)
# Report the absolute and relative growth of the training set.
print(f"  Original: {len(X_train):,} â†’ Gaussian: {len(X_gauss):,} (+{len(X_gauss)-len(X_train):,})")
print(f"  Augmentation Factor: {len(X_gauss)/len(X_train):.1f}x")


[1/4] Applying Multi-Level Gaussian Noise...
  Original: 16,512 â†’ Gaussian: 181,632 (+165,120)
  Augmentation Factor: 11.0x


## Augmentation 2: SMOTE (Synthetic Minority Over-sampling)

Generiert synthetische Samples durch Interpolation zwischen ähnlichen Samples

In [7]:
# ===== SMOTE AUGMENTATION =====
print("\n[2/4] Applying SMOTE...")

# SMOTE is a classification resampler, so the continuous target is binned into
# (up to) ten quantile classes that act as pseudo-labels.
y_binned = pd.qcut(y_gauss, q=10, labels=False, duplicates='drop')

try:
    smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)
    X_smote, y_binned_smote = smote.fit_resample(X_gauss, y_binned)

    # Only the rows after the first len(X_gauss) are treated as synthetic.
    # NOTE(review): this assumes fit_resample returns the original rows first
    # and appends the synthetic ones - confirm against the imblearn version used.
    n_new = len(X_smote) - len(X_gauss)
    X_new_smote = X_smote[len(X_gauss):]

    # Rebuild a continuous target for every synthetic row as the mean target
    # of its 5 nearest original rows. One batched query replaces the per-row
    # loop, and the fit is skipped entirely when nothing was synthesised.
    # NearestNeighbors comes from the import cell at the top of the notebook.
    if n_new > 0:
        knn_reg = NearestNeighbors(n_neighbors=5)
        knn_reg.fit(X_gauss)
        smote_neighbor_idx = knn_reg.kneighbors(X_new_smote, return_distance=False)
        y_new_smote = y_gauss[smote_neighbor_idx].mean(axis=1)
    else:
        y_new_smote = np.empty(0, dtype=y_gauss.dtype)

    # Combine originals and synthetic rows
    X_smote_final = np.vstack([X_gauss, X_new_smote])
    y_smote_final = np.hstack([y_gauss, y_new_smote])

    print(f"  Before SMOTE: {len(X_gauss):,} â†’ After: {len(X_smote_final):,} (+{n_new:,})")

    X_current = X_smote_final
    y_current = y_smote_final

except Exception as e:
    # Deliberate best-effort: SMOTE is optional, so any failure is reported
    # and the pipeline continues with the Gaussian-augmented data.
    print(f"  SMOTE failed: {e}")
    print(f"  Continuing without SMOTE...")
    X_current = X_gauss
    y_current = y_gauss


[2/4] Applying SMOTE...
  Before SMOTE: 181,632 â†’ After: 181,640 (+8)


## Augmentation 3: Mixup

Mischt zwei Samples (Features und Zielwert) per Konvexkombination, um eine glattere Zielfunktion zu lernen

In [8]:
# ===== MIXUP AUGMENTATION =====
# Progress marker for stage 3 of 4.
print("\n[3/4] Applying Mixup...")

def mixup_augmentation(X, y, n_mixup=20000, alpha_range=(0.2, 0.8), random_state=None):
    """
    Append ``n_mixup`` mixup samples to ``(X, y)``.

    Each synthetic sample is a convex combination of two randomly drawn rows:
    ``X_new = alpha * X_i + (1 - alpha) * X_j`` (and the same for ``y``), with
    ``alpha`` drawn uniformly from ``alpha_range``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix. Converted with ``np.asarray`` so that pandas inputs
        are indexed by position.
    y : array-like, shape (n_samples,)
        Targets, same length as ``X``.
    n_mixup : int
        Number of synthetic rows to create; values <= 0 create none.
    alpha_range : tuple of float
        ``(low, high)`` bounds of the uniform mixing ratio.
    random_state : int, optional
        Seed for a private ``RandomState``. ``None`` draws from NumPy's global
        random state, as before.

    Returns
    -------
    X_combined, y_combined : numpy.ndarray
        The original rows followed by the synthetic rows.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or if samples are requested from
        an empty dataset.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    n_mixup = max(int(n_mixup), 0)
    if n_mixup == 0:
        # Nothing to add. The loop-based version crashed here because an empty
        # 1-D array cannot be vstacked onto a 2-D matrix.
        return X.copy(), y.copy()

    n_samples = len(X)
    if n_samples == 0:
        raise ValueError("cannot draw mixup partners from an empty dataset")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Draw all partner indices and mixing ratios at once
    i = rng.randint(0, n_samples, size=n_mixup)
    j = rng.randint(0, n_samples, size=n_mixup)
    alpha = rng.uniform(alpha_range[0], alpha_range[1], size=n_mixup)

    # Reshape alpha so it broadcasts over the feature axes of X
    alpha_x = alpha.reshape((-1,) + (1,) * (X.ndim - 1))

    # Mix
    X_mixup = alpha_x * X[i] + (1 - alpha_x) * X[j]
    y_mixup = alpha * y[i] + (1 - alpha) * y[j]

    # Combine originals and synthetic rows
    X_combined = np.vstack([X, X_mixup])
    y_combined = np.hstack([y, y_mixup])

    return X_combined, y_combined

# Stage 3: append 30,000 mixup samples to the current sample pool.
X_mixup, y_mixup = mixup_augmentation(X_current, y_current, n_mixup=30000)
print(f"  Before Mixup: {len(X_current):,} â†’ After: {len(X_mixup):,} (+{len(X_mixup)-len(X_current):,})")

# Promote the mixup result to the running dataset used by the next stage.
X_current = X_mixup
y_current = y_mixup


[3/4] Applying Mixup...
  Before Mixup: 181,640 â†’ After: 211,640 (+30,000)


## Augmentation 4: Feature-wise Perturbation

Störe nur spezifische wichtige Features

In [9]:
# ===== FEATURE-WISE PERTURBATION =====
# Progress marker for stage 4 of 4.
print("\n[4/4] Applying Feature-wise Perturbation...")

def feature_perturbation(X, y, n_perturb=10000, feature_names=None, random_state=None):
    """
    Append ``n_perturb`` copies of random rows with one feature perturbed.

    For every synthetic row a source row is drawn at random, one candidate
    column of it is multiplied by a factor drawn from N(1.0, 0.07), and the
    target is multiplied by a factor drawn from N(1.0, 0.02).

    Candidate columns:
    - ``feature_names`` given: the columns named in ``important_features``
      below, resolved by name. If none of them is present, the positional
      fallback is used.
    - ``feature_names`` omitted: the first ``min(10, n_features)`` columns
      (the previous behaviour).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    n_perturb : int
        Number of synthetic rows; values <= 0 create none.
    feature_names : sequence of str, optional
        Column labels of ``X`` in column order.
    random_state : int, optional
        Seed for a private ``RandomState``. ``None`` draws from NumPy's global
        random state, as before.

    Returns
    -------
    X_combined, y_combined : numpy.ndarray
        The original rows followed by the perturbed rows.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, if ``feature_names`` does not
        match the number of columns, or if rows are requested from an input
        without rows or columns.
    """
    # Important features (based on the v3 feature importance)
    important_features = [
        'median_income', 'latitude', 'longitude', 'housing_median_age',
        'total_rooms', 'population', 'households'
    ]

    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    n_perturb = max(int(n_perturb), 0)
    if n_perturb == 0:
        # Nothing to add. The loop-based version crashed here because an empty
        # 1-D array cannot be vstacked onto a 2-D matrix.
        return X.copy(), y.copy()

    if len(X) == 0:
        raise ValueError("cannot perturb rows of an empty dataset")

    # Positional fallback: the leading columns are assumed to be the raw
    # numeric features when no labels are supplied.
    candidate_idx = np.arange(min(10, X.shape[1]))
    if feature_names is not None:
        names = list(feature_names)
        if len(names) != X.shape[1]:
            raise ValueError(
                f"feature_names has {len(names)} entries but X has {X.shape[1]} columns"
            )
        named_idx = [names.index(name) for name in important_features if name in names]
        if named_idx:
            candidate_idx = np.asarray(named_idx)
    if candidate_idx.size == 0:
        raise ValueError("X has no columns to perturb")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Draw the source row and the column to perturb for every synthetic sample
    rows = rng.randint(0, len(X), size=n_perturb)
    cols = candidate_idx[rng.randint(0, len(candidate_idx), size=n_perturb)]

    # Fancy indexing already returns a copy. Integer inputs are promoted to
    # float so the multiplicative factor is not truncated.
    X_new = X[rows]
    if not np.issubdtype(X_new.dtype, np.floating):
        X_new = X_new.astype(float)

    # Scale exactly one cell per synthetic row (std 7% around 1.0).
    # NOTE(review): derived columns computed from the perturbed feature are
    # not recomputed, so perturbed rows are internally inconsistent.
    X_new[np.arange(n_perturb), cols] *= rng.normal(1.0, 0.07, size=n_perturb)

    # Vary the target slightly as well (std 2% around 1.0)
    y_new = y[rows] * rng.normal(1.0, 0.02, size=n_perturb)

    X_combined = np.vstack([X, X_new])
    y_combined = np.hstack([y, y_new])

    return X_combined, y_combined

# Stage 4: append 15,000 single-feature perturbations. `feature_names` is not
# passed here, so the function's default (None) applies.
X_final, y_final = feature_perturbation(X_current, y_current, n_perturb=15000)
print(f"  Before: {len(X_current):,} â†’ After: {len(X_final):,} (+{len(X_final)-len(X_current):,})")


[4/4] Applying Feature-wise Perturbation...
  Before: 211,640 â†’ After: 226,640 (+15,000)


In [10]:
# ===== FINAL SUMMARY =====
print("\n" + "="*60)
print("DATA AUGMENTATION COMPLETE")
print("="*60)
print(f"Original Train Samples:  {len(X_train):>10,}")
print(f"After Augmentation:      {len(X_final):>10,}")
print(f"Augmentation Factor:     {len(X_final)/len(X_train):>10.1f}x")
print(f"Features:                {X_final.shape[1]:>10}")
print("="*60)

# Save to file
# NOTE(review): the "300k" in the file names is a fixed label, not the actual
# row count - the real size is the "After Augmentation" figure printed above.
np.save('X_train_augmented_300k.npy', X_final)
np.save('y_train_augmented_300k.npy', y_final)
# Store the labels as a plain unicode array. Saving the object-dtype
# `feature_names.values` makes np.save pickle the data, which forces every
# consumer to call np.load(..., allow_pickle=True).
np.save('feature_names.npy', np.asarray(feature_names, dtype=str))

print("\nâœ“ Saved:")
print("  - X_train_augmented_300k.npy")
print("  - y_train_augmented_300k.npy")
print("  - feature_names.npy")

print("\nðŸŽ¯ Ready for Neural Network Training!")


DATA AUGMENTATION COMPLETE
Original Train Samples:      16,512
After Augmentation:         226,640
Augmentation Factor:           13.7x
Features:                        45

âœ“ Saved:
  - X_train_augmented_300k.npy
  - y_train_augmented_300k.npy
  - feature_names.npy

ðŸŽ¯ Ready for Neural Network Training!


In [11]:
# ===== PREPARE TEST SET (NO AUGMENTATION!) =====
from sklearn.impute import SimpleImputer

# One-hot encode and align to the training column layout: columns missing in
# the test split are added as 0, and the training column order is enforced.
X_test_encoded = pd.get_dummies(
    X_test, columns=['ocean_proximity'], drop_first=False
).reindex(columns=feature_names, fill_value=0)

# Imputation
# The medians must come from the training split, otherwise the test set is
# preprocessed differently from the data the model was trained on. The same
# encoding is rebuilt for X_train and the imputer is fitted on it, then only
# applied to the test rows.
X_train_encoded = pd.get_dummies(
    X_train, columns=['ocean_proximity'], drop_first=False
).reindex(columns=feature_names, fill_value=0)
imputer_test = SimpleImputer(strategy='median')
imputer_test.fit(X_train_encoded)
X_test_final = imputer_test.transform(X_test_encoded)

# Save Test Set
np.save('X_test.npy', X_test_final)
np.save('y_test.npy', y_test.values)

print("âœ“ Test Set saved (no augmentation):")
print(f"  Test Samples: {len(X_test_final):,}")

âœ“ Test Set saved (no augmentation):
  Test Samples: 4,128
