In [None]:
# Feature Engineering for Water Pump Classification
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Read the cleaned feature table and the training labels (adjust the paths
# as needed), then keep only the rows whose 'id' is present in both.
data_path = '../data/cleaned_data_filled_V5.csv'
labels_path = '../data/train_labels.csv'
features, labels = (pd.read_csv(path) for path in (data_path, labels_path))
df = features.merge(labels, on='id', how='inner')

# -------- Ticket 2.1.1: New Features --------
# Pump age in years, measured against a fixed reference year.
current_year = 2025
age_years = current_year - df['construction_year']
# A construction_year of 0 yields an age equal to current_year; those rows
# are treated as "age unknown" and set to NaN.
df['pump_age'] = age_years.replace(current_year, np.nan)

# One combined category of the form "<water_quality>_<quantity>".
quality_text = df['water_quality'].astype(str)
quantity_text = df['quantity'].astype(str)
df['quality_quantity'] = quality_text + '_' + quantity_text

# -------- Ticket 2.1.2: Transform Features --------
# Log-transform the skewed numeric features; log1p maps 0 to 0, so zero
# amounts/populations stay finite.
# NOTE(review): assumes amount_tsh and population are non-negative — confirm.
df['log_amount_tsh'] = np.log1p(df['amount_tsh'])
df['log_population'] = np.log1p(df['population'])

# Bin pump age into labelled ranges.
# include_lowest=True keeps pumps with age 0 in the '0-10' bin (pd.cut's
# default left-open first interval would turn them into NaN), and the open
# upper edge lets '50+' cover every age above 50 instead of stopping at 100.
# Missing or negative ages still fall outside every bin and remain NaN.
age_bin_edges = [0, 10, 20, 30, 50, np.inf]
age_bin_labels = ['0-10', '10-20', '20-30', '30-50', '50+']
df['age_bin'] = pd.cut(df['pump_age'], bins=age_bin_edges,
                       labels=age_bin_labels, include_lowest=True)

# -------- Ticket 2.1.3: Handle Categorical Features --------
# High-cardinality columns: replace each value by how often it occurs,
# stored in a new '<column>_freq' column (the original column is kept).
high_card_cols = ('funder', 'installer')
for column in high_card_cols:
    counts_by_value = df[column].value_counts().to_dict()
    df[f'{column}_freq'] = df[column].map(counts_by_value)

# Medium-cardinality columns: expand into indicator columns, dropping the
# first level of each so the indicators are not redundant.
medium_card_cols = ['water_quality', 'quantity', 'source', 'basin']
df = pd.get_dummies(df, columns=medium_card_cols, drop_first=True)

# -------- Ticket 2.1.4: Feature Interaction Terms --------
# Each interaction feature is the element-wise product of two source columns:
# pump age x water amount, and population x GPS height.
interaction_specs = {
    'age_times_amount': ('pump_age', 'amount_tsh'),
    'pop_gps_interaction': ('population', 'gps_height'),
}
for new_col, (left_col, right_col) in interaction_specs.items():
    df[new_col] = df[left_col] * df[right_col]

# -------- Ticket 2.1.5: Documentation (Partial Example) --------
# One (feature, description, reason) row per documented feature; the rows are
# turned into dicts so feature_doc stays a list of records.
doc_fields = ('feature', 'description', 'reason')
doc_rows = (
    ('pump_age', 'Years since pump construction', 'Older pumps may fail more often'),
    ('quality_quantity', 'Combined quality and quantity', 'May reflect usability or risk'),
    ('log_amount_tsh', 'Log-transformed amount of water', 'Reduces skew'),
    ('funder_freq', 'Frequency of funder names', 'Avoid high-cardinality problems'),
    ('age_times_amount', 'Interaction of age and water amount', 'Can show degradation impact'),
)
feature_doc = [dict(zip(doc_fields, row)) for row in doc_rows]
feature_doc_df = pd.DataFrame(feature_doc)
# Heading and table are emitted on consecutive lines.
print("\nFeature Engineering Documentation:\n", feature_doc_df, sep='\n')

# Write the engineered dataset to CSV (without the index column), creating
# the destination directory first if it does not exist yet.
output_path = '../data/engineered_data_V1.csv'
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)
print("\n✅ Feature engineering complete. Dataset saved.")