In [30]:
# 02_feature_engineering.ipynb

# --- IMPORTS ---
import os
import pandas as pd

# --- PATH SETUP ---
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))  # notebooks/ -> project root
DATA_RAW = os.path.join(PROJECT_ROOT, "data", "raw")
DATA_PROCESSED = os.path.join(PROJECT_ROOT, "data", "processed")

# --- LOAD DATA ---
# Load weather data
weather_df = pd.read_csv(os.path.join(DATA_RAW, "GlobalWeatherRepository.csv"))

# Load air quality data
air_quality_df = pd.read_csv(os.path.join(DATA_PROCESSED, "cleaned_air_quality_data.csv"))

# --- INSPECT DATA ---
print("Weather columns:", weather_df.columns)
print("Air quality columns:", air_quality_df.columns)

# --- FEATURE ENGINEERING ---

# ----- WEATHER FEATURES -----
# Example: Temperature difference (Celsius vs Feels like)
if 'feels_like_celsius' in weather_df.columns and 'temperature_celsius' in weather_df.columns:
    weather_df['temp_feels_diff'] = weather_df['temperature_celsius'] - weather_df['feels_like_celsius']

# ----- AIR QUALITY FEATURES -----
# PM2.5 / PM10 ratio
air_quality_df['pm25_pm10_ratio'] = air_quality_df['air_quality_PM2.5'] / air_quality_df['air_quality_PM10']

# CO / NO2 ratio
air_quality_df['co_no2_ratio'] = air_quality_df['air_quality_Carbon_Monoxide'] / air_quality_df['air_quality_Nitrogen_dioxide']

# AQI normalization example (scale between 0 and 1)
aqi_cols = ['air_quality_PM2.5', 'air_quality_PM10', 'air_quality_Carbon_Monoxide', 
            'air_quality_Ozone', 'air_quality_Nitrogen_dioxide', 'air_quality_Sulphur_dioxide']
for col in aqi_cols:
    if col in air_quality_df.columns:
        air_quality_df[col + '_norm'] = (air_quality_df[col] - air_quality_df[col].min()) / (air_quality_df[col].max() - air_quality_df[col].min())

# ----- MERGE FEATURES -----
# Since there is no timestamp, merge by index (row-wise) only if same number of rows
if len(weather_df) == len(air_quality_df):
    features_df = pd.concat([weather_df.reset_index(drop=True), air_quality_df.reset_index(drop=True)], axis=1)
else:
    print("Warning: Cannot merge weather and air quality by index. Saving separate features.")
    features_df = air_quality_df.copy()  # fallback to air quality features only

# --- SAVE FEATURES ---
features_csv_path = os.path.join(DATA_PROCESSED, "features_final.csv")
features_df.to_csv(features_csv_path, index=False)
print(f"Features saved to: {features_csv_path}")
print("Features shape:", features_df.shape)

# --- QUICK INSPECTION ---
features_df.head()


Weather columns: Index(['country', 'location_name', 'latitude', 'longitude', 'timezone',
       'last_updated_epoch', 'last_updated', 'temperature_celsius',
       'temperature_fahrenheit', 'condition_text', 'wind_mph', 'wind_kph',
       'wind_degree', 'wind_direction', 'pressure_mb', 'pressure_in',
       'precip_mm', 'precip_in', 'humidity', 'cloud', 'feels_like_celsius',
       'feels_like_fahrenheit', 'visibility_km', 'visibility_miles',
       'uv_index', 'gust_mph', 'gust_kph', 'air_quality_Carbon_Monoxide',
       'air_quality_Ozone', 'air_quality_Nitrogen_dioxide',
       'air_quality_Sulphur_dioxide', 'air_quality_PM2.5', 'air_quality_PM10',
       'air_quality_us-epa-index', 'air_quality_gb-defra-index', 'sunrise',
       'sunset', 'moonrise', 'moonset', 'moon_phase', 'moon_illumination'],
      dtype='object')
Air quality columns: Index(['country', 'location_name', 'latitude', 'longitude', 'timezone',
       'temperature_celsius', 'condition_text', 'wind_kph', 'wind_degree'

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM10,air_quality_us-epa-index,pm25_pm10_ratio,co_no2_ratio,air_quality_PM2.5_norm,air_quality_PM10_norm,air_quality_Carbon_Monoxide_norm,air_quality_Ozone_norm,air_quality_Nitrogen_dioxide_norm,air_quality_Sulphur_dioxide_norm
0,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,1715849100,2024-05-16 13:15,26.6,79.8,Partly Cloudy,...,26.6,1,0.315789,251.818182,0.005101,0.237748,0.210236,0.214271,0.002572,0.950464
1,Albania,Tirana,41.33,19.82,Europe/Tirane,1715849100,2024-05-16 10:45,19.0,66.2,Partly cloudy,...,2.0,1,0.55,215.111111,0.000577,0.234629,0.20853,0.202413,0.002104,0.950455
2,Algeria,Algiers,36.76,3.05,Africa/Algiers,1715849100,2024-05-16 09:45,23.0,73.4,Sunny,...,18.4,1,0.565217,8.305684,0.00634,0.236708,0.215631,0.02538,0.152209,0.951719
3,Andorra,Andorra La Vella,42.5,1.52,Europe/Andorra,1715849100,2024-05-16 10:45,6.3,43.3,Light drizzle,...,0.9,1,0.777778,106.375,0.00033,0.234489,0.208051,0.133971,0.003741,0.950464
4,Angola,Luanda,-8.84,13.23,Africa/Luanda,1715849100,2024-05-16 09:45,26.0,78.8,Partly cloudy,...,262.3,5,0.699199,40.770289,0.113531,0.267639,0.265209,0.039526,0.169979,0.95344
