# Data Processing for Nigeria: Climate, CO2, and Crop Yields

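This notebook loads the raw CSVs, standardizes column names, coerces measurement columns to numeric types (entries that cannot be parsed become missing values), aggregates monthly climate data into annual features, and writes processed datasets ready for the LSTM, FNN, and hybrid models.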

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
# Root directory under which every dataset path in this notebook is built.
# It is a relative path, so it resolves against the kernel's working directory.
base = Path('data')


In [None]:
# File locations, all rooted at `base`.
# Raw monthly climate table (input).
climate_monthly = base.joinpath('lstm_dataset', 'raw', 'regional_monthly_climate_nigeria.csv')
# Annual climate features: read later if the file already exists, and
# rewritten by the aggregation step.
climate_annual = base.joinpath('fnn_dataset', 'processed', 'regional_annual_climate_features.csv')
# Raw CO2 emissions and crop yield tables (inputs).
co2_csv = base.joinpath('fnn_dataset', 'raw', 'nigeria_co2_emissions.csv')
crop_csv = base.joinpath('fnn_dataset', 'raw', 'nigeria_crop_yields.csv')


In [None]:
# Read every input table into memory.
df_monthly = pd.read_csv(climate_monthly)

# The annual feature file is an output of this notebook, so it may not exist
# yet; fall back to an empty frame in that case.
if climate_annual.exists():
    df_annual = pd.read_csv(climate_annual)
else:
    df_annual = pd.DataFrame()

df_co2 = pd.read_csv(co2_csv)
df_crops = pd.read_csv(crop_csv)

# Report (rows, columns) for each table as a quick sanity check.
loaded_tables = (
    ('Monthly', df_monthly),
    ('Annual', df_annual),
    ('CO2', df_co2),
    ('Crops', df_crops),
)
for table_label, table in loaded_tables:
    print(table_label, table.shape)


## Cleaning: ensure numeric types and consistent column names

In [None]:
# Translate the raw climate column codes into descriptive, unit-suffixed names.
# Codes that are absent from the frame are simply ignored by `rename`.
climate_column_names = {
    'PRECTOTCORR': 'rainfall_mm',
    'T2M': 'temp_avg_c',
    'T2M_MAX': 'temp_max_c',
    'T2M_MIN': 'temp_min_c',
}
df_monthly = df_monthly.rename(columns=climate_column_names)

# Force the climate measurements that are present to numeric dtype; any value
# that cannot be parsed is turned into NaN instead of raising.
present_climate_columns = [
    column_name
    for column_name in ['rainfall_mm', 'temp_avg_c', 'temp_max_c', 'temp_min_c']
    if column_name in df_monthly.columns
]
for column_name in present_climate_columns:
    df_monthly[column_name] = pd.to_numeric(df_monthly[column_name], errors='coerce')

# Same coercion for the CO2 series; this column is required (KeyError if absent).
co2_numeric = pd.to_numeric(df_co2['CO2_Emissions_kt'], errors='coerce')
df_co2['CO2_Emissions_kt'] = co2_numeric

# Give the crop yield column a unit-suffixed name when the raw name is present.
if 'Yield' in df_crops:
    df_crops = df_crops.rename(columns={'Yield': 'yield_kg_ha'})


## Aggregation: Monthly -> Annual features per region

In [None]:
# Collapse the monthly records into one row per (Region, Year):
#   annual_rainfall_mm  - sum of the monthly rainfall values
#   annual_mean_temp_c  - mean of the monthly average temperatures
#   annual_max_temp_c   - highest monthly maximum temperature
monthly_by_region_year = df_monthly.groupby(['Region', 'Year'])
agg = pd.DataFrame({
    # min_count=1 keeps a region-year whose rainfall values are all missing as
    # NaN. The default sum would report 0.0, which is indistinguishable from a
    # genuinely dry year once written to disk.
    'annual_rainfall_mm': monthly_by_region_year['rainfall_mm'].sum(min_count=1),
    'annual_mean_temp_c': monthly_by_region_year['temp_avg_c'].mean(),
    'annual_max_temp_c': monthly_by_region_year['temp_max_c'].max(),
}).reset_index()

# The processed folder may not exist on a fresh checkout (the loader already
# tolerates a missing annual file), so create it before writing.
climate_annual.parent.mkdir(parents=True, exist_ok=True)
agg.to_csv(climate_annual, index=False)
print('Saved aggregated annual climate to', climate_annual)


## Prepare FNN features and save

In [None]:
# Build the FNN feature table: average the regional annual climate features
# across regions for each year, then attach the national CO2 table by Year.
# The left join keeps every climate year even when no CO2 row matches.
annual_feature_columns = ['annual_rainfall_mm', 'annual_mean_temp_c', 'annual_max_temp_c']
yearly_climate = agg.groupby('Year')[annual_feature_columns].mean().reset_index()
fnn_features = yearly_climate.merge(df_co2, on='Year', how='left')

# Write the result, creating the output folder if necessary.
fnn_out = base.joinpath('fnn_dataset', 'processed', 'fnn_features.csv')
fnn_out.parent.mkdir(parents=True, exist_ok=True)
fnn_features.to_csv(fnn_out, index=False)
print('Saved FNN features to', fnn_out)


## Prepare LSTM sequences (example)

In [None]:
# No reshaping happens here: the current monthly frame is written out as-is,
# leaving the construction of per-region sequences to downstream processing.
seq_out = base.joinpath('lstm_dataset', 'sequences', 'lstm_monthly_sequences.csv')
seq_dir = seq_out.parent
seq_dir.mkdir(parents=True, exist_ok=True)
df_monthly.to_csv(seq_out, index=False)
print('Saved LSTM sequences (monthly) to', seq_out)


## Hybrid dataset: combine LSTM and FNN features for hybrid models

In [None]:
# Write a copy of the FNN feature table under the hybrid dataset tree,
# creating the destination folder first if it does not exist.
hybrid_processed_dir = base.joinpath('hybrid_dataset', 'fnn_dataset', 'processed')
hybrid_processed_dir.mkdir(parents=True, exist_ok=True)
hybrid_fnn_out = hybrid_processed_dir / 'hybrid_fnn_features.csv'
fnn_features.to_csv(hybrid_fnn_out, index=False)
print('Saved hybrid FNN features to', hybrid_fnn_out)


## Metadata
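Feature descriptions and placeholder scaling parameters are saved in each dataset's `metadata/` folder. The cell below only previews the first rows of the processed frames; it does not write any metadata.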

In [None]:
# Print a caption followed by the first rows of each processed frame.
preview_frames = (
    ('FNN features head:', fnn_features),
    ('Monthly head:', df_monthly),
)
for caption, preview_frame in preview_frames:
    print(caption)
    print(preview_frame.head())
