In [2]:
# Import Required Libraries
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Environment setup complete.")

Environment setup complete.


## 1. Define Paths and Load Processed Data

In [3]:
# Define project paths
BASE_DIR = Path(r'c:\Users\miray\Desktop\dengue_forecasting_project')
PROCESSED_DIR = BASE_DIR / 'data' / 'processed'

# Load processed datasets
weather_file = PROCESSED_DIR / 'weather_weekly_wide.csv'
dengue_file = PROCESSED_DIR / 'dengue_weekly_wide.csv'
trends_file = PROCESSED_DIR / 'search_trends_weekly_wide.csv'

print(f"Base Directory: {BASE_DIR}")
print(f"Processed Data Directory: {PROCESSED_DIR}")
print(f"\nLoading files...")

# Load with Date as index
df_weather = pd.read_csv(weather_file, parse_dates=['Date'], index_col='Date')
df_dengue = pd.read_csv(dengue_file, parse_dates=['Date'], index_col='Date')
df_trends = pd.read_csv(trends_file, parse_dates=['Date'], index_col='Date')

print(f"‚úÖ Weather data: {df_weather.shape}")
print(f"‚úÖ Dengue data: {df_dengue.shape}")
print(f"‚úÖ Search trends data: {df_trends.shape}")

Base Directory: c:\Users\miray\Desktop\dengue_forecasting_project
Processed Data Directory: c:\Users\miray\Desktop\dengue_forecasting_project\data\processed

Loading files...
‚úÖ Weather data: (526, 40)
‚úÖ Dengue data: (328, 8)
‚úÖ Search trends data: (362, 4)


## 2. Inspect Data Alignment

In [4]:
print("Date Range Comparison:")
print(f"  Weather:  {df_weather.index.min()} to {df_weather.index.max()} ({len(df_weather)} weeks)")
print(f"  Dengue:   {df_dengue.index.min()} to {df_dengue.index.max()} ({len(df_dengue)} weeks)")
print(f"  Trends:   {df_trends.index.min()} to {df_trends.index.max()} ({len(df_trends)} weeks)")

print(f"\nColumn Counts:")
print(f"  Weather features: {len(df_weather.columns)}")
print(f"  Dengue targets: {len(df_dengue.columns)}")
print(f"  Search trends: {len(df_trends.columns)}")

print(f"\nWeather columns (first 10):")
print(df_weather.columns[:10].tolist())

print(f"\nDengue columns:")
print(df_dengue.columns.tolist())

print(f"\nSearch trends columns:")
print(df_trends.columns.tolist())

Date Range Comparison:
  Weather:  2016-01-01 00:00:00 to 2025-12-03 00:00:00 (526 weeks)
  Dengue:   2016-01-08 00:00:00 to 2022-12-24 00:00:00 (328 weeks)
  Trends:   2016-01-01 00:00:00 to 2022-11-26 00:00:00 (362 weeks)

Column Counts:
  Weather features: 40
  Dengue targets: 8
  Search trends: 4

Weather columns (first 10):
['Aklan_Humidity', 'Aklan_Rainfall', 'Aklan_Temp_Avg', 'Aklan_Temp_Max', 'Aklan_Temp_Min', 'Antique_Humidity', 'Antique_Rainfall', 'Antique_Temp_Avg', 'Antique_Temp_Max', 'Antique_Temp_Min']

Dengue columns:
['Aklan_Cases', 'Antique_Cases', 'Bacolod City_Cases', 'Capiz_Cases', 'Guimaras_Cases', 'Iloilo_Cases', 'Iloilo City_Cases', 'Negros Occidental_Cases']

Search trends columns:
['Dengue_Search', 'Dengue_Symptoms_Search', 'Dengue_Fever_Search_1', 'Dengue_Fever_Search_2']


## 3. Check for Location Name Mismatches

In [5]:
# Extract location names from column prefixes
weather_locations = sorted(set([col.rsplit('_', 1)[0] for col in df_weather.columns]))
dengue_locations = sorted(set([col.rsplit('_', 1)[0] for col in df_dengue.columns]))

print("Location names in weather data:")
print(weather_locations)

print("\nLocation names in dengue data:")
print(dengue_locations)

# Check for mismatches
if weather_locations != dengue_locations:
    print("\n‚ö†Ô∏è Location name mismatch detected!")
    print(f"   Weather only: {set(weather_locations) - set(dengue_locations)}")
    print(f"   Dengue only: {set(dengue_locations) - set(weather_locations)}")
    
    # Fix: Standardize dengue column names to match weather
    # Weather uses: Bacolodcity, Iloilocity, Negrosocc
    # Dengue uses: Bacolod City, Iloilo City, Negros Occidental
    
    rename_map = {}
    for col in df_dengue.columns:
        new_col = col.replace('Bacolod City', 'Bacolodcity')
        new_col = new_col.replace('Iloilo City', 'Iloilocity')
        new_col = new_col.replace('Negros Occidental', 'Negrosocc')
        if new_col != col:
            rename_map[col] = new_col
    
    if rename_map:
        print(f"\nüîß Renaming dengue columns:")
        for old, new in rename_map.items():
            print(f"   {old} ‚Üí {new}")
        df_dengue = df_dengue.rename(columns=rename_map)
        print("‚úÖ Dengue columns standardized")
else:
    print("\n‚úÖ Location names match perfectly!")

Location names in weather data:
['Aklan', 'Aklan_Temp', 'Antique', 'Antique_Temp', 'Bacolodcity', 'Bacolodcity_Temp', 'Capiz', 'Capiz_Temp', 'Guimaras', 'Guimaras_Temp', 'Iloilo', 'Iloilo_Temp', 'Iloilocity', 'Iloilocity_Temp', 'Negrosocc', 'Negrosocc_Temp']

Location names in dengue data:
['Aklan', 'Antique', 'Bacolod City', 'Capiz', 'Guimaras', 'Iloilo', 'Iloilo City', 'Negros Occidental']

‚ö†Ô∏è Location name mismatch detected!
   Weather only: {'Bacolodcity_Temp', 'Capiz_Temp', 'Bacolodcity', 'Negrosocc', 'Antique_Temp', 'Iloilocity_Temp', 'Aklan_Temp', 'Negrosocc_Temp', 'Iloilocity', 'Guimaras_Temp', 'Iloilo_Temp'}
   Dengue only: {'Bacolod City', 'Negros Occidental', 'Iloilo City'}

üîß Renaming dengue columns:
   Bacolod City_Cases ‚Üí Bacolodcity_Cases
   Iloilo City_Cases ‚Üí Iloilocity_Cases
   Negros Occidental_Cases ‚Üí Negrosocc_Cases
‚úÖ Dengue columns standardized


## 4. Merge Datasets

In [6]:
# Merge step 1: Weather + Dengue (inner join on Date)
df_merged = df_weather.join(df_dengue, how='inner')
print(f"After merging weather + dengue: {df_merged.shape}")
print(f"  Date range: {df_merged.index.min()} to {df_merged.index.max()}")
print(f"  Weeks retained: {len(df_merged)}")

# Merge step 2: Add Search Trends (inner join on Date)
df_merged = df_merged.join(df_trends, how='inner')
print(f"\nAfter merging + search trends: {df_merged.shape}")
print(f"  Date range: {df_merged.index.min()} to {df_merged.index.max()}")
print(f"  Weeks retained: {len(df_merged)}")

# Check for missing values
missing_count = df_merged.isnull().sum().sum()
print(f"\nMissing values: {missing_count}")
if missing_count > 0:
    print("Columns with missing values:")
    print(df_merged.isnull().sum()[df_merged.isnull().sum() > 0])

After merging weather + dengue: (328, 48)
  Date range: 2016-01-08 00:00:00 to 2022-12-24 00:00:00
  Weeks retained: 328

After merging + search trends: (326, 52)
  Date range: 2016-01-08 00:00:00 to 2022-10-01 00:00:00
  Weeks retained: 326

Missing values: 0


## 5. Organize Column Order

In [7]:
# Organize columns: [Weather features] ‚Üí [Search trends] ‚Üí [Dengue targets]
weather_cols = [col for col in df_merged.columns if any(x in col for x in ['Humidity', 'Rainfall', 'Temp'])]
trends_cols = [col for col in df_merged.columns if 'Search' in col]
dengue_cols = [col for col in df_merged.columns if 'Cases' in col]

# Reorder
df_merged = df_merged[weather_cols + trends_cols + dengue_cols]

print(f"Column organization:")
print(f"  Weather features: {len(weather_cols)}")
print(f"  Search trends: {len(trends_cols)}")
print(f"  Dengue targets: {len(dengue_cols)}")
print(f"  Total columns: {len(df_merged.columns)}")

print(f"\nFirst 5 weather columns: {weather_cols[:5]}")
print(f"Search trends columns: {trends_cols}")
print(f"Dengue target columns: {dengue_cols}")

Column organization:
  Weather features: 40
  Search trends: 4
  Dengue targets: 8
  Total columns: 52

First 5 weather columns: ['Aklan_Humidity', 'Aklan_Rainfall', 'Aklan_Temp_Avg', 'Aklan_Temp_Max', 'Aklan_Temp_Min']
Search trends columns: ['Dengue_Search', 'Dengue_Symptoms_Search', 'Dengue_Fever_Search_1', 'Dengue_Fever_Search_2']
Dengue target columns: ['Aklan_Cases', 'Antique_Cases', 'Bacolodcity_Cases', 'Capiz_Cases', 'Guimaras_Cases', 'Iloilo_Cases', 'Iloilocity_Cases', 'Negrosocc_Cases']


## 6. Data Quality Summary

In [8]:
print("="*70)
print("MERGED DATASET SUMMARY")
print("="*70)

print(f"\nüìä Shape: {df_merged.shape}")
print(f"   Rows (weeks): {len(df_merged)}")
print(f"   Columns (features + targets): {len(df_merged.columns)}")

print(f"\nüìÖ Date Range:")
print(f"   Start: {df_merged.index.min().strftime('%Y-%m-%d')}")
print(f"   End: {df_merged.index.max().strftime('%Y-%m-%d')}")
print(f"   Duration: {(df_merged.index.max() - df_merged.index.min()).days} days (~{len(df_merged)} weeks)")

print(f"\nüìà Dengue Cases Summary (all locations):")
dengue_summary = df_merged[dengue_cols].describe()
print(dengue_summary.loc[['count', 'mean', 'std', 'min', 'max']])

print(f"\nüå¶Ô∏è Weather Features Summary (first location - Aklan):")
aklan_weather = [col for col in weather_cols if col.startswith('Aklan')]
print(df_merged[aklan_weather].describe().loc[['mean', 'std', 'min', 'max']])

print(f"\nüîç Search Trends Summary:")
print(df_merged[trends_cols].describe().loc[['mean', 'std', 'min', 'max']])

# Check year distribution
print(f"\nüìÜ Records per year:")
year_counts = df_merged.index.year.value_counts().sort_index()
for year, count in year_counts.items():
    print(f"   {year}: {count} weeks")

print(f"\n‚úÖ No missing values: {df_merged.isnull().sum().sum() == 0}")

MERGED DATASET SUMMARY

üìä Shape: (326, 52)
   Rows (weeks): 326
   Columns (features + targets): 52

üìÖ Date Range:
   Start: 2016-01-08
   End: 2022-10-01
   Duration: 2458 days (~326 weeks)

üìà Dengue Cases Summary (all locations):
       Aklan_Cases  Antique_Cases  Bacolodcity_Cases  Capiz_Cases  \
count   326.000000     326.000000         326.000000   326.000000   
mean     34.861963      36.506135          26.923313    35.104294   
std      63.104280     124.017585          35.794417    74.133486   
min       0.000000       0.000000           0.000000     0.000000   
max     473.000000    2036.000000         187.000000   583.000000   

       Guimaras_Cases  Iloilo_Cases  Iloilocity_Cases  Negrosocc_Cases  
count      326.000000    326.000000        326.000000       326.000000  
mean        10.582822    119.950920         22.950920       102.760736  
std         30.855358    303.342534         43.643634       116.917030  
min          0.000000      0.000000          0.00000

## 7. Save Merged Dataset

In [9]:
# Save merged dataset
output_file = PROCESSED_DIR / 'merged_training_data.csv'
df_merged.to_csv(output_file, date_format='%Y-%m-%d')

print(f"\n‚úÖ SAVED: {output_file}")
print(f"   Shape: {df_merged.shape}")
print(f"   Size: {output_file.stat().st_size / 1024:.1f} KB")


‚úÖ SAVED: c:\Users\miray\Desktop\dengue_forecasting_project\data\processed\merged_training_data.csv
   Shape: (326, 52)
   Size: 104.8 KB
