In [21]:
# Import Required Libraries
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Environment setup complete.")

Environment setup complete.


## 1. Define Paths and Load Raw Data

In [22]:
# Define project paths.
# The project root can be overridden through the DENGUE_PROJECT_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset or empty, the original local path is used.
DEFAULT_BASE_DIR = r'c:\Users\miray\Desktop\dengue_forecasting_project'
BASE_DIR = Path(os.environ.get('DENGUE_PROJECT_DIR') or DEFAULT_BASE_DIR)

# Raw dengue input and processed output locations, both relative to the root
RAW_DENGUE_DIR = BASE_DIR / 'data' / 'raw' / 'dengue'
PROCESSED_DIR = BASE_DIR / 'data' / 'processed'

# Ensure processed directory exists (no error if it is already there)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print(f"Base Directory: {BASE_DIR}")
print(f"Raw Dengue Directory: {RAW_DENGUE_DIR}")
print(f"Processed Data Directory: {PROCESSED_DIR}")

Base Directory: c:\Users\miray\Desktop\dengue_forecasting_project
Raw Dengue Directory: c:\Users\miray\Desktop\dengue_forecasting_project\data\raw\dengue
Processed Data Directory: c:\Users\miray\Desktop\dengue_forecasting_project\data\processed


In [23]:
# Read the raw dengue CSV from the raw-data directory into a DataFrame
dengue_file = RAW_DENGUE_DIR / 'dengue_dataset.csv'
df = pd.read_csv(dengue_file)

# Quick inspection of what was loaded: dimensions, column names, sample rows, dtypes
print(f"\nRaw Data Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:", df.head(10), sep="\n")
print("\nData types:", df.dtypes, sep="\n")


Raw Data Shape: (2624, 6)
Columns: ['loc', 'cases', 'deaths', 'date', 'region', 'year']

First few rows:
                 loc  cases  deaths        date                     region  \
0              AKLAN     50     0.0  10/01/2016  REGION VI-WESTERN VISAYAS   
1            ANTIQUE     17     0.0  10/01/2016  REGION VI-WESTERN VISAYAS   
2       BACOLOD CITY     13     0.0  10/01/2016  REGION VI-WESTERN VISAYAS   
3              CAPIZ     43     0.0  10/01/2016  REGION VI-WESTERN VISAYAS   
4           GUIMARAS      1     0.0  10/01/2016  REGION VI-WESTERN VISAYAS   
5             ILOILO     41     1.0  10/01/2016  REGION VI-WESTERN VISAYAS   
6        ILOILO CITY     19     0.0  10/01/2016  REGION VI-WESTERN VISAYAS   
7  NEGROS OCCIDENTAL     99     0.0  10/01/2016  REGION VI-WESTERN VISAYAS   
8              AKLAN     32     0.0  17/01/2016  REGION VI-WESTERN VISAYAS   
9            ANTIQUE     13     0.0  17/01/2016  REGION VI-WESTERN VISAYAS   

   year  
0  2016  
1  2016  
2  20

## 2. Clean and Normalize Data

In [24]:
# Raw lowercase column names -> capitalized names used by the rest of the notebook
column_names = {
    'loc': 'Location',
    'cases': 'Cases',
    'deaths': 'Deaths',
    'date': 'Date',
    'region': 'Region',
    'year': 'Year',
}

df = (
    df.rename(columns=column_names)
    .assign(
        # Dates are parsed with an explicit day-first format (dd/mm/yyyy)
        Date=lambda frame: pd.to_datetime(frame['Date'], format='%d/%m/%Y'),
        # Trim surrounding whitespace and title-case the location names
        Location=lambda frame: frame['Location'].str.strip().str.title(),
    )
    # Chronological order, ties broken by location name
    .sort_values(['Date', 'Location'])
    .reset_index(drop=True)
)

earliest_date = df['Date'].min()
latest_date = df['Date'].max()

print("Data after cleaning:")
print(f"  Shape: {df.shape}")
print(f"  Date range: {earliest_date} to {latest_date}")
print(f"  Unique locations: {df['Location'].unique()}")
print("\nFirst few rows:", df.head(10), sep="\n")

Data after cleaning:
  Shape: (2624, 6)
  Date range: 2016-01-10 00:00:00 to 2022-12-25 00:00:00
  Unique locations: ['Aklan' 'Antique' 'Bacolod City' 'Capiz' 'Guimaras' 'Iloilo'
 'Iloilo City' 'Negros Occidental']

First few rows:
            Location  Cases  Deaths       Date                     Region  \
0              Aklan     50     0.0 2016-01-10  REGION VI-WESTERN VISAYAS   
1            Antique     17     0.0 2016-01-10  REGION VI-WESTERN VISAYAS   
2       Bacolod City     13     0.0 2016-01-10  REGION VI-WESTERN VISAYAS   
3              Capiz     43     0.0 2016-01-10  REGION VI-WESTERN VISAYAS   
4           Guimaras      1     0.0 2016-01-10  REGION VI-WESTERN VISAYAS   
5             Iloilo     41     1.0 2016-01-10  REGION VI-WESTERN VISAYAS   
6        Iloilo City     19     0.0 2016-01-10  REGION VI-WESTERN VISAYAS   
7  Negros Occidental     99     0.0 2016-01-10  REGION VI-WESTERN VISAYAS   
8              Aklan     32     0.0 2016-01-17  REGION VI-WESTERN VISAYAS  

## 3. Align Dengue Weeks to Weather Weeks

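- Dengue data uses MMWR/epidemiological weeks (Sunday start: Jan 10, Jan 17, Jan 24, ...).
- Weather data uses 7-day periods counted from Jan 1 of each year (Jan 1, Jan 8, Jan 15, Jan 22, ...). These start on a Friday in 2016, but the weekday changes from year to year because the count restarts every Jan 1.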

We'll map each dengue date to the start of the weather week that contains it, i.e. the **most recent** weather week start on or before the dengue date.
This is standard practice in epidemiological research when merging weekly data from different sources.

In [25]:
# Align each dengue report date to the weather week that contains it.
#
# Weather weeks are consecutive 7-day bins counted from Jan 1 of the report
# date's own year, so the bin holding a given date starts
# (day_of_year - 1) % 7 days before that date. Subtracting this offset is the
# same as Jan 1 + floor((day_of_year - 1) / 7) * 7, but needs no date-string
# construction and no temporary columns on df.
day_offset = ((df['Date'].dt.dayofyear - 1) % 7).astype('int64').rename('day_offset')
df['weather_week_start'] = df['Date'] - pd.to_timedelta(day_offset, unit='D')

# NOTE(review): because the bins restart every Jan 1, the last bin of a year is
# only 1-2 days long (Dec 31, or Dec 30-31 in leap years) — confirm the weather
# dataset is binned the same way before merging on this column.

print("Date alignment mapping:")
print("Dengue Date → Weather Week Start")
# One example row per distinct dengue date, first 20 dates only
sample = df[['Date', 'weather_week_start', 'Location', 'Cases']].drop_duplicates(subset=['Date']).head(20)
for dengue_date, week_start in zip(sample['Date'], sample['weather_week_start']):
    print(f"  {dengue_date.strftime('%Y-%m-%d (%A)')} → {week_start.strftime('%Y-%m-%d (%A)')}")

# Check the offset: how many days each dengue date falls after its weather week start
print(f"\nDay offset statistics (dengue date - weather week start):")
print(day_offset.value_counts().sort_index())
print(f"Most common offset: {day_offset.mode()[0]} days")

Date alignment mapping:
Dengue Date → Weather Week Start
  2016-01-10 (Sunday) → 2016-01-08 (Friday)
  2016-01-17 (Sunday) → 2016-01-15 (Friday)
  2016-01-24 (Sunday) → 2016-01-22 (Friday)
  2016-01-31 (Sunday) → 2016-01-29 (Friday)
  2016-02-07 (Sunday) → 2016-02-05 (Friday)
  2016-02-14 (Sunday) → 2016-02-12 (Friday)
  2016-02-21 (Sunday) → 2016-02-19 (Friday)
  2016-02-28 (Sunday) → 2016-02-26 (Friday)
  2016-03-06 (Sunday) → 2016-03-04 (Friday)
  2016-03-13 (Sunday) → 2016-03-11 (Friday)
  2016-03-20 (Sunday) → 2016-03-18 (Friday)
  2016-03-27 (Sunday) → 2016-03-25 (Friday)
  2016-04-03 (Sunday) → 2016-04-01 (Friday)
  2016-04-10 (Sunday) → 2016-04-08 (Friday)
  2016-04-17 (Sunday) → 2016-04-15 (Friday)
  2016-04-24 (Sunday) → 2016-04-22 (Friday)
  2016-05-01 (Sunday) → 2016-04-29 (Friday)
  2016-05-08 (Sunday) → 2016-05-06 (Friday)
  2016-05-15 (Sunday) → 2016-05-13 (Friday)
  2016-05-22 (Sunday) → 2016-05-20 (Friday)

Day offset statistics (dengue date - weather week start):
day_

## 4. Aggregate to Weather-Aligned Weekly Format

In [26]:
# Collapse to one row per (weather week, location) by totalling cases and deaths,
# which puts the dengue counts on the weather-week calendar.
# NOTE(review): sum() skips NaN, so a missing Deaths value adds 0 to its total
# and will not show up as missing afterwards — confirm that is intended.
df_weekly = (
    df.groupby(['weather_week_start', 'Location'], as_index=False)
    .agg(Cases=('Cases', 'sum'), Deaths=('Deaths', 'sum'))
    # The week start is stored under the name "Date" for consistency
    .rename(columns={'weather_week_start': 'Date'})
    .sort_values(['Date', 'Location'])
    .reset_index(drop=True)
)

week_dates = df_weekly['Date']

print("Weather-Aligned Weekly Data (Long Format):")
print(f"  Shape: {df_weekly.shape}")
print(f"  Date range: {week_dates.min()} to {week_dates.max()}")
print(f"  Total weeks: {week_dates.nunique()}")
print(f"  Locations: {sorted(df_weekly['Location'].unique())}")
print("\nSummary statistics:", df_weekly.groupby('Location')['Cases'].describe(), sep="\n")
print("\nFirst 20 rows:", df_weekly.head(20), sep="\n")
print("\nLast 20 rows:", df_weekly.tail(20), sep="\n")

Weather-Aligned Weekly Data (Long Format):
  Shape: (2624, 4)
  Date range: 2016-01-08 00:00:00 to 2022-12-24 00:00:00
  Total weeks: 328
  Locations: ['Aklan', 'Antique', 'Bacolod City', 'Capiz', 'Guimaras', 'Iloilo', 'Iloilo City', 'Negros Occidental']

Summary statistics:
                   count        mean         std  min   25%   50%     75%  \
Location                                                                    
Aklan              328.0   34.692073   62.950879  0.0   6.0  18.5   40.00   
Antique            328.0   36.314024  123.662170  0.0   5.0  12.5   29.25   
Bacolod City       328.0   26.807927   35.720753  0.0   5.0  12.0   35.00   
Capiz              328.0   34.929878   73.940082  0.0   4.0  12.5   33.00   
Guimaras           328.0   10.524390   30.769927  0.0   0.0   2.0    7.00   
Iloilo             328.0  119.259146  302.542857  0.0   9.0  27.5   79.00   
Iloilo City        328.0   22.817073   43.543680  0.0   3.0   9.0   20.00   
Negros Occidental  328.0  102.3

## 5. Data Quality Check

In [27]:
# Work on an independent copy of the weekly table. Weeks that are absent from
# the surveillance data are deliberately NOT inserted as zero-case rows.
df_complete = df_weekly.copy()

report_dates = df_complete['Date']
case_counts = df_complete['Cases']

print(f"Final dataset (actual surveillance weeks only):")
print(f"  Shape: {df_complete.shape}")
print(f"  Date range: {report_dates.min()} to {report_dates.max()}")
print(f"  Total unique weeks: {report_dates.nunique()}")
print(f"  Locations: {sorted(df_complete['Location'].unique())}")
print(f"\nData quality check:")
print(f"  Zero cases count: {case_counts.eq(0).sum()}")
print(f"  Non-zero cases count: {case_counts.gt(0).sum()}")
print(f"  Missing values: {df_complete.isna().sum().sum()}")

# Number of (week, location) records per calendar year, in year order
print(f"\nRecords per year:")
year_counts = report_dates.dt.year.value_counts().sort_index()
for calendar_year, n_records in year_counts.items():
    print(f"  {calendar_year}: {n_records} records")

Final dataset (actual surveillance weeks only):
  Shape: (2624, 4)
  Date range: 2016-01-08 00:00:00 to 2022-12-24 00:00:00
  Total unique weeks: 328
  Locations: ['Aklan', 'Antique', 'Bacolod City', 'Capiz', 'Guimaras', 'Iloilo', 'Iloilo City', 'Negros Occidental']

Data quality check:
  Zero cases count: 232
  Non-zero cases count: 2392
  Missing values: 0

Records per year:
  2016: 408 records
  2017: 416 records
  2018: 416 records
  2019: 408 records
  2020: 416 records
  2021: 336 records
  2022: 224 records


## 6. Pivot to Wide Format (Multi-Output LSTM Target)

In [28]:
# Reshape Cases so each row is a week and each column is one location
cases_by_location = df_complete.pivot(index='Date', columns='Location', values='Cases')

# Put the location columns in alphabetical order, then label each with a
# "_Cases" suffix so the target columns are self-describing
ordered_locations = sorted(cases_by_location.columns)
dengue_cases_wide = cases_by_location.loc[:, ordered_locations].set_axis(
    [f"{location}_Cases" for location in ordered_locations], axis=1
)

first_week = dengue_cases_wide.index.min()
last_week = dengue_cases_wide.index.max()

print("Wide Format Dengue Cases (Multi-Output LSTM Targets):")
print(f"  Shape: {dengue_cases_wide.shape}")
print(f"  Date range: {first_week} to {last_week}")
print(f"  Columns (8 locations): {list(dengue_cases_wide.columns)}")
print("\nSummary statistics:", dengue_cases_wide.describe(), sep="\n")
print("\nFirst few rows:", dengue_cases_wide.head(10), sep="\n")
print("\nLast few rows:", dengue_cases_wide.tail(10), sep="\n")

Wide Format Dengue Cases (Multi-Output LSTM Targets):
  Shape: (328, 8)
  Date range: 2016-01-08 00:00:00 to 2022-12-24 00:00:00
  Columns (8 locations): ['Aklan_Cases', 'Antique_Cases', 'Bacolod City_Cases', 'Capiz_Cases', 'Guimaras_Cases', 'Iloilo_Cases', 'Iloilo City_Cases', 'Negros Occidental_Cases']

Summary statistics:
       Aklan_Cases  Antique_Cases  Bacolod City_Cases  Capiz_Cases  \
count   328.000000     328.000000          328.000000   328.000000   
mean     34.692073      36.314024           26.807927    34.929878   
std      62.950879     123.662170           35.720753    73.940082   
min       0.000000       0.000000            0.000000     0.000000   
25%       6.000000       5.000000            5.000000     4.000000   
50%      18.500000      12.500000           12.000000    12.500000   
75%      40.000000      29.250000           35.000000    33.000000   
max     473.000000    2036.000000          187.000000   583.000000   

       Guimaras_Cases  Iloilo_Cases  Iloil

## 7. Optional: Pivot Deaths to Wide Format

In [29]:
# Reshape Deaths so each row is a week and each column is one location,
# with columns in alphabetical order and a "_Deaths" suffix on each name
deaths_by_location = df_complete.pivot(index='Date', columns='Location', values='Deaths')
death_locations = sorted(deaths_by_location.columns)
dengue_deaths_wide = deaths_by_location.loc[:, death_locations].set_axis(
    [f"{location}_Deaths" for location in death_locations], axis=1
)

# Grand total over every location and every week
total_deaths = dengue_deaths_wide.sum().sum()

print("Wide Format Dengue Deaths:")
print(f"  Shape: {dengue_deaths_wide.shape}")
print(f"  Total deaths (all locations, all time): {total_deaths:.0f}")
print("\nFirst few rows:", dengue_deaths_wide.head(10), sep="\n")

Wide Format Dengue Deaths:
  Shape: (328, 8)
  Total deaths (all locations, all time): 1912

First few rows:
            Aklan_Deaths  Antique_Deaths  Bacolod City_Deaths  Capiz_Deaths  \
Date                                                                          
2016-01-08           0.0             0.0                  0.0           0.0   
2016-01-15           0.0             0.0                  0.0           0.0   
2016-01-22           0.0             0.0                  0.0           0.0   
2016-01-29           0.0             0.0                  0.0           0.0   
2016-02-05           0.0             0.0                  0.0           0.0   
2016-02-12           0.0             0.0                  0.0           0.0   
2016-02-19           0.0             0.0                  0.0           0.0   
2016-02-26           0.0             0.0                  0.0           0.0   
2016-03-04           0.0             0.0                  0.0           0.0   
2016-03-11           0

## 8. Save Processed Data

In [31]:
# Destination files inside the processed-data directory
output_long = PROCESSED_DIR / 'dengue_weekly_long.csv'            # long format (EDA and visualizations)
output_wide_cases = PROCESSED_DIR / 'dengue_weekly_wide.csv'      # wide Cases (LSTM target variables)
output_wide_deaths = PROCESSED_DIR / 'dengue_deaths_weekly_wide.csv'  # optional wide Deaths

# One entry per export:
# (frame, destination, write the index column?, text printed before the report, description lines)
# The long table has a plain row index, so it is written without it; the wide
# tables keep their Date index as the first CSV column.
exports = [
    (df_complete, output_long, False, "",
     ["   Format: Long (Date, Location, Cases, Deaths)"]),
    (dengue_cases_wide, output_wide_cases, True, "\n",
     ["   Format: Wide (8 location columns)",
      "   Purpose: Multi-Output LSTM target (Y)"]),
    (dengue_deaths_wide, output_wide_deaths, True, "\n",
     ["   Format: Wide (8 location columns for deaths)"]),
]

for frame, destination, write_index, lead, notes in exports:
    frame.to_csv(destination, index=write_index)
    print(f"{lead}✅ Saved: {destination}")
    print(f"   Shape: {frame.shape}")
    for note in notes:
        print(note)

✅ Saved: c:\Users\miray\Desktop\dengue_forecasting_project\data\processed\dengue_weekly_long.csv
   Shape: (2624, 4)
   Format: Long (Date, Location, Cases, Deaths)

✅ Saved: c:\Users\miray\Desktop\dengue_forecasting_project\data\processed\dengue_weekly_wide.csv
   Shape: (328, 8)
   Format: Wide (8 location columns)
   Purpose: Multi-Output LSTM target (Y)

✅ Saved: c:\Users\miray\Desktop\dengue_forecasting_project\data\processed\dengue_deaths_weekly_wide.csv
   Shape: (328, 8)
   Format: Wide (8 location columns for deaths)
