In [11]:
# Import Required Libraries
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Environment setup complete.")

Environment setup complete.


## 1. Define Paths and Load Raw Data

In [12]:
# Define project paths
BASE_DIR = Path(r'c:\Users\miray\Desktop\dengue_forecasting_project')
RAW_DENGUE_DIR = BASE_DIR / 'data' / 'raw' / 'dengue'
PROCESSED_DIR = BASE_DIR / 'data' / 'processed'

# Ensure processed directory exists
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print(f"Base Directory: {BASE_DIR}")
print(f"Raw Dengue Directory: {RAW_DENGUE_DIR}")
print(f"Processed Data Directory: {PROCESSED_DIR}")

Base Directory: c:\Users\miray\Desktop\dengue_forecasting_project
Raw Dengue Directory: c:\Users\miray\Desktop\dengue_forecasting_project\data\raw\dengue
Processed Data Directory: c:\Users\miray\Desktop\dengue_forecasting_project\data\processed


In [13]:
# Load Google Trends dataset
trends_file = RAW_DENGUE_DIR / 'dengue_search_trends.csv'
df = pd.read_csv(trends_file)

print(f"\nRaw Data Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head(10))
print(f"\nLast few rows:")
print(df.tail(10))
print(f"\nData types:")
print(df.dtypes)


Raw Data Shape: (84, 5)
Columns: ['Month', 'dengue: (Philippines)', 'dengue symptoms: (Philippines)', 'Dengue fever: (Philippines)', 'dengue fever: (Philippines)']

First few rows:
     Month  dengue: (Philippines)  dengue symptoms: (Philippines)  \
0  2016-01                     17                               5   
1  2016-02                     15                               4   
2  2016-03                     11                               3   
3  2016-04                      8                               2   
4  2016-05                      6                               2   
5  2016-06                     11                               3   
6  2016-07                     19                               6   
7  2016-08                     30                               9   
8  2016-09                     25                               8   
9  2016-10                     19                               5   

   Dengue fever: (Philippines)  dengue fever: (Philippines

## 2. Clean and Parse Data

In [14]:
# Clean column names (remove country suffix and special characters)
df.columns = df.columns.str.replace(': (Philippines)', '', regex=False)
df.columns = df.columns.str.replace(':', '', regex=False)
df.columns = df.columns.str.strip()

# Rename columns for clarity
column_mapping = {
    'Month': 'Month',
    'dengue': 'Dengue_Search',
    'dengue symptoms': 'Dengue_Symptoms_Search',
    'Dengue fever': 'Dengue_Fever_Search_1',
    'dengue fever': 'Dengue_Fever_Search_2'
}
df = df.rename(columns=column_mapping)

# Parse month to datetime (set to first day of month)
df['Date'] = pd.to_datetime(df['Month'] + '-01', format='%Y-%m-%d')
df = df.drop(columns=['Month'])

# Sort by date
df = df.sort_values('Date').reset_index(drop=True)

print("Data after cleaning:")
print(f"  Shape: {df.shape}")
print(f"  Date range: {df['Date'].min()} to {df['Date'].max()}")
print(f"  Columns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head(10))
print(f"\nSummary statistics:")
print(df.describe())

Data after cleaning:
  Shape: (84, 5)
  Date range: 2016-01-01 00:00:00 to 2022-12-01 00:00:00
  Columns: ['Dengue_Search', 'Dengue_Symptoms_Search', 'Dengue_Fever_Search_1', 'Dengue_Fever_Search_2', 'Date']

First few rows:
   Dengue_Search  Dengue_Symptoms_Search  Dengue_Fever_Search_1  \
0             17                       5                     17   
1             15                       4                     14   
2             11                       3                     11   
3              8                       2                      8   
4              6                       2                      6   
5             11                       3                     11   
6             19                       6                     19   
7             30                       9                     30   
8             25                       8                     26   
9             19                       5                     18   

   Dengue_Fever_Search_2       Date  

## 3. Interpolate Monthly Data to Weekly Frequency

In [15]:
# Set Date as index for resampling
df = df.set_index('Date')

# Resample to weekly frequency starting from the first date
# Use linear interpolation to fill weekly values between monthly data points
df_weekly = df.resample('W-FRI').interpolate(method='linear')

print("Weekly Interpolated Data:")
print(f"  Shape: {df_weekly.shape}")
print(f"  Date range: {df_weekly.index.min()} to {df_weekly.index.max()}")
print(f"  Total weeks: {len(df_weekly)}")
print(f"\nFirst 20 rows:")
print(df_weekly.head(20))
print(f"\nLast 20 rows:")
print(df_weekly.tail(20))

Weekly Interpolated Data:
  Shape: (362, 4)
  Date range: 2016-01-01 00:00:00 to 2022-12-02 00:00:00
  Total weeks: 362

First 20 rows:
            Dengue_Search  Dengue_Symptoms_Search  Dengue_Fever_Search_1  \
Date                                                                       
2016-01-01      17.000000                5.000000              17.000000   
2016-01-08      16.307692                4.769231              16.307692   
2016-01-15      15.615385                4.538462              15.615385   
2016-01-22      14.923077                4.307692              14.923077   
2016-01-29      14.230769                4.076923              14.230769   
2016-02-05      13.538462                3.846154              13.538462   
2016-02-12      12.846154                3.615385              12.846154   
2016-02-19      12.153846                3.384615              12.153846   
2016-02-26      11.461538                3.153846              11.461538   
2016-03-04      10.769231   

## 4. Align to Weather Week Starts (Jan 1, Jan 8, Jan 15, etc.)

In [16]:
# Load weather data to get exact week starts
weather_file = PROCESSED_DIR / 'weather_weekly_wide.csv'
weather_dates = pd.read_csv(weather_file, usecols=['Date'], parse_dates=['Date'])

print(f"Weather data has {len(weather_dates)} weeks")
print(f"Weather date range: {weather_dates['Date'].min()} to {weather_dates['Date'].max()}")

# Reset index to make Date a column
df_weekly = df_weekly.reset_index()

# Calculate weather-aligned week start for each trends date
df_weekly['year'] = df_weekly['Date'].dt.year
df_weekly['day_of_year'] = df_weekly['Date'].dt.dayofyear
df_weekly['weather_week_num'] = ((df_weekly['day_of_year'] - 1) // 7)
df_weekly['weather_week_start'] = pd.to_datetime(df_weekly['year'].astype(str) + '-01-01') + pd.to_timedelta(df_weekly['weather_week_num'] * 7, unit='d')

# Group by weather_week_start and take mean (in case multiple trend weeks map to same weather week)
search_cols = ['Dengue_Search', 'Dengue_Symptoms_Search', 'Dengue_Fever_Search_1', 'Dengue_Fever_Search_2']
df_aligned = df_weekly.groupby('weather_week_start')[search_cols].mean().reset_index()
df_aligned = df_aligned.rename(columns={'weather_week_start': 'Date'})

print(f"\nAligned to weather weeks:")
print(f"  Shape: {df_aligned.shape}")
print(f"  Date range: {df_aligned['Date'].min()} to {df_aligned['Date'].max()}")
print(f"\nFirst 20 rows:")
print(df_aligned.head(20))
print(f"\nLast 20 rows:")
print(df_aligned.tail(20))

Weather data has 526 weeks
Weather date range: 2016-01-01 00:00:00 to 2025-12-03 00:00:00

Aligned to weather weeks:
  Shape: (362, 5)
  Date range: 2016-01-01 00:00:00 to 2022-11-26 00:00:00

First 20 rows:
         Date  Dengue_Search  Dengue_Symptoms_Search  Dengue_Fever_Search_1  \
0  2016-01-01      17.000000                5.000000              17.000000   
1  2016-01-08      16.307692                4.769231              16.307692   
2  2016-01-15      15.615385                4.538462              15.615385   
3  2016-01-22      14.923077                4.307692              14.923077   
4  2016-01-29      14.230769                4.076923              14.230769   
5  2016-02-05      13.538462                3.846154              13.538462   
6  2016-02-12      12.846154                3.615385              12.846154   
7  2016-02-19      12.153846                3.384615              12.153846   
8  2016-02-26      11.461538                3.153846              11.461538   
9 

## 5. Check Data Quality and Coverage

In [17]:
# Check overlap with weather/dengue data
print("Data Quality Check:")
print(f"  Search trends weeks: {len(df_aligned)}")
print(f"  Weather data weeks: {len(weather_dates)}")
print(f"\nDate alignment:")
print(f"  Search trends: {df_aligned['Date'].min()} to {df_aligned['Date'].max()}")
print(f"  Weather data: {weather_dates['Date'].min()} to {weather_dates['Date'].max()}")

# Merge with weather dates to see overlap
merged = pd.merge(weather_dates, df_aligned, on='Date', how='outer', indicator=True)
print(f"\nOverlap analysis:")
print(f"  Weeks in both datasets: {(merged['_merge'] == 'both').sum()}")
print(f"  Weeks only in weather: {(merged['_merge'] == 'left_only').sum()}")
print(f"  Weeks only in trends: {(merged['_merge'] == 'right_only').sum()}")

# Keep only weeks that overlap with weather data
df_final = merged[merged['_merge'] == 'both'].drop(columns=['_merge'])
print(f"\nFinal dataset (overlapping weeks only):")
print(f"  Shape: {df_final.shape}")
print(f"  Date range: {df_final['Date'].min()} to {df_final['Date'].max()}")

Data Quality Check:
  Search trends weeks: 362
  Weather data weeks: 526

Date alignment:
  Search trends: 2016-01-01 00:00:00 to 2022-11-26 00:00:00
  Weather data: 2016-01-01 00:00:00 to 2025-12-03 00:00:00

Overlap analysis:
  Weeks in both datasets: 362
  Weeks only in weather: 164
  Weeks only in trends: 0

Final dataset (overlapping weeks only):
  Shape: (362, 5)
  Date range: 2016-01-01 00:00:00 to 2022-11-26 00:00:00

Overlap analysis:
  Weeks in both datasets: 362
  Weeks only in weather: 164
  Weeks only in trends: 0

Final dataset (overlapping weeks only):
  Shape: (362, 5)
  Date range: 2016-01-01 00:00:00 to 2022-11-26 00:00:00


## 6. Save Processed Data

In [18]:
# Set Date as index for consistency with other datasets
df_final = df_final.set_index('Date')

# Save weekly search trends
output_file = PROCESSED_DIR / 'search_trends_weekly_wide.csv'
df_final.to_csv(output_file, date_format='%Y-%m-%d')
print(f"✅ Saved: {output_file}")
print(f"   Shape: {df_final.shape}")
print(f"   Format: Wide (4 search term columns)")
print(f"   Date format: YYYY-MM-DD (matches weather/dengue data)")
print(f"   Columns: {df_final.columns.tolist()}")

✅ Saved: c:\Users\miray\Desktop\dengue_forecasting_project\data\processed\search_trends_weekly_wide.csv
   Shape: (362, 4)
   Format: Wide (4 search term columns)
   Date format: YYYY-MM-DD (matches weather/dengue data)
   Columns: ['Dengue_Search', 'Dengue_Symptoms_Search', 'Dengue_Fever_Search_1', 'Dengue_Fever_Search_2']
