In [13]:
from pathlib import Path
print("Current folder:", Path.cwd())
print("\nFiles in ../data/raw:")
print(list(Path("../data/raw").glob("*")))
print("\nFiles in ../data/processed:")
print(list(Path("../data/processed").glob("*")))

Current folder: C:\Root\Tools\Microsoft VS Code

Files in ../data/raw:
[]

Files in ../data/processed:
[]


In [None]:
# 00_data_preprocessing.ipynb
import pandas as pd
import numpy as np
from pathlib import Path

# Load raw data
df = pd.read_csv("../data/raw/summary_of_weather.csv")

# Quick inspection
print("Shape:", df.shape)
print("\nColumns:")
print(df.columns.tolist())

print("\nFirst 5 rows:")
print(df.head())

print("\nInfo:")
df.info()

print("Missing values % (sorted):")
missing_percent = (df.isna().sum() / len(df)) * 100
print(missing_percent.sort_values(ascending=False).round(2))

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/summary_of_weather.csv'

In [None]:
# ==================== 1. Same-day cleaned dataset ====================

# Loading data with no low_memory option (no chunking)
df = pd.read_csv("../data/raw/Summary of Weather.csv", low_memory=False)

# Keep only useful columns
same_day = df[['Date', 'STA', 'MaxTemp', 'MinTemp', 'MeanTemp', 
               'MAX', 'MIN', 'MEA', 'PRCP', 'MO']].copy()

# Convert Date to proper datetime and extract Month 
same_day['Month'] = same_day['Date'].dt.month
same_day = same_day.drop(columns=['MO'])

# Clean PRCP
same_day['PRCP'] = same_day['PRCP'].replace('T', '0.01')
same_day['PRCP'] = pd.to_numeric(same_day['PRCP'], errors='coerce')

# Drop rows missing target
same_day = same_day.dropna(subset=['MaxTemp', 'MinTemp', 'MeanTemp'])

# Quick inspection â€” YOU run these 3 lines and paste the output
print("Final shape:", same_day.shape)
print("\nMissing values %:")
print((same_day.isna().mean() * 100).round(2))
print("\nFirst 5 rows of our clean data:")
print(same_day.head())

same_day.to_parquet("../data/processed/01_cleaned_same_day.parquet", index=False)
print("Saved: 01_cleaned_same_day.parquet")

In [None]:
# ==================== 2. Next-day forecasting dataset ====================
next_day = df[['Date', 'STA', 'MaxTemp', 'MinTemp', 'MeanTemp', 'PRCP']].copy()

# Clean PRCP
next_day['PRCP'] = next_day['PRCP'].replace('T', '0.01')
next_day['PRCP'] = pd.to_numeric(next_day['PRCP'], errors='coerce')
next_day['PRCP'] = next_day['PRCP'].fillna(0)

next_day = next_day.sort_values(['STA', 'Date'])

# Target = tomorrow
next_day['MaxTemp_tomorrow'] = next_day.groupby('STA')['MaxTemp'].shift(-1)
next_day = next_day.dropna(subset=['MaxTemp_tomorrow'])

# Lag features (yesterday)
for col in ['MaxTemp', 'MinTemp', 'MeanTemp', 'PRCP']:
    next_day[f'{col}_yesterday'] = next_day.groupby('STA')[col].shift(1)

# Calendar features
next_day['Month'] = next_day['Date'].dt.month
next_day['DayOfYear'] = next_day['Date'].dt.dayofyear
next_day['IsWeekend'] = next_day['Date'].dt.weekday >= 5

# Drop rows without yesterday
next_day = next_day.dropna(subset=['MaxTemp_yesterday'])

next_day.to_parquet("data/processed/02_next_day_features.parquet", index=False)
print("Saved: 02_next_day_features.parquet")
print(f"Final forecasting dataset: {next_day.shape[0]:,} rows")