In [1]:
# 00_data_preprocessing.ipynb
import pandas as pd
import numpy as np

# Load the raw weather summary for a first look at its structure.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in a single pass. With the default chunked parser, a column that mixes
# numbers and text (PRCP, for example, is cleaned of 'T' entries later in this
# notebook) triggers a DtypeWarning and can end up holding mixed Python types.
# The later cells already load the data this way, so this keeps them consistent.
df = pd.read_csv("../data/raw/summary_of_weather.csv", low_memory=False)

# Quick inspection: dimensions, column names, a sample of rows, dtypes.
print("Shape:", df.shape)
print("\nColumns:")
print(df.columns.tolist())

print("\nFirst 5 rows:")
print(df.head())

print("\nInfo:")
df.info()

# Share of missing values per column, as a percentage, largest first.
print("Missing values % (sorted):")
missing_percent = (df.isna().sum() / len(df)) * 100
print(missing_percent.sort_values(ascending=False).round(2))

Shape: (119040, 31)

Columns:
['STA', 'Date', 'Precip', 'WindGustSpd', 'MaxTemp', 'MinTemp', 'MeanTemp', 'Snowfall', 'PoorWeather', 'YR', 'MO', 'DA', 'PRCP', 'DR', 'SPD', 'MAX', 'MIN', 'MEA', 'SNF', 'SND', 'FT', 'FB', 'FTI', 'ITH', 'PGT', 'TSHDSBRSGF', 'SD3', 'RHX', 'RHN', 'RVG', 'WTE']

First 5 rows:
     STA      Date Precip  WindGustSpd    MaxTemp    MinTemp   MeanTemp  \
0  10001  1942-7-1  1.016          NaN  25.555556  22.222222  23.888889   
1  10001  1942-7-2      0          NaN  28.888889  21.666667  25.555556   
2  10001  1942-7-3   2.54          NaN  26.111111  22.222222  24.444444   
3  10001  1942-7-4   2.54          NaN  26.666667  22.222222  24.444444   
4  10001  1942-7-5      0          NaN  26.666667  21.666667  24.444444   

  Snowfall PoorWeather  YR  ...  FB  FTI ITH  PGT  TSHDSBRSGF  SD3  RHX  RHN  \
0      0.0         NaN  42  ... NaN  NaN NaN  NaN         NaN  NaN  NaN  NaN   
1      0.0         NaN  42  ... NaN  NaN NaN  NaN         NaN  NaN  NaN  NaN   
2     

  df = pd.read_csv("../data/raw/summary_of_weather.csv")


In [16]:
# Rebuild the processed dataset from the raw CSV.
# low_memory=False is passed so the file is parsed in one pass (no chunking).
df = pd.read_csv("../data/raw/summary_of_weather.csv", low_memory=False)

# Parse Date into real datetimes, then derive the calendar month from it.
df = df.assign(Date=pd.to_datetime(df['Date']))
df = df.assign(Month=df['Date'].dt.month)

# Columns that are not carried into the processed file.
unused_columns = [
    'MO', 'SD3', 'RHN', 'RHX', 'RVG', 'FT', 'FTI', 'WTE',
    'ITH', 'FB', 'PGT', 'WindGustSpd', 'SPD', 'DR', 'SND',
]
df = df.drop(columns=unused_columns)

# PRCP clean-up in one expression: 'T' entries become 0.01, anything that still
# cannot be parsed as a number becomes NaN, and NaN is finally replaced by 0.
df['PRCP'] = pd.to_numeric(df['PRCP'].replace('T', '0.01'), errors='coerce').fillna(0)

# Where a °F column has a gap, fill it from the matching °C column (C * 1.8 + 32).
for fahrenheit_col, celsius_col in (('MAX', 'MaxTemp'), ('MIN', 'MinTemp'), ('MEA', 'MeanTemp')):
    df[fahrenheit_col] = df[fahrenheit_col].fillna((df[celsius_col] * 1.8) + 32)

# Persist the result for the downstream cells.
df.to_csv("../data/processed/summary_of_weather_processed.csv", index=False)
print("Data processed and saved to ../data/processed/summary_of_weather_processed.csv")


Data processed and saved to ../data/processed/summary_of_weather_processed.csv


In [18]:
# ==================== 1. Same-day cleaned dataset ====================

# Reload the processed file from disk so this cell does not depend on the
# in-memory frame left behind by earlier cells.
df = pd.read_csv("../data/processed/summary_of_weather_processed.csv", low_memory=False)

# Columns kept for the same-day dataset; .copy() detaches the slice from df.
same_day_columns = ['STA', 'MaxTemp', 'MinTemp', 'MeanTemp', 'PRCP', 'Month']
same_day = df.loc[:, same_day_columns].copy()

# Write the reduced dataset out.
same_day.to_csv("../data/processed/same_day_maxtemp.csv", index=False)

# Quick inspection: size, percentage of missing values per column, sample rows.
missing_share = same_day.isna().mean() * 100
print("Final shape:", same_day.shape)
print("\nMissing values %:")
print(missing_share.round(2))
print("\nFirst 5 rows of our clean data:")
print(same_day.head())

Final shape: (119040, 6)

Missing values %:
STA         0.0
MaxTemp     0.0
MinTemp     0.0
MeanTemp    0.0
PRCP        0.0
Month       0.0
dtype: float64

First 5 rows of our clean data:
     STA    MaxTemp    MinTemp   MeanTemp  PRCP  Month
0  10001  25.555556  22.222222  23.888889  0.04      7
1  10001  28.888889  21.666667  25.555556  0.00      7
2  10001  26.111111  22.222222  24.444444  0.10      7
3  10001  26.666667  22.222222  24.444444  0.10      7
4  10001  26.666667  21.666667  24.444444  0.00      7


In [None]:
# ==================== 2. Next-day forecasting dataset ====================
# NOTE: this whole cell is currently disabled. Everything between the triple
# quotes below is a single string literal, so none of it executes.
#
# What the disabled code would do: reload the processed CSV, sort by station
# and date, add tomorrow's MaxTemp as the target and yesterday's values as lag
# features (both shifted within each STA group), add calendar features, drop
# rows lacking a target or a lag, and save the result as parquet.
#
# NOTE(review) before re-enabling:
#   - The CSV is read without parse_dates, so 'Date' will presumably come back
#     as plain strings and the .dt accessors below would fail; confirm and
#     convert the column to datetime first.
#   - The output path starts at "data/processed/" while the other cells use
#     "../data/processed/"; confirm which directory is intended.
#   - shift(-1) / shift(1) pair adjacent rows per station; this only means
#     "tomorrow"/"yesterday" if each station has no missing days. TODO confirm.
"""
# Read processed data
df = pd.read_csv("../data/processed/summary_of_weather_processed.csv")

# Keep only useful columns
next_day = df[['Date', 'STA', 'MaxTemp', 'MinTemp', 'MeanTemp', 'PRCP']].copy()

next_day = next_day.sort_values(['STA', 'Date'])

# Target = tomorrow
next_day['MaxTemp_tomorrow'] = next_day.groupby('STA')['MaxTemp'].shift(-1)
next_day = next_day.dropna(subset=['MaxTemp_tomorrow'])

# Lag features (yesterday)
for col in ['MaxTemp', 'MinTemp', 'MeanTemp', 'PRCP']:
    next_day[f'{col}_yesterday'] = next_day.groupby('STA')[col].shift(1)

# Calendar features
next_day['Month'] = next_day['Date'].dt.month
next_day['DayOfYear'] = next_day['Date'].dt.dayofyear
next_day['IsWeekend'] = next_day['Date'].dt.weekday >= 5

# Drop rows without yesterday
next_day = next_day.dropna(subset=['MaxTemp_yesterday'])

next_day.to_parquet("data/processed/02_next_day_features.parquet", index=False)
print("Saved: 02_next_day_features.parquet")
print(f"Final forecasting dataset: {next_day.shape[0]:,} rows")
"""