Column inspection

In [1]:
# 00_data_preprocessing.ipynb
import pandas as pd
import numpy as np

# Load raw data
df = pd.read_csv("../data/raw/summary_of_weather.csv")

# Quick inspection
print("Shape:", df.shape)
print("\nColumns:")
print(df.columns.tolist())

print("\nFirst 5 rows:")
print(df.head())

print("\nInfo:")
df.info()

print("Missing values % (sorted):")
missing_percent = (df.isna().sum() / len(df)) * 100
print(missing_percent.sort_values(ascending=False).round(2))

missing_count = (df.isna().sum())
print(missing_count.sort_values(ascending=False))

Shape: (119040, 31)

Columns:
['STA', 'Date', 'Precip', 'WindGustSpd', 'MaxTemp', 'MinTemp', 'MeanTemp', 'Snowfall', 'PoorWeather', 'YR', 'MO', 'DA', 'PRCP', 'DR', 'SPD', 'MAX', 'MIN', 'MEA', 'SNF', 'SND', 'FT', 'FB', 'FTI', 'ITH', 'PGT', 'TSHDSBRSGF', 'SD3', 'RHX', 'RHN', 'RVG', 'WTE']

First 5 rows:
     STA      Date Precip  WindGustSpd    MaxTemp    MinTemp   MeanTemp  \
0  10001  1942-7-1  1.016          NaN  25.555556  22.222222  23.888889   
1  10001  1942-7-2      0          NaN  28.888889  21.666667  25.555556   
2  10001  1942-7-3   2.54          NaN  26.111111  22.222222  24.444444   
3  10001  1942-7-4   2.54          NaN  26.666667  22.222222  24.444444   
4  10001  1942-7-5      0          NaN  26.666667  21.666667  24.444444   

  Snowfall PoorWeather  YR  ...  FB  FTI ITH  PGT  TSHDSBRSGF  SD3  RHX  RHN  \
0      0.0         NaN  42  ... NaN  NaN NaN  NaN         NaN  NaN  NaN  NaN   
1      0.0         NaN  42  ... NaN  NaN NaN  NaN         NaN  NaN  NaN  NaN   
2     

  df = pd.read_csv("../data/raw/summary_of_weather.csv")


Dropping useless columns


In [2]:
# Loading data with no low_memory option (no chunking)
df = pd.read_csv("../data/raw/summary_of_weather.csv", low_memory=False)

# Convert Date to proper datetime
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

# Drop useless columns
df = df.drop(columns=['MO', 'SD3', 'RHN', 'RHX', 'RVG', 'FT', 'FTI', 'WTE', 'ITH', 'FB', 'PGT','TSHDSBRSGF', 'SNF', 'WindGustSpd', 'SPD', 'DR', 'SND', 'MAX', 'MIN', 'MEA', 'PRCP'])

Extracting temporal features from the date column


In [3]:
df['month'] = df['Date'].dt.month
df['dayofyear'] = df['Date'].dt.dayofyear
df['is_winter'] = df['month'].isin([12, 1, 2]).astype(int)
df['is_summer'] = df['month'].isin([6, 7, 8]).astype(int)

Station encoding

In [4]:
# Target encoding: station mean MaxTemp
station_means = df.groupby('STA')['MaxTemp'].mean()
df['station_mean_maxtemp'] = df['STA'].map(station_means)

# Station count (proxy for data reliability)
station_counts = df.groupby('STA').size()
df['station_count'] = df['STA'].map(station_counts)

print(f"   Stations in data: {df['STA'].nunique()}")

   Stations in data: 159


Cleaning Precip

In [5]:
df['Precip'] = df['Precip'].replace({'T': 0.001})
df['Precip'] = pd.to_numeric(df['Precip'], errors='coerce')
df['Precip'] = df['Precip'].fillna(0)

Saving the preprocessed data to a CSV file for future use.


In [6]:
df.to_csv("../data/processed/summary_of_weather_processed.csv", index=False)
print("Data processed and saved to ../data/processed/summary_of_weather_processed.csv")

Data processed and saved to ../data/processed/summary_of_weather_processed.csv
