In [1]:
from google.colab import drive
# Mount Google Drive at /content/MyDrive. The dataset path used by the
# preprocessing cell is rooted under this mount point
# (/content/MyDrive/MyDrive/...), so this cell has to run first.
drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


### 1. DATA PREPROCESSING ###
# Location of the daily CSV on the mounted Drive.
file_path = "/content/MyDrive/MyDrive/MScThesisAttenNEATS/data_daily.csv"

# Load, coerce the target column to numeric (unparseable entries become NaN),
# then discard rows without a usable target and renumber the index from 0.
data = (
    pd.read_csv(file_path)
    .assign(**{
        "Avg Air Temp (F)": lambda frame: pd.to_numeric(
            frame["Avg Air Temp (F)"], errors="coerce"
        )
    })
    .dropna(subset=["Avg Air Temp (F)"])
    .reset_index(drop=True)
)

# Date features
# Parse the "Date" column using the explicit month/day/year format, then derive
# calendar features from the parsed timestamps.
data["Date"] = pd.to_datetime(data["Date"], format="%m/%d/%Y")
data["year"]       = data["Date"].dt.year
data["month"]      = data["Date"].dt.month
data["day"]        = data["Date"].dt.day
data["dayofweek"]  = data["Date"].dt.dayofweek
data["dayofyear"]  = data["Date"].dt.dayofyear
# isocalendar().week is returned as pandas' nullable UInt32 extension dtype,
# unlike the numpy-backed columns above. Cast it to float64 so the feature
# matrix stays homogeneous when converted to a numpy array; any missing week
# becomes NaN instead of <NA>.
data["weekofyear"] = data["Date"].dt.isocalendar().week.astype("float64")

# Columns fed to the model: history-based features plus calendar features.
feature_cols = ["lag1", "lag2", "rolling_mean_3", "rolling_mean_7",
                "year", "month", "day", "dayofweek", "dayofyear", "weekofyear"]

# Lag and rolling
# shift() and rolling() work on row position, so make sure rows are in
# chronological order before building history features. The stable sort keeps
# the existing order for rows sharing the same date.
data = data.sort_values("Date", kind="stable").reset_index(drop=True)

data["lag1"] = data["Avg Air Temp (F)"].shift(1)
data["lag2"] = data["Avg Air Temp (F)"].shift(2)
# Shift by one row BEFORE rolling so each window ends at the previous row.
# Without the shift the window contains the current row's target, which makes
# the target exactly recoverable from the features
# (target = 3 * rolling_mean_3 - lag1 - lag2).
data["rolling_mean_3"] = data["Avg Air Temp (F)"].shift(1).rolling(3).mean()
data["rolling_mean_7"] = data["Avg Air Temp (F)"].shift(1).rolling(7).mean()

# Drop the warm-up rows whose lag/rolling values are NaN.
# NOTE(review): dropna() without `subset` also removes rows that have a NaN in
# any other column of the CSV - confirm that this is intended.
data = data.dropna().reset_index(drop=True)

# Train-test split
# Split BEFORE scaling, and split chronologically (shuffle=False): the first
# 80% of rows form the training set and the last 20% the test set. A shuffled
# split would place rows adjacent to test rows (which share lag/rolling
# history with them) into the training set.
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    data[feature_cols], data[["Avg Air Temp (F)"]], test_size=0.2, shuffle=False
)

# Normalize
# Fit both scalers on the training rows only, so that test-set statistics
# never influence the transformation applied to the training data.
temp_scaler = StandardScaler()
temp_scaler.fit(y_train_df)
feature_scaler = StandardScaler()
feature_scaler.fit(X_train_df)

X_train = feature_scaler.transform(X_train_df)
X_test = feature_scaler.transform(X_test_df)
# ravel() turns the (n, 1) scaler output back into 1-D target vectors.
y_train = temp_scaler.transform(y_train_df).ravel()
y_test = temp_scaler.transform(y_test_df).ravel()

# Keep `data`, `X` and `y` holding scaled values, as before, but scaled with
# the train-fitted statistics so they agree with X_train / X_test.
data["Avg Air Temp (F)"] = temp_scaler.transform(data[["Avg Air Temp (F)"]])
data[feature_cols] = feature_scaler.transform(data[feature_cols])
X = data[feature_cols].values
y = data["Avg Air Temp (F)"].values