In [None]:
# Time-feature engineering on the query result.
# df_q is assumed to already hold the rows pulled from SQL (e.g. via
# pd.read_sql()) and to contain a "date_time" column — confirm against the
# loading cell.

# Parse the timestamp column once and store the parsed values back on the frame.
dts = pd.to_datetime(df_q["date_time"])
df_q["date_time"] = dts

# Plain calendar fields taken from the parsed timestamps.
df_q["hour"] = df_q["date_time"].dt.hour
df_q["day_of_year"] = df_q["date_time"].dt.dayofyear

# Sin/cos pairs so that the end of a cycle sits next to its start
# (hour 23 next to hour 0, late December next to early January).
for column, period in (("hour", 24), ("day_of_year", 365.25)):
    df_q[f"{column}_sin"] = np.sin(2 * np.pi * df_q[column] / period)
    df_q[f"{column}_cos"] = np.cos(2 * np.pi * df_q[column] / period)

df_q

In [None]:
import torch.nn as nn
import torch

class MaskedRNN(nn.Module):
    """Single-layer RNN regressor that zeroes out masked input entries.

    The input is multiplied element-wise by ``mask`` before it reaches the
    RNN, so every position where the mask is False/0 contributes a zero.
    The hidden state of the last time step is mapped to the output by a
    linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the RNN hidden state.
        output_size: Number of values predicted per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, mask):
        """Run the masked forward pass.

        Args:
            x: Either ``(batch, input_size)`` — each row is treated as a
                sequence of length one — or ``(batch, seq_len, input_size)``.
            mask: Tensor broadcastable to ``x``; entries that are False/0
                zero out the matching entries of ``x``.

        Returns:
            Tensor of shape ``(batch, output_size)`` computed from the hidden
            state of the last time step.

        Raises:
            ValueError: If ``x`` is neither 2-D nor 3-D.
        """
        x = x * mask  # masked-out entries become exactly zero

        if x.dim() == 2:
            # Flat feature rows: add a sequence axis of length one.
            x = x.unsqueeze(1)
        elif x.dim() != 3:
            raise ValueError(
                f"expected x with 2 or 3 dimensions, got {x.dim()}"
            )

        # Build the initial state on the same device/dtype as the input so the
        # module keeps working after .to(device) / .cuda().
        h_0 = x.new_zeros(1, x.size(0), self.hidden_size)
        out, _ = self.rnn(x, h_0)

        # batch_first=True -> out is (batch, seq_len, hidden); keep the last step.
        return self.fc(out[:, -1, :])


In [None]:
# --- Feature / target selection ----------------------------------------------
# "air_temp" is the prediction target, so it is deliberately absent from `inputs`.
inputs = ["elevation","dew_point_temperature","pressure","wind_u","wind_v","air_density","hour_sin","hour_cos","day_of_year_sin","day_of_year_cos"]
target = "air_temp"

# --- One-hot station indicators ------------------------------------------------
df_one_hot = pd.get_dummies(df_q['stid'], prefix='station')
# Drop station_* columns left over from an earlier run of this cell before
# concatenating, so re-running the cell does not duplicate them.
df_q = pd.concat(
    [df_q.drop(columns=df_one_hot.columns, errors='ignore'), df_one_hot],
    axis=1,
)
# NOTE(review): the station_* columns are not listed in `inputs`, so the model
# trained below does not see them.

# Show a preview only; rendering the full frame bloats the notebook.
display(df_q.head())

# --- Missing-value handling ----------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

features_raw = df_q[inputs].to_numpy(dtype="float32")

# Record which entries were actually observed BEFORE imputing. Computing the
# mask after imputation would always give all-True, because no NaNs remain.
observed = df_q[inputs].notna().to_numpy()

# Split on row positions so features, targets and mask stay aligned, and fix
# the seed so the split is reproducible across kernel restarts.
train_idx, test_idx = train_test_split(
    list(range(len(df_q))), test_size=0.1, random_state=42
)

# Fit the imputer on training rows only so test-set statistics do not leak
# into training, then apply it to every row.
imputer = SimpleImputer(strategy='mean')
imputer.fit(features_raw[train_idx])
X_imputed = imputer.transform(features_raw)

# NOTE(review): rows with a missing target are still filled with the column
# mean, as before. That fabricates labels for both training and evaluation;
# consider dropping those rows instead.
Y_imputed = df_q[target].fillna(df_q[target].mean())

# --- Tensors (all row-aligned with df_q) ---------------------------------------
X = torch.tensor(X_imputed, dtype=torch.float32)
Y = torch.tensor(Y_imputed.values, dtype=torch.float32)
mask = torch.tensor(observed, dtype=torch.bool)  # True where data is NOT missing

train_rows = torch.tensor(train_idx, dtype=torch.long)
test_rows = torch.tensor(test_idx, dtype=torch.long)
X_train, X_test = X[train_rows], X[test_rows]
Y_train, Y_test = Y[train_rows], Y[test_rows]
mask_train, mask_test = mask[train_rows], mask[test_rows]

# NumPy views of the imputed splits, kept under their previous names.
X_train_imputed = X_imputed[train_idx]
X_test_imputed = X_imputed[test_idx]

# --- Model and optimiser ---------------------------------------------------------
input_size = len(inputs)
hidden_size = 50
output_size = 1
model = MaskedRNN(input_size, hidden_size, output_size)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss_function = torch.nn.MSELoss()

# --- Full-batch training loop --------------------------------------------------
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(X_train, mask_train).squeeze(1)
    loss = loss_function(output, Y_train)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item()}')

# --- NaN diagnostics on the state left by the final epoch ----------------------
if torch.isnan(output).any():
    print('Output has NaN values')

if torch.isnan(loss).any():
    print('Loss has NaN values')

for name, param in model.named_parameters():
    # param.grad is None for parameters that took no part in the backward pass.
    if param.grad is not None and torch.isnan(param.grad).any():
        print(f'Gradient for {name} contains NaNs.')

if torch.isnan(Y_train).any():
    print('Target variable Y_train contains NaN values')


In [None]:
# Test forecast on the held-out split.

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Feed X_test in the same form the model saw X_train. The training loop in this
# notebook uses X_train without any feature scaling, and no `scaler` is defined
# in the visible cells, so scaling only the test inputs would both raise a
# NameError on a fresh kernel and put the test data on a different scale from
# the training data.
# mask_test comes from the same split as X_test, so no new mask is needed.
model.eval()  # evaluation mode
with torch.no_grad():
    predicted_output = model(X_test, mask_test).squeeze(1)

# Tensors -> NumPy arrays for the scikit-learn metrics.
Y_test_array = Y_test.detach().cpu().numpy()
predicted_output_array = predicted_output.detach().cpu().numpy()

rmse = np.sqrt(mean_squared_error(Y_test_array, predicted_output_array))
mae = mean_absolute_error(Y_test_array, predicted_output_array)

print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Error: {mae}')



In [None]:
# Per-observation view: each held-out target next to its prediction.

import pandas as pd

# Detach from the graph and move to host memory before converting to NumPy.
Y_test_array = Y_test.detach().cpu().numpy()
predicted_output_array = predicted_output.detach().cpu().numpy()

# Signed error is actual minus predicted.
residuals = Y_test_array - predicted_output_array

comparison_df = pd.DataFrame(
    {
        'Actual': Y_test_array,
        'Predicted': predicted_output_array,
        'Error': residuals,
    }
)

# Magnitude of the error, ignoring its sign.
comparison_df['Absolute_Error'] = comparison_df['Error'].abs()

# First 20 rows of the comparison table.
print(comparison_df.head(20))


In [None]:
# Local import so this cell does not depend on an import from another cell.
from sklearn.preprocessing import StandardScaler

# Fit a separate StandardScaler on the training targets. It is only meaningful
# for predictions if the model is trained on target_scaler.transform(Y_train).
target_scaler = StandardScaler()
Y_train_reshaped = Y_train.detach().cpu().numpy().reshape(-1, 1)
target_scaler.fit(Y_train_reshaped)

# The training loop in this notebook fits the model against Y_train as-is, so
# the predictions are already in the target's own units. Running them through
# target_scaler.inverse_transform would multiply by the training std and add
# the training mean, corrupting the "Predicted" column. Leave them untouched.
predicted_output_reshaped = predicted_output.detach().cpu().numpy().reshape(-1, 1)
predicted_output_original_scale = predicted_output_reshaped

# Side-by-side frame. reshape(-1) keeps a 1-D array even for a single row,
# where squeeze() would collapse to a scalar and break the DataFrame constructor.
comparison_df = pd.DataFrame({
    'Actual': Y_test.detach().cpu().numpy().reshape(-1),
    'Predicted': predicted_output_original_scale.reshape(-1),
})

# Signed and absolute error per observation.
comparison_df['Error'] = comparison_df['Actual'] - comparison_df['Predicted']
comparison_df['Absolute_Error'] = np.abs(comparison_df['Error'])

# First 20 rows of the comparison table.
print(comparison_df.head(20))


In [None]:
from sklearn.preprocessing import LabelEncoder

# Give every distinct station id an integer code and store it on df_q as
# 'stid_encoded'.
encoder = LabelEncoder()
encoder.fit(df_q['stid'])
df_q['stid_encoded'] = encoder.transform(df_q['stid'])

class MaskedRNNWithEmbedding(nn.Module):
    """Masked RNN regressor with a learned per-station embedding.

    The feature vector is multiplied element-wise by ``mask``, a learned
    embedding of the station id is appended to it, and the result is passed
    through a single-step RNN followed by a linear layer.

    Args:
        input_size: Number of features per sample (without the embedding).
        hidden_size: Width of the RNN hidden state.
        output_size: Number of values predicted per sample.
        num_stations: Number of distinct station codes; valid ids are
            ``0 .. num_stations - 1``.
        embedding_dim: Width of the station embedding. This is a free
            hyper-parameter and does not have to match ``input_size``.
    """

    def __init__(self, input_size, hidden_size, output_size, num_stations,
                 embedding_dim=10):
        super().__init__()
        # forward() needs the hidden width to build the initial state.
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_stations, embedding_dim)
        # batch_first=True because forward() feeds (batch, 1, features).
        self.rnn = nn.RNN(input_size + embedding_dim, hidden_size,
                          batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x, station_ids, mask):
        """Run the masked forward pass.

        Args:
            x: ``(batch, input_size)`` feature tensor.
            station_ids: ``(batch,)`` integer tensor of station codes.
            mask: Tensor broadcastable to ``x``; entries that are False/0
                zero out the matching entries of ``x``.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Mask the raw features only: the mask has input_size columns, so it
        # cannot be applied after the embedding columns have been appended.
        x = x * mask
        station_embedding = self.embedding(station_ids)
        x_combined = torch.cat((x, station_embedding), dim=1)

        # Initial state on the same device/dtype as the data.
        h_0 = x_combined.new_zeros(1, x_combined.size(0), self.hidden_size)
        out, _ = self.rnn(x_combined.unsqueeze(1), h_0)  # add sequence axis
        return self.linear(out.squeeze(1))  # drop sequence axis

from sklearn.model_selection import train_test_split

# Integer station codes, one per df_q row. 'stid_encoded' was added to df_q,
# not to a frame called `df`.
station_ids = torch.tensor(df_q['stid_encoded'].values, dtype=torch.long)

# NOTE(review): assumes X, Y and mask are still row-aligned with df_q — confirm
# nothing filtered or reordered df_q since they were built.
assert len(station_ids) == len(X), "station ids are not row-aligned with X"

# The earlier train/test split was made without station ids, so there is no way
# to line ids up with X_train after the fact. Split again with all four arrays
# together, under separate names so the tensors used by `model` stay untouched.
(
    X_train_emb, X_test_emb,
    Y_train_emb, Y_test_emb,
    mask_train_emb, mask_test_emb,
    station_ids_train, station_ids_test,
) = (
    torch.as_tensor(part)
    for part in train_test_split(
        X.numpy(), Y.numpy(), mask.numpy(), station_ids.numpy(),
        test_size=0.1, random_state=42,
    )
)

# `model` is still the two-argument MaskedRNN; the embedding variant needs its
# own instance. Codes run from 0 to max, hence max + 1 embedding rows.
num_stations = int(station_ids.max()) + 1
embedding_model = MaskedRNNWithEmbedding(input_size, hidden_size, output_size, num_stations)

output = embedding_model(X_train_emb, station_ids_train, mask_train_emb).squeeze(1)
