In [57]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import sklearn as sk
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# import package to count number of hours after specific time
from datetime import datetime
import torch
import torch.nn as nn
import torch.optim as optim

### **Cleaning and Converting the datatype**

In [58]:

# Faerier, adjusts for modulation, 7, 31
# Plot residuals to Faerier plot

# Define file paths
combined_dataset = "Data/DataForModel.csv"

# Load data
data = pd.read_csv(combined_dataset)

In [59]:
data.columns

Index(['Measurement', 'SiteCode', 'LocalAuthorityCode', 'SiteType',
       'SpeciesType', 'SatMean', 'SatBand', 'FlowMean', 'temperature_2m (°C)',
       'relative_humidity_2m (%)', 'wind_direction_10m (°)', 'HourOfDay',
       'DayOfMonth', 'DayOfWeek', 'Distance', 'Bearing', 'Hours', 'Days'],
      dtype='object')

In [60]:
#对data中所有数值类型变量进行标准化处理

# Select columns to standardize
cols_to_standardize = ['Measurement', 'SatMean', 'FlowMean', 'temperature_2m (°C)', 'relative_humidity_2m (%)', 'wind_direction_10m (°)', 'HourOfDay', 'DayOfMonth', 'DayOfWeek', 'Distance', 'Bearing', 'Hours', 'Days']

# Standardize columns
scaler = StandardScaler()
data[cols_to_standardize] = scaler.fit_transform(data[cols_to_standardize])

# select all the categorical variables
cols_to_categorize = ['SiteCode', 'LocalAuthorityCode', 'SiteType','SpeciesType','SatBand']

In [61]:
for col in cols_to_categorize:
    data[col] = data[col].astype('category').cat.codes

In [91]:
# 将SatBand列中所有的-1值替换为4
data['SatBand'] = data['SatBand'].replace(-1, 4)

In [116]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389033 entries, 0 to 389032
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Measurement               389033 non-null  float64
 1   SiteCode                  389033 non-null  int8   
 2   LocalAuthorityCode        389033 non-null  int8   
 3   SiteType                  389033 non-null  int8   
 4   SpeciesType               389033 non-null  int8   
 5   SatMean                   389033 non-null  float64
 6   SatBand                   389033 non-null  int8   
 7   FlowMean                  389033 non-null  float64
 8   temperature_2m (°C)       389033 non-null  float64
 9   relative_humidity_2m (%)  389033 non-null  float64
 10  wind_direction_10m (°)    389033 non-null  float64
 11  HourOfDay                 389033 non-null  float64
 12  DayOfMonth                389033 non-null  float64
 13  DayOfWeek                 389033 non-null  f

In [109]:
for col in cols_to_categorize:
    print(data[col].min(), data[col].max(), data[col].nunique(), col)

0 96 97 SiteCode
0 28 29 LocalAuthorityCode
0 4 5 SiteType
0 3 4 SpeciesType
0 4 5 SatBand


### **We have successfully convert all variables into float or int, now we are going to re-organize the dataset**

In [95]:
# set the X and y for preparing the data before training the model
collist = data.columns.tolist()
collist.remove('Measurement')
X = data[collist].values
y = data['Measurement'].values 

In [96]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [97]:

def create_dataset(df, sequence_length):
    categorical_sequences = []
    continuous_sequences = []
    labels = []
    
    categorical_columns = [1, 2, 3, 4, 6]  
    continuous_columns = [i for i in range(1, df.shape[1]) if i not in categorical_columns]  

    for i in range(len(df) - sequence_length):
        seq = df.iloc[i:i+sequence_length] 
        
        # separate the sequence into categorical and continuous data
        categorical_seq = seq.iloc[:, categorical_columns].values
        continuous_seq = seq.iloc[:, continuous_columns].values
        
        # get label
        label = df.iloc[i+sequence_length, 0]  
        
        categorical_sequences.append(categorical_seq)
        continuous_sequences.append(continuous_seq)
        labels.append(label)
    
    return np.array(categorical_sequences), np.array(continuous_sequences), np.array(labels)

In [98]:
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1) # add one dimension to y

In [99]:


class TimeSeriesDataset(Dataset):
    def __init__(self, X_categorical, X_continuous, y):
        self.X_categorical = X_categorical
        self.X_continuous = X_continuous
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X_categorical[idx], self.X_continuous[idx], self.y[idx]
    


In [100]:
sequence_length = 24  # 时间窗口长度
X_categorical, X_continuous, y = create_dataset(data, sequence_length)

dataset = TimeSeriesDataset(X_categorical, X_continuous, y)


In [101]:
# 计算训练集大小
train_size = int(len(y) * 0.8)  # 保留80%的数据用于训练

# 分割数据
X_cat_train, X_cat_test = X_categorical[:train_size], X_categorical[train_size:]
X_cont_train, X_cont_test = X_continuous[:train_size], X_continuous[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# 转换为PyTorch张量
import torch
from torch.utils.data import DataLoader

X_cat_train_tensor = torch.tensor(X_cat_train, dtype=torch.float32)
X_cont_train_tensor = torch.tensor(X_cont_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

X_cat_test_tensor = torch.tensor(X_cat_test, dtype=torch.float32)
X_cont_test_tensor = torch.tensor(X_cont_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# 创建DataLoader
train_dataset = TimeSeriesDataset(X_cat_train_tensor, X_cont_train_tensor, y_train_tensor)
test_dataset = TimeSeriesDataset(X_cat_test_tensor, X_cont_test_tensor, y_test_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False)  # 时间序列数据通常不应该打乱
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [102]:
# get the nunique of each categorical variable
num_unique_site_codes = data['SiteCode'].nunique()
num_unique_local_authority_codes = data['LocalAuthorityCode'].nunique()
num_unique_site_types = data['SiteType'].nunique()
num_unique_species_types = data['SpeciesType'].nunique()
num_unique_sat_bands = data['SatBand'].nunique()

In [117]:
import torch
import torch.nn as nn

class MyLSTMModel(nn.Module):
    def __init__(self, num_continuous, embedding_dim_dict, lstm_hidden_dim, lstm_layers, output_size):
        super(MyLSTMModel, self).__init__()
        
        # Embedding Layer
        self.site_code_embedding = nn.Embedding(num_embeddings=embedding_dim_dict['SiteCode'][0], embedding_dim=embedding_dim_dict['SiteCode'][1])
        self.local_authority_code_embedding = nn.Embedding(num_embeddings=embedding_dim_dict['LocalAuthorityCode'][0], embedding_dim=embedding_dim_dict['LocalAuthorityCode'][1])
        self.site_type_embedding = nn.Embedding(num_embeddings=embedding_dim_dict['SiteType'][0], embedding_dim=embedding_dim_dict['SiteType'][1])
        self.species_type_embedding = nn.Embedding(num_embeddings=embedding_dim_dict['SpeciesType'][0], embedding_dim=embedding_dim_dict['SpeciesType'][1])
        self.sat_band_embedding = nn.Embedding(num_embeddings=embedding_dim_dict['SatBand'][0], embedding_dim=embedding_dim_dict['SatBand'][1])
        
        # LSTM Layer
        self.lstm = nn.LSTM(input_size=sum([ed[1] for ed in embedding_dim_dict.values()]) + num_continuous,
                            hidden_size=lstm_hidden_dim,
                            num_layers=lstm_layers,
                            batch_first=True)
        
        # Fully Connected Layer
        self.linear = nn.Linear(lstm_hidden_dim, output_size)
    
    def forward(self, x_categorical, x_continuous):
        
        # 打印x_categorical的最小和最大值
        print("x_categorical min:", x_categorical.min(dim=0))
        print("x_categorical max:", x_categorical.max(dim=0))

        # 将分类特征的浮点张量转换为长整型张量
        
        
        x_categorical = x_categorical.long()  # 这是关键的转换步骤
        
        embeddings = [
            self.site_code_embedding(x_categorical[:, 0]),
            self.local_authority_code_embedding(x_categorical[:, 1]),
            self.site_type_embedding(x_categorical[:, 2]),
            self.species_type_embedding(x_categorical[:, 3]),
            self.sat_band_embedding(x_categorical[:, 4])
        ]

        # Merging along with the feature's dimension
        x_embedded = torch.cat(embeddings, dim=2)  
        # merging with continuous features
        x_combined = torch.cat([x_embedded, x_continuous], dim=2)  
        
        lstm_out, (h_n, c_n) = self.lstm(x_combined)
        output = self.linear(lstm_out[:, -1, :])  
        
        return output

### Define the hyperparameters

In [122]:
# 
number_of_continuous_features = 12
embedding_dim_dict={
    'SiteCode': (num_unique_site_codes, 98),
    'LocalAuthorityCode': (num_unique_local_authority_codes, 30),
    'SiteType': (num_unique_site_types, 6),
    'SpeciesType': (num_unique_species_types, 5),
    'SatBand': (num_unique_sat_bands, 6)
}
lstm_hidden_dim = 128
lstm_layers = 2


### 实例化模型 

In [124]:
model = MyLSTMModel(num_continuous=number_of_continuous_features,
                    embedding_dim_dict=embedding_dim_dict,
                    lstm_hidden_dim=lstm_hidden_dim,
                    lstm_layers=lstm_layers,
                    output_size=1)

criterion = nn.MSELoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    model.train()
    for i, (x_categorical, x_continuous, y) in enumerate(train_dataloader):
        if i == 0:  # 仅在每个epoch的开始打印一次
            print(f"x_categorical shape: {x_categorical.shape}")
            print(f"x_continuous shape: {x_continuous.shape}")
            print(f"y shape: {y.shape}")
            # 打印数据类型也很重要
            print(f"x_categorical dtype: {x_categorical.dtype}")
            print(f"x_continuous dtype: {x_continuous.dtype}")
            print(f"y dtype: {y.dtype}")
        optimizer.zero_grad()
        predictions = model(x_categorical, x_continuous)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

x_categorical shape: torch.Size([32, 24, 5])
x_continuous shape: torch.Size([32, 24, 12])
y shape: torch.Size([32, 1])
x_categorical dtype: torch.float32
x_continuous dtype: torch.float32
y dtype: torch.float32
x_categorical min: torch.return_types.min(
values=tensor([[83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
    

IndexError: index out of range in self

In [123]:
model = MyLSTMModel(num_continuous=number_of_continuous_features,
                    embedding_dim_dict=embedding_dim_dict,
                    lstm_hidden_dim=lstm_hidden_dim,
                    lstm_layers=lstm_layers,
                    output_size=1)

criterion = nn.MSELoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    model.train()
    for x_categorical, x_continuous, y in train_dataloader:
        optimizer.zero_grad()
        predictions = model(x_categorical, x_continuous)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


x_categorical min: torch.return_types.min(
values=tensor([[83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.],
        [83., 25.,  0.,  0.,  0.]]),
indices=tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],


IndexError: index out of range in self

### Model Evaluation

In [None]:
model.eval()
with torch.no_grad():
    # 假设有test_dataloader
    total_loss = 0
    for x_categorical, x_continuous, y in test_dataloader:
        predictions = model(x_categorical, x_continuous)
        loss = criterion(predictions, y)
        total_loss += loss.item()
    
    print(f'Test Loss: {total_loss / len(test_dataloader)}')


In [None]:
model = MyLSTMModel(input_dim=X_train.shape[1], hidden_dim=500)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
model.to(device)
epochs = 100

for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        # 将数据移到GPU
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

In [None]:
model.eval()
predictions = []
actuals = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.unsqueeze(1))
        predictions.append(outputs.numpy())
        actuals.append(labels.numpy())

predictions = np.vstack(predictions)
actuals = np.vstack(actuals)

# 计算MSE
mse = mean_squared_error(actuals, predictions)
print(f'Mean Squared Error: {mse}')

# 绘制残差图
residuals = actuals - predictions
plt.scatter(actuals, residuals)
plt.axhline(y=0, color='r', linestyle='-')
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
