In [1]:
!module load cuda

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image
import torchvision.transforms as T

import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from tqdm import tqdm, trange

from datetime import datetime, timedelta

In [3]:
class WeatherDataset(Dataset):    
    def __init__(self, weather_data = 'data/big_arpafvg.csv', img_dir = 'data/cut_images', seq_length = 7):
        
        # Load weather data
        weather_data=pd.read_csv(weather_data)
        # Drop columns that are not useful
        weather_data = weather_data.drop(columns=['Temp. min gradi C','Temp. med gradi C','Temp. max gradi C','Vento med km/h','Dir. V. max gradi N'])
        # Normalize data
        for col in weather_data.columns:
            if col != 'giorno' and col != 'mese' and col != 'anno':
                weather_data[col] = (weather_data[col] - weather_data[col].mean()) / weather_data[col].std()

        
        
        self.weather_data = weather_data
        self.img_dir = img_dir
        self.seq_length = seq_length
        self.target_column_index = None
        self.date_generated = []

    def __getitem__(self, start_date):
        """
        Gets the image and the weather data for a given start_date
        Returns: [image, weather_data]
        image: torch.tensor
        weather_data: torch.tensor
        """

        # Get day, month, year from start_date in format dd-mm-yyyy
        day, month, year = start_date.split('_')

        # Get from weather data the row with the such start_date
        weather_data = self.weather_data[(self.weather_data['giorno'] == int(day)) & (self.weather_data['mese'] == int(month)) & (self.weather_data['anno'] == int(year))]
        # One hot encoding for the month column
        weather_data = pd.get_dummies(weather_data, columns=['mese'], dtype=int)
        #Get the index of the targe column "log(Pioggia mm)"
        self.target_colum_index = weather_data.columns.get_loc('log(Pioggia mm)')
        # Torchify the data
        weather_data = torch.tensor(weather_data.values[0])

        # Get the image for the such start_date
        image = os.path.join(self.img_dir, str(start_date) + '.jpg')  
        image = read_image(image)

        return [image, weather_data]
    
    def __len__(self):
        return len(self.weather_data)

    def date_generation(self, start_date, end_date):
        """
        Generates a list of dates between start_date and end_date
        The dates are in format dd-mm-yyyy
        Returns: list of strings
        """

        start_date = datetime.strptime(start_date, "%d_%m_%Y")
        end_date = datetime.strptime(end_date, "%d_%m_%Y")

        # Generate a list of datetime objects
        date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date-start_date).days)]

        #Transform the list of datetime objects in a list of strings in format dd-mm-yyyy
        date_generated = [start_date.strftime("%d_%m_%Y") for start_date in date_generated]
        self.date_generated=date_generated

        return date_generated
    
    def create_sequence(self, dates_list, len_seq):
        """
        This method gets a list of dates in format dd_mm_yyyy and 
        creates a sequence of images, weather data and target values of len_seq days.
        
        Returns: batch_imgs, batch_xs, batch_ys

        batch_imgs: torch.tensor of shape (len(dates_list), len_seq, n_channels, height, width)
        batch_xs: torch.tensor of shape (len(dates_list), len_seq, n_features_weather_data)
        batch_ys: torch.tensor of shape (len(dates_list), 1)

        """

        batch_imgs = []
        batch_xs =[]
        batch_ys = []
        
        for start_date in dates_list:
            
            # Get the end date of the sequence
            end_date = start_date.split('_')
            end_date = datetime(int(end_date[2]), int(end_date[1]), int(end_date[0])) + timedelta(days = len_seq)
            end_date = end_date.strftime("%d_%m_%Y")
            
            # List of images, weather data and target values
            imgs, xs, ys = [], [], []

            # Generate the list of dates between start_date and end_date            
            date_sequence = self.date_generation(start_date, end_date)
            
            # Append the image and weather data to imgs and xs
            for day in date_sequence:
                img, x = self.__getitem__(day)
                imgs.append(img)
                xs.append(x)

            # Get the day after the end_date
            end_date_next = end_date.split('_')
            end_date_next = datetime(int(end_date_next[2]), int(end_date_next[1]), int(end_date_next[0])) + timedelta(days = 1)
            end_date_next = end_date_next.strftime("%d_%m_%Y")

            # Append the target value to ys
            ys.append(self.__getitem__(end_date_next)[1][self.target_column_index])

            # Convert the lists to torch tensors
            imgs = torch.from_numpy(np.array(imgs)).float()
            xs = torch.from_numpy(np.array(xs)).float()
            ys = torch.from_numpy(np.array(ys)).float()
            
            # Append the tensors to the batch
            batch_xs.append(xs)
            batch_imgs.append(imgs)
            batch_ys.append(ys)

        # Convert the batches to torch tensors
        batch_imgs = torch.stack(batch_imgs)
        batch_xs = torch.stack(batch_xs)
        batch_ys = torch.stack(batch_ys)
        
        return batch_imgs, batch_xs, batch_ys
    
   

In [4]:
dataset = WeatherDataset()

In [5]:
# Generate a list of strings dd-mm-yyyy from 01-06-2023 to 15-6-2024
dataset.date_generation("01_01_2022", "15_06_2024")
date_generated = dataset.date_generated

In [6]:
# Train-validation-test split
train_len = int(0.7 * len(dataset.date_generated))
valid_len = int(0.15 * len(dataset.date_generated))
test_len = len(dataset.date_generated) - train_len - valid_len

# Create date train, valdiation, test lists by random sampling
date_trainset = np.random.choice(date_generated, train_len, replace=False)
date_valid = np.random.choice([date for date in date_generated if date not in date_trainset], valid_len, replace=False)
date_testset = [date for date in date_generated if date not in date_trainset and date not in date_valid]

# Define the loaders for the train, validation, test sets, they will provide dates in format dd_mm_yyyy
train_loader = torch.utils.data.DataLoader(date_trainset, batch_size = 5, shuffle = True)
valid_loader = torch.utils.data.DataLoader(date_valid, batch_size = 5, shuffle = True)
test_loader = torch.utils.data.DataLoader(date_testset, batch_size = 5, shuffle = True) 

In [7]:
class DeepWeather(nn.Module):
    def __init__(self):
        super(DeepWeather, self).__init__()

        self.conv1 = nn.Conv3d(in_channels = 7, out_channels = 8, kernel_size = (1, 3, 3), stride = 1)
        self.pool = nn.MaxPool3d(kernel_size=(1,2,2), padding=(0,1,1))
        self.dropout = nn.Dropout(p = 0.3)
        self.bn1 = nn.BatchNorm3d(8)
        self.conv2 = nn.Conv3d(in_channels = 8, out_channels = 16, kernel_size = (1, 3, 3), stride = 1)
        self.bn2 = nn.BatchNorm3d(16)
        self.conv3 = nn.Conv3d(in_channels = 16, out_channels = 32, kernel_size = (1, 3, 3), stride = 1)
        self.bn3 = nn.BatchNorm3d(32)
        self.conv4 = nn.Conv3d(in_channels = 32, out_channels = 64, kernel_size = (1, 3, 3), stride = 1)
        self.bn4 = nn.BatchNorm3d(64)
        self.conv5 = nn.Conv3d(in_channels = 64, out_channels = 128, kernel_size = (1, 3, 3), stride = 1)
        self.bn5 = nn.BatchNorm3d(128)
        self.conv6 = nn.Conv3d(in_channels = 128, out_channels = 7, kernel_size = (1, 3, 3), stride = 1)
        self.bn6 = nn.BatchNorm3d(7)
        # channels, height, width = 3, 6, 8

        self.input_size = 10 # number of features in the input (weather data)
        self.hidden_size = 4
        self.num_layers = 1
        self.rnn = nn.RNN(self.input_size, self.hidden_size, self.num_layers, batch_first=True)

        # dim input fc = seq_len * 3 * 6 * 8 + hidden_size = 1024
        self.seq_len = 7
        self.fc = nn.Linear(self.seq_len*3*6*8+self.hidden_size, 1)

    def forward(self, x1, x2):
        
        x1 = self.bn1(self.dropout(self.pool(F.leaky_relu(self.conv1(x1)))))
        x1 = self.bn2(self.dropout(self.pool(F.leaky_relu(self.conv2(x1)))))
        x1 = self.bn3(self.dropout(self.pool(F.leaky_relu(self.conv3(x1)))))
        x1 = self.bn4(self.dropout(self.pool(F.leaky_relu(self.conv4(x1)))))
        x1 = self.bn5(self.dropout(self.pool(F.leaky_relu(self.conv5(x1)))))
        x1 = self.bn6(self.dropout(self.pool(F.leaky_relu(self.conv6(x1)))))
        print("x1 shape:", x1.shape)
        out_cnn = torch.flatten(x1, start_dim = 1)

        h0 = torch.zeros(self.num_layers, x2.size(0),self.hidden_size).to(x2.device)
        out_rnn, _ = self.rnn(x2, h0)

        out = torch.cat((out_rnn[:,-1,:], out_cnn), dim = 1)
        print("out_rnn[:,-1,:] shape:", out_rnn[:,-1,:].shape)
        out = self.fc(out)

        return out

In [8]:
def train(model, dataset, train_loader, test_loader, criterion, optimizer, epochs = 50, first_time = True, num_saved_epochs = 0):
    
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)    
    
    train_loss_epochs = []
    valid_loss_epochs = []
    bar = trange(epochs, desc=f"Epoch ?/?, Train Loss: ?, Test Loss: ?")
    
    for epoch in bar:
        model.train()
        train_losses = []
            
        for date in train_loader:
            imgs, xs, ys = dataset.create_sequence(date, 7)
            imgs = imgs.to(device)
            xs = xs.to(device)
            ys = ys.to(device)

            optimizer.zero_grad()
            outputs = model(imgs, xs)
            loss = criterion(outputs, ys)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            
        train_loss_epochs.append(np.mean(train_losses))
        
        model.eval()
        valid_losses = []
        for date in valid_loader:
            
            imgs, xs, ys = dataset.create_sequence(date, 7)
            imgs = imgs.to(device)
            xs = xs.to(device)
            ys = ys.to(device)
            outputs = model(imgs, xs)
            loss = criterion(outputs.squeeze(), ys)
            valid_losses.append(loss.item())
                
        valid_loss_epochs.append(np.mean(valid_losses))
        bar.set_description(f"Epoch {epoch + 1}/{epochs}, Train Loss: {np.mean(train_losses)}, Validation Loss: {np.mean(valid_losses)}")

    return train_loss_epochs, valid_loss_epochs

In [9]:
model = DeepWeather()
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 10e-6, weight_decay=1e-5)

In [10]:
train_losses, val_losses = train(model, dataset, train_loader, test_loader, criterion, optimizer, epochs = 50, first_time = True, num_saved_epochs = 0)

Epoch ?/?, Train Loss: ?, Test Loss: ?:   0%|                                             | 0/50 [00:00<?, ?it/s]

x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)


x1 shape: torch.Size([5, 7, 3, 6, 8])
out_rnn[:,-1,:] shape: torch.Size([5, 4])


  return F.mse_loss(input, target, reduction=self.reduction)
Epoch ?/?, Train Loss: ?, Test Loss: ?:   0%|                                             | 0/50 [00:23<?, ?it/s]


RuntimeError: [Errno 2] No such file or directory: 'data/cut_images/17_06_2024.jpg'

In [None]:
epochs = [epoch for epoch in range(50)]
plt.plot(epochs, train_losses, label = 'Training Loss')
plt.plot(epochs, val_losses, label = 'Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Test the model using the test loader
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)  
model.eval()
test_losses = []
true = []
predicted = []
for date in test_loader:
    imgs, xs, ys = dataset.create_sequence(date, 7)
    imgs = imgs.to(device)
    xs = xs.to(device)
    ys = ys.to(device)
    outputs = model(imgs, xs)
    true.append(ys.cpu().detach().numpy())
    predicted.append(outputs.cpu().detach().numpy())
    loss = criterion(outputs.squeeze(), ys)
    test_losses.append(loss.item())
        
print(f"Mean Test Loss: {np.mean(test_losses)}")


In [None]:
# Plot