In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.autograd import Variable
import os
import sys

current_dir = os.getcwd()
sys.path.append(os.path.dirname(current_dir))

from code_base.weight_sharing_utils import *


In [2]:
import numpy as np 
import random 
def set_seed(seed=42):
    torch.manual_seed(seed)  # Set PyTorch seed
    np.random.seed(seed)      # Set NumPy seed
    random.seed(seed)         # Set Python random seed

    # Ensure reproducibility for CUDA (if using GPU)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # For multi-GPU setups
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # Disable to ensure consistent runs

# Set a fixed seed
set_seed(58) #56 

## Data processing

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from geopy.distance import great_circle

import torch
import torch.utils.data as Data
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.autograd import Variable
# from torchsummary import summary
import datetime

import os
import random

# forecast 24-hour lead time 
pre_seq = 4
batch_size = 128
epochs = 128
min_val_loss = 100
model_name = '../results/model_saver/Classl_Model_weight_sharing_1.pkl'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
train = pd.read_csv('../data/CMA_train_'+str(pre_seq*6)+'h.csv', header=None)
test = pd.read_csv('../data/CMA_test_'+str(pre_seq*6)+'h.csv', header=None)

CLIPER_feature =  pd.concat((train, test), axis=0)
CLIPER_feature.reset_index(drop=True, inplace=True)

X_wide_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

X_wide = X_wide_scaler.fit_transform(CLIPER_feature.iloc[:, 6:])
X_wide_train = X_wide[0: train.shape[0], :]

y = y_scaler.fit_transform(CLIPER_feature.loc[:, 3:4])
y_train = y[0: train.shape[0], :]

reanalysis_type = 'z'

# 0 means now 
# 1 means 6-hour ago
# 2 means 12-hour ago
ahead_times = [0,1,2,3]
pressures = [1000, 750, 500, 250]
sequential_reanalysis_list = []
reanalysis_test_dict = {}
X_deep_scaler_dict = {}


for ahead_time in ahead_times:

    reanalysis_list = []
    for pressure in pressures:
        
        folder = None
        if ahead_time == 0:
            folder = reanalysis_type
        else:
            folder = reanalysis_type + '_' + str(ahead_time*6)
        train_reanalysis_csv = pd.read_csv('../data/ERA_Interim/'+folder+'/'+reanalysis_type+str(pressure)+'_train_31_31.csv', header=None)
        test_reanalysis_csv = pd.read_csv('../data/ERA_Interim/'+folder+'/'+reanalysis_type+str(pressure)+'_test_31_31.csv', header=None)
        
        train_reanalysis = train_reanalysis_csv[train_reanalysis_csv[0].isin(train[0].unique())]
        test_reanalysis = test_reanalysis_csv[test_reanalysis_csv[0].isin(test[0].unique())]
        reanalysis_test_dict[reanalysis_type+str(pressure)+str(ahead_time)] = test_reanalysis
        
        reanalysis =  pd.concat((train_reanalysis, test_reanalysis), axis=0)
        reanalysis.reset_index(drop=True, inplace=True)
        
        scaler_name = reanalysis_type +str(pressure) + str(ahead_time)
        X_deep_scaler_dict[scaler_name] = MinMaxScaler()
        X_deep = X_deep_scaler_dict[scaler_name] .fit_transform(reanalysis.loc[:, 5:])
        
        X_deep_final = X_deep[0: train.shape[0], :].reshape(-1, 1, 1, 31, 31)
        reanalysis_list.append(X_deep_final)
    
    X_deep_temp = np.concatenate(reanalysis_list[:], axis=2)
    print("ahead_time:", ahead_time, X_deep_temp.shape)
    sequential_reanalysis_list.append(X_deep_temp)

X_deep_train = np.concatenate(sequential_reanalysis_list, axis=1)

ahead_time: 0 (8406, 1, 4, 31, 31)
ahead_time: 1 (8406, 1, 4, 31, 31)
ahead_time: 2 (8406, 1, 4, 31, 31)
ahead_time: 3 (8406, 1, 4, 31, 31)


## Construction of training set and validation set

In [5]:
full_train_index = [*range(0, len(X_wide_train))]

train_index, val_index, _, _, = train_test_split(full_train_index,full_train_index,test_size=0.1)

len(train_index), len(val_index)

train_dataset = torch.utils.data.DataLoader(
    TrainLoader(X_wide_train[train_index], X_deep_train[train_index], y_train[train_index]), 
                                                 batch_size=batch_size, shuffle=True)

val_dataset = torch.utils.data.DataLoader(
    TrainLoader(X_wide_train[val_index], X_deep_train[val_index], y_train[val_index]), 
                                                 batch_size=batch_size, shuffle=True)

## Training

In [6]:
net = Net()
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

print("# of trainable parameters in the current model: ",
    sum(p.numel() for p in net.parameters() if p.requires_grad)
)

# wo weight sharing: 8399540

# of trainable parameters in the current model:  4964276


In [7]:
full_train_index = [*range(0, len(X_wide_train))]

for epoch in range(epochs):  # loop over the dataset multiple times
    starttime = datetime.datetime.now()
    train_index, val_index, _, _, = train_test_split(full_train_index,full_train_index,test_size=0.1)
    train_dataset = torch.utils.data.DataLoader(
        TrainLoader(X_wide_train[train_index], X_deep_train[train_index], y_train[train_index]), 
                                                 batch_size=batch_size,)
    val_dataset = torch.utils.data.DataLoader(
        TrainLoader(X_wide_train[val_index], X_deep_train[val_index], y_train[val_index]), 
                                                 batch_size=batch_size,)
    # training
    total_train_loss = 0
    for step, (batch_x, batch_y) in enumerate(train_dataset):
        if torch.cuda.is_available():
            net.cuda()
            X_wide_train_cuda = batch_x[0].float().cuda()
            X_deep_train_cuda = batch_x[1].float().cuda()
            y_train_cuda = batch_y.cuda()
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        pred_y = net(X_wide_train_cuda, X_deep_train_cuda)
        loss = criterion(pred_y, y_train_cuda)
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    # validation
    total_val_loss = 0
    for _,(batch_val_x, batch_val_y) in enumerate(val_dataset):
        
        if torch.cuda.is_available():
            X_wide_val_cuda = batch_val_x[0].float().cuda()
            X_deep_val_cuda = batch_val_x[1].float().cuda()
            y_val_cuda = batch_val_y.cuda()
        
        pred_y = net(X_wide_val_cuda, X_deep_val_cuda)
        val_loss = criterion(pred_y, y_val_cuda)
        total_val_loss += val_loss.item()
    
        # print statistics
    if min_val_loss > total_val_loss:
        torch.save(net.state_dict(), model_name)
        min_val_loss = total_val_loss
    endtime = datetime.datetime.now()
    print('epochs [%d/%d] cost:%.2fs train_loss: %.5f val_loss: %.5f' % 
          (epoch + 1, epochs, (endtime-starttime).seconds, total_train_loss, total_val_loss))

print('Finished Training')


epochs [1/128] cost:17.00s train_loss: 8.17645 val_loss: 0.50911
epochs [2/128] cost:17.00s train_loss: 3.08029 val_loss: 0.20738
epochs [3/128] cost:17.00s train_loss: 1.38346 val_loss: 0.13878
epochs [4/128] cost:17.00s train_loss: 1.08606 val_loss: 0.11739
epochs [5/128] cost:17.00s train_loss: 1.00837 val_loss: 0.11745
epochs [6/128] cost:17.00s train_loss: 0.99551 val_loss: 0.12540
epochs [7/128] cost:17.00s train_loss: 0.95597 val_loss: 0.13504
epochs [8/128] cost:17.00s train_loss: 0.97118 val_loss: 0.11511
epochs [9/128] cost:17.00s train_loss: 0.92220 val_loss: 0.11703
epochs [10/128] cost:17.00s train_loss: 0.92156 val_loss: 0.10003
epochs [11/128] cost:17.00s train_loss: 0.89002 val_loss: 0.12263
epochs [12/128] cost:17.00s train_loss: 0.91467 val_loss: 0.10151
epochs [13/128] cost:17.00s train_loss: 0.88873 val_loss: 0.09880
epochs [14/128] cost:17.00s train_loss: 0.84044 val_loss: 0.10329
epochs [15/128] cost:17.00s train_loss: 0.83536 val_loss: 0.09991
epochs [16/128] cos

## Testing

In [8]:
years = test[5].unique()
test_list = []

for year in years:
    temp = test[test[5]==year]
    temp = temp.reset_index(drop=True)
    test_list.append(temp)
    

torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = Net()
net = net.to(device)
net.load_state_dict(torch.load(model_name))

<All keys matched successfully>

In [9]:

tid_list = []
time_list = [] 
pred_lat_list = []
pred_long_list = [] 
true_lat_list = [] 
true_long_list = []


with torch.no_grad():
    for year, _test in zip(years, test_list):

        print('Year:', year)
        # print("TID ", _test.loc[:,1])
        y_test_lat = _test.loc[:,3]
        
        y_test_long = _test.loc[:,4]
        
        X_wide_test = X_wide_scaler.transform(_test.loc[:,6:])

        final_test_list = []
        for ahead_time in ahead_times:
            year_test_list = []
            for pressure in pressures:
                scaler_name = reanalysis_type +str(pressure) + str(ahead_time)
                X_deep = reanalysis_test_dict[scaler_name][reanalysis_test_dict[scaler_name][0].isin(_test[0].unique())].loc[:,5:]
                X_deep = X_deep_scaler_dict[scaler_name].transform(X_deep)
                X_deep_final = X_deep.reshape(-1, 1, 1, 31, 31)
                year_test_list.append(X_deep_final)
            X_deep_temp = np.concatenate(year_test_list, axis=2)
            final_test_list.append(X_deep_temp)
        X_deep_test = np.concatenate(final_test_list, axis=1)

        if torch.cuda.is_available():
            X_wide_test = Variable(torch.from_numpy(X_wide_test).float().cuda())
            X_deep_test = Variable(torch.from_numpy(X_deep_test).float().cuda())

        
        tid  = _test.loc[:,1]
        time_ = _test.loc[:,2]
        print("len(tid) = ",len(tid))
        pred = net(X_wide_test, X_deep_test)

        pred = y_scaler.inverse_transform(pred.cpu().detach().numpy())

        pred_lat = pred[:,0]
        pred_long = pred[:,1]
        
        print("len(pred_lat) =", len(pred_lat))
        true_lat = y_test_lat
        true_long = y_test_long

        diff_lat = np.abs(pred_lat - true_lat)
        diff_long = np.abs(pred_long - true_long)

        print('avg lat:', sum(diff_lat)/len(diff_lat))
        print('avg long:', sum(diff_long)/len(diff_long))

        sum_error = []
        for i in range(0, len(pred_lat)):
            sum_error.append(great_circle((pred_lat[i], pred_long[i]), (true_lat[i], true_long[i])).kilometers)

        print('avg distance error:', sum(sum_error)/len(sum_error))
        
        tid_list.append(tid)
        time_list.append(time_)
        pred_lat_list.append(pred_lat)
        pred_long_list.append(pred_long)
        true_lat_list.append(true_lat)
        true_long_list.append(true_long)
        

tid_list_ =  [item for sublist in tid_list for item in sublist]
time_list_ =  [item for sublist in time_list for item in sublist]
pred_lat_list_ =  [item for sublist in pred_lat_list for item in sublist]
pred_long_list_ =  [item for sublist in pred_long_list for item in sublist]
true_lat_list_ =  [item for sublist in true_lat_list for item in sublist]
true_long_list_ =  [item for sublist in true_long_list for item in sublist]



Year: 2015
len(tid) =  908
len(pred_lat) = 908
avg lat: 0.7601423190028656
avg long: 0.9298767241087241
avg distance error: 138.34627224805465
Year: 2016
len(tid) =  489
len(pred_lat) = 489
avg lat: 0.8517363261591433
avg long: 0.9880436559883833
avg distance error: 148.74779501243424
Year: 2017
len(tid) =  544
len(pred_lat) = 544
avg lat: 0.8153262217255197
avg long: 1.0541587801540604
avg distance error: 153.21660248050534
Year: 2018
len(tid) =  806
len(pred_lat) = 806
avg lat: 0.8926407771430007
avg long: 1.1295992754233382
avg distance error: 164.3510691757483


In [10]:
id_key = pd.read_csv('../data/raw.csv', header=None)
track_data = [] 

for i in range(len(tid_list_)):

    if len(id_key[id_key[0] == str(tid_list_[i])][11].unique()) == 0:

        track_data.append([
            tid_list_[i], 
            id_key[id_key[0] == tid_list_[i]][11].unique()[0],
            time_list_[i],
            true_lat_list_[i],
            true_long_list_[i],
            pred_lat_list_[i],
            pred_long_list_[i]
            ])
        
    else:

        track_data.append([
            tid_list_[i], 
            id_key[id_key[0] == str(tid_list_[i])][11].unique()[0],
            time_list_[i],
            true_lat_list_[i],
            true_long_list_[i],
            pred_lat_list_[i],
            pred_long_list_[i]
            ])


import csv

file_path = "../results/QPA_track_data/track_data_classical_weight_sharing_1.csv"
# Define the column headers
headers = ["TID", "KEY", "TIME", "LAT", "LONG", "PRED_LAT", "PRED_LONG"] 

# Write to CSV
with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    
    # Write the header
    writer.writerow(headers)
    
    # Write the data rows
    writer.writerows(track_data)

print(f"CSV file has been written to {file_path}")


  id_key = pd.read_csv('../data/raw.csv', header=None)


CSV file has been written to ../results/QPA_track_data/track_data_classical_weight_sharing_1.csv
