In [1]:
import os
import random
import math
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import r2_score
from IPython.display import clear_output as clear
import statsmodels.api as sm

In [2]:
class MYDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.images = df.iloc[:,5:].values
        self.coef = df.iloc[:,1:5].values
        self.labels = df.iloc[:, 0].values
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]
        coef = self.coef[idx]
        
        image = torch.tensor(image, dtype=torch.float)
        label = torch.tensor(label, dtype=torch.float)
        coef = torch.tensor(coef, dtype=torch.float)

        return image, coef, label

In [9]:
varName = ['fCO2', 'Chl', 'Temp', 'Salt']

dataset = pd.read_csv("D://CO2_data3.csv", encoding="utf-8")
dataset = dataset.dropna()
dataset = dataset[dataset.index % 4 == 0]

df0 = dataset['date'].str.split("/",expand = True)
df0.columns = ['year', 'month', 'date']

dataset['month'] = df0['month']
dataset = dataset[dataset.month == '7']

train_li = random.sample([i for i in range(0, dataset.shape[0])], int(0.8 * dataset.shape[0]))
train_li.sort()

j = 0
test_li = []

for i in range(0, dataset.shape[0], 1):
    if i != train_li[j] | j >= len(train_li):
        test_li.append(i)
    else:
        j = j + 1

train_set = dataset.iloc[train_li, :]
test_set  = dataset.iloc[test_li,  :]

mean_li = []
std_li = []

for i in range(0, len(varName), 1):
    mean_li.append(train_set[varName[i]].mean())
    std_li.append(train_set[varName[i]].std())

train_set = train_set.copy()
test_set = test_set.copy()

for i in range(0, len(varName), 1):
    train_set.loc[:, varName[i]] = (train_set[varName[i]].copy() - mean_li[i] + 1.0) / std_li[i]
    test_set.loc[:, varName[i]] = (test_set[varName[i]].copy() - mean_li[i] + 1.0) / std_li[i]



In [5]:
def compute_distances(P, C):
    A = (P**2).sum(axis=1, keepdims=True)
 
    B = (C**2).sum(axis=1, keepdims=True).T
 
    return np.sqrt(A + B - 2* np.dot(P, C.T))

In [4]:
test_set

Unnamed: 0,date,lon,lat,fCO2,Chl,Temp,Salt,month
132516,2010/7/16,173.25,55.25,0.379430,1.281975,-1.738287,0.386586,7
132520,2010/7/16,-168.75,66.75,-2.759659,2.232367,-1.903447,0.381619,7
132524,2010/7/16,173.75,56.75,0.305969,1.245255,-1.702248,0.349873,7
132528,2010/7/16,174.75,56.75,-0.467146,1.222401,-1.791050,0.301272,7
132532,2010/7/16,174.75,57.25,-0.404352,1.380749,-1.741096,0.270506,7
...,...,...,...,...,...,...,...,...
263880,2020/7/16,137.75,13.25,0.921513,0.937207,1.387684,-0.164779,7
263884,2020/7/16,137.25,12.75,0.869110,0.934032,1.396153,-0.232795,7
263888,2020/7/16,137.75,8.75,0.640634,0.961555,1.416420,-0.236239,7
263892,2020/7/16,137.25,14.25,0.956775,0.937405,1.387597,-0.284868,7


In [30]:
def process_df(my_set, varName):
    temp_df = pd.DataFrame()


    dataset = my_set.reset_index(drop=True)
    ycor = dataset.lat
    #ycor = dataset.lon


    temp_df['label'] = dataset[varName[0]]

    temp_df['beta'] = np.ones(dataset.shape[0])

    temp_df[varName[1:4]] = dataset[varName[1:4]]

    alist = dataset.lon
    temp = []
    for i in alist:
        if i < 0:
            i = i+360
        temp.append(i)
    xcor = temp

    cor_df = pd.DataFrame()
    cor_df['xcor'] = xcor
    cor_df['ycor'] = ycor

    a = [[110.0, 0.0], [290.0,0.0], [110.0, 70.0], [290.0, 70.0]]
    b = np.array(a)

    cor_li = cor_df.to_numpy()
    dis_li = compute_distances(cor_li, b)
    dis_df = pd.DataFrame(dis_li)
    temp_df = temp_df.join(dis_df)

    return temp_df



train_data = MYDataset(process_df(my_set=train_set, varName=varName))
test_data = MYDataset(process_df(my_set=test_set, varName=varName))
train_loader = DataLoader(train_data, batch_size=50, shuffle=True, num_workers=0, drop_last=True)
test_loader = DataLoader(test_data, batch_size=50, shuffle=False, num_workers=0)

In [31]:
process_df(my_set=train_set, varName=varName)

Unnamed: 0,label,beta,Chl,Temp,Salt,0,1,2,3
0,1.147607,1.0,1.083605,0.917563,3.877010,141.510159,45.990488,147.682514,62.450981
1,1.575923,1.0,1.002562,0.700559,3.710522,139.698336,48.224734,145.466921,63.012896
2,1.498905,1.0,1.269272,0.578081,3.584122,139.868242,48.714731,145.148631,62.274594
3,1.466422,1.0,1.039838,0.474396,3.564867,138.971670,49.830964,144.042095,62.594928
4,0.991787,1.0,0.983632,0.288800,3.383090,138.078329,50.947277,142.935737,62.933497
...,...,...,...,...,...,...,...,...,...
5278,0.931376,1.0,0.898172,1.383237,-0.163022,30.751016,152.825472,63.171394,162.482691
5279,0.878505,1.0,0.895126,1.391705,-0.231174,30.085295,153.281196,63.404456,163.126102
5280,0.647988,1.0,0.921531,1.411966,-0.234624,29.096821,152.501230,67.243029,164.108577
5281,0.966953,1.0,0.898361,1.383150,-0.283350,30.751016,153.413249,62.053404,162.605735


In [25]:
class GNNWR(nn.Module):
    def __init__(self, insize, outsize):
        super(GNNWR, self).__init__()
        self.insize = insize
        self.outsize = outsize

        lastsize = self.insize
        thissize = 0
        self.fc = nn.Sequential()
        i = 2

        self.fc.add_module("full"+str(1), nn.Linear(4, 600))
        # self.fc.add_module("batc"+str(1), nn.BatchNorm1d(600))
        # self.fc.add_module("acti"+str(1), nn.PReLU(init=0.4))
        # self.fc.add_module("drop"+str(1), nn.Dropout(0.2))

        lastsize = 600
        while math.pow(2, int(math.log2(lastsize))) >= max(128, outsize + 1):
            if i == 1:
                thissize = int(math.pow(2, int(math.log2(lastsize))))
            else:
                thissize = int(math.pow(2, int(math.log2(lastsize)) - 1))
            
            self.fc.add_module("full"+str(i), nn.Linear(lastsize, thissize))
            self.fc.add_module("batc"+str(i), nn.BatchNorm1d(thissize))
            self.fc.add_module("acti"+str(i), nn.PReLU(init=0.4))
            
            self.fc.add_module("drop"+str(i), nn.Dropout(0.2))

            lastsize = thissize
            i = i + 1

        self.fc.add_module("full"+str(i), nn.Linear(lastsize, outsize))
        
    def forward(self, x):
        x = self.fc(x)
        return x

model = GNNWR(623, 4)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [26]:


r2 = 0
weightlist = []
for i in range(1,2):
    temp = []
    temp.append(-0.172075)
    temp.append(-0.175203)
    temp.append(0.294790)
    temp.append(0.385374)
    weightlist.append(temp)
out = nn.Linear(4, 1, bias = False)
out.weight = nn.Parameter(torch.tensor(weightlist), requires_grad=False)

def train(epoch):
    model.train()
    train_loss = 0
    global r2
    global out
    for data, coef, label in train_loader:
        data = data.view(data.shape[0], -1)
        label = label.view(data.shape[0], -1)
        optimizer.zero_grad()

        output = model(data)
        output = output.mul(coef)
        output = out(output)

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        a = output.view(-1).detach().numpy()
        b = label.view(-1).numpy()
        if epoch % 100 == 0:
            r2 = r2_score(a, b)

        train_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(train_loader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch, train_loss))

def val(epoch):
    model.eval()
    global out
    global r2
    val_loss = 0

    label_li = np.array([])
    out_li = np.array([])

    with torch.no_grad():
        for data, coef, label in test_loader:
            data = data.view(data.shape[0], -1)
            label = label.view(data.shape[0], -1)

            output = model(data)
            output = output.mul(coef)
            output = out(output)

            loss = criterion(output, label)

            a = output.view(-1).detach().numpy()
            b = label.view(-1).numpy()
            out_li = np.append(out_li, a)
            label_li = np.append(label_li, b)
            

            val_loss += loss.item()*data.size(0)
        val_loss = val_loss/len(test_loader.dataset)
        label_li = np.array(label_li).reshape(-1)
        out_li = np.array(out_li).reshape(-1)
        if epoch % 100 == 0:
            r2 = r2_score(out_li, label_li)
        #print(out_li)
        print('Epoch: {} \tTraining Loss: {:.6f} \tR2: {:.6f}'.format(epoch, val_loss, r2))
        

In [32]:
for epoch in tqdm(range(1, 1000+1)):
    train(epoch)
    val(epoch)
    if epoch % 2 ==0:
         clear()


 11%|█         | 106/1000 [01:17<10:57,  1.36it/s]


KeyboardInterrupt: 

In [28]:
for data, coef, label in train_loader:
    print(data.view(50,-1))

tensor([[139.7878,  77.9463, 154.8729],
        [ 92.2801, 101.1960,  89.3903],
        [146.8728,  48.8531, 144.3490],
        [149.1446,  47.2136, 146.8983],
        [ 65.4647, 128.5715,  68.8522],
        [139.3687,  76.0830, 153.3578],
        [141.5914,  53.3210, 138.9717],
        [ 95.4234,  98.0338,  91.8729],
        [102.5725,  94.1070, 112.8101],
        [105.5728,  89.7810, 112.4750],
        [135.9527,  59.1450, 134.7892],
        [ 67.5731, 126.8902,  68.3456],
        [153.8770,  68.9429, 166.4426],
        [ 95.2162,  98.1892,  92.0387],
        [ 98.2503,  95.5935, 103.2866],
        [ 94.5549,  98.8212,  91.7367],
        [101.0105,  92.5101,  95.8547],
        [ 97.7247,  95.5255,  99.6751],
        [147.2859,  74.3514, 161.6729],
        [119.4702,  80.3625,  99.8405],
        [149.2753,  73.9468, 163.7013],
        [ 90.3528, 106.2950,  78.5406],
        [129.2115,  64.0361, 120.9571],
        [137.1373,  57.3727, 134.6909],
        [ 93.3173, 100.3151,  98.9602],
