In [5]:
import os
import random
import math
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import r2_score
from IPython.display import clear_output as clear
import statsmodels.api as sm

In [6]:
class MYDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.images = df.iloc[:,5:].values
        self.coef = df.iloc[:,1:5].values
        self.labels = df.iloc[:, 0].values
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]
        coef = self.coef[idx]
        
        image = torch.tensor(image, dtype=torch.float)
        label = torch.tensor(label, dtype=torch.float)
        coef = torch.tensor(coef, dtype=torch.float)

        return image, coef, label

In [36]:
varName = ['fCO2', 'Chl', 'Temp', 'Salt']

dataset = pd.read_csv("D://CO2_data4.csv", encoding="utf-8")
dataset = dataset.dropna()
dataset = dataset[dataset.index % 4 == 0]

df0 = dataset['date'].str.split("/",expand = True)
df0.columns = ['year', 'month', 'date']

dataset['month'] = df0['month']
dataset = dataset[dataset.month == '7']

train_li = random.sample([i for i in range(0, dataset.shape[0])], int(0.8 * dataset.shape[0]))
train_li.sort()

j = 0
test_li = []

for i in range(0, dataset.shape[0], 1):
    if i != train_li[j] | j >= len(train_li):
        test_li.append(i)
    else:
        j = j + 1

train_set = dataset.iloc[train_li, :]
test_set  = dataset.iloc[test_li,  :]

mean_li = []
std_li = []

for i in range(0, len(varName), 1):
    mean_li.append(train_set[varName[i]].mean())
    std_li.append(train_set[varName[i]].std())

for i in range(0, len(varName), 1):
    train_set.loc[:, varName[i]] = (train_set[varName[i]].copy() - mean_li[i] + 1.0) / std_li[i]
    test_set.loc[:, varName[i]] = (test_set[varName[i]].copy() - mean_li[i] + 1.0) / std_li[i]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set.loc[:, varName[i]] = (train_set[varName[i]].copy() - mean_li[i] + 1.0) / std_li[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.loc[:, varName[i]] = (test_set[varName[i]].copy() - mean_li[i] + 1.0) / std_li[i]


In [5]:
def compute_distances(P, C):
    A = (P**2).sum(axis=1, keepdims=True)
 
    B = (C**2).sum(axis=1, keepdims=True).T
 
    return np.sqrt(A + B - 2* np.dot(P, C.T))

In [26]:
test_set

Unnamed: 0,date,lon,lat,fCO2,Chl,Temp,Salt,month
12,1998/7/16,-116.75,28.25,1.241139,0.745263,0.936183,2.553028,7
28,1998/7/16,131.75,36.25,0.141260,0.905653,1.407171,1.894994,7
52,1998/7/16,145.25,41.75,-0.590107,0.939269,0.535674,1.684783,7
56,1998/7/16,170.75,37.75,0.578059,0.832415,0.870099,1.648535,7
72,1998/7/16,136.25,15.75,0.980314,0.667182,2.250066,1.510432,7
...,...,...,...,...,...,...,...,...
6368,2020/7/16,-178.25,43.25,0.770155,1.613718,0.046663,1.140538,7
6388,2020/7/16,-129.75,35.25,2.345260,0.713546,0.650646,0.983335,7
6432,2020/7/16,151.25,3.25,1.293473,0.670695,2.345796,0.513952,7
6440,2020/7/16,-171.75,50.75,1.726052,0.842360,-0.444046,0.418292,7


In [7]:
def process_df(my_set):
    temp_df = pd.DataFrame()


    dataset = my_set.reset_index(drop=True)
    ycor = dataset.lat
    #ycor = dataset.lon
    label = dataset.fCO2

    temp_df['label'] = label

    temp_df['beta'] = np.ones(dataset.shape[0])
    temp_df['Chl'] = dataset.Chl
    temp_df['Temp'] = dataset.Temp
    temp_df['Salt'] = dataset.Salt

    alist = dataset.lon
    temp = []
    for i in alist:
        if i < 0:
            i = i+360
        temp.append(i)
    xcor = temp

    cor_df = pd.DataFrame()
    cor_df['xcor'] = xcor
    cor_df['ycor'] = ycor

    a = [[110.0, 0.0], [290.0,0.0], [110.0, 70.0], [290.0, 70.0]]
    b = np.array(a)

    cor_li = cor_df.to_numpy()
    dis_li = compute_distances(cor_li, b)
    dis_df = pd.DataFrame(dis_li)
    temp_df = temp_df.join(dis_df)

    return temp_df



train_data = MYDataset(process_df(my_set=train_set))
test_data = MYDataset(process_df(my_set=test_set))
train_loader = DataLoader(train_data, batch_size=50, shuffle=True, num_workers=0, drop_last=True)
test_loader = DataLoader(test_data, batch_size=50, shuffle=False, num_workers=0)

In [8]:

dataset

Unnamed: 0,date,lon,lat,fCO2,Chl,Temp,Salt
0,1998/7/16,-110.25,22.25,1.429020,0.932204,1.035501,3.714037
1,1998/7/16,-111.75,22.25,0.653314,0.865872,0.980157,3.665900
2,1998/7/16,-111.25,23.25,1.835711,0.874641,0.896934,3.617013
3,1998/7/16,-112.25,23.25,1.925815,0.861615,0.827733,3.544569
4,1998/7/16,-112.25,24.25,1.836483,1.093919,0.710469,3.415906
...,...,...,...,...,...,...,...
6599,2020/7/16,137.75,13.25,1.178217,0.770692,1.481354,-0.398306
6600,2020/7/16,137.25,12.75,1.116893,0.768039,1.489461,-0.467678
6601,2020/7/16,137.75,8.75,0.849520,0.791038,1.508860,-0.471190
6602,2020/7/16,137.25,14.25,1.219482,0.770857,1.481271,-0.520788


In [9]:
class GNNWR(nn.Module):
    def __init__(self, insize, outsize):
        super(GNNWR, self).__init__()
        self.insize = insize
        self.outsize = outsize

        lastsize = self.insize
        thissize = 0
        self.fc = nn.Sequential()
        i = 2

        self.fc.add_module("full"+str(1), nn.Linear(4, 600))
        # self.fc.add_module("batc"+str(1), nn.BatchNorm1d(600))
        # self.fc.add_module("acti"+str(1), nn.PReLU(init=0.4))
        # self.fc.add_module("drop"+str(1), nn.Dropout(0.2))

        lastsize = 600
        while math.pow(2, int(math.log2(lastsize))) >= max(128, outsize + 1):
            if i == 1:
                thissize = int(math.pow(2, int(math.log2(lastsize))))
            else:
                thissize = int(math.pow(2, int(math.log2(lastsize)) - 1))
            
            self.fc.add_module("full"+str(i), nn.Linear(lastsize, thissize))
            self.fc.add_module("batc"+str(i), nn.BatchNorm1d(thissize))
            self.fc.add_module("acti"+str(i), nn.PReLU(init=0.4))
            
            self.fc.add_module("drop"+str(i), nn.Dropout(0.2))

            lastsize = thissize
            i = i + 1

        self.fc.add_module("full"+str(i), nn.Linear(lastsize, outsize))
        
    def forward(self, x):
        x = self.fc(x)
        return x

model = GNNWR(623, 4)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [18]:


r2 = 0
weightlist = []
for i in range(1,2):
    temp = []
    temp.append(-0.172075)
    temp.append(-0.175203)
    temp.append(0.294790)
    temp.append(0.385374)
    weightlist.append(temp)
out = nn.Linear(4, 1, bias = False)
out.weight = nn.Parameter(torch.tensor(weightlist), requires_grad=False)

def train(epoch):
    model.train()
    train_loss = 0
    global r2
    global out
    for data, coef, label in train_loader:
        data = data.view(data.shape[0], -1)
        label = label.view(data.shape[0], -1)
        optimizer.zero_grad()

        output = model(data)
        output = output.mul(coef)
        output = out(output)

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        a = output.view(-1).detach().numpy()
        b = label.view(-1).numpy()
        if epoch % 100 == 0:
            r2 = r2_score(a, b)

        train_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(train_loader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch, train_loss))

def val(epoch):
    model.eval()
    global out
    global r2
    val_loss = 0

    label_li = np.array([])
    out_li = np.array([])

    with torch.no_grad():
        for data, coef, label in test_loader:
            data = data.view(data.shape[0], -1)
            label = label.view(data.shape[0], -1)

            output = model(data)
            output = output.mul(coef)
            output = out(output)

            loss = criterion(output, label)

            a = output.view(-1).detach().numpy()
            b = label.view(-1).numpy()
            out_li = np.append(out_li, a)
            label_li = np.append(label_li, b)
            

            val_loss += loss.item()*data.size(0)
        val_loss = val_loss/len(test_loader.dataset)
        label_li = np.array(label_li).reshape(-1)
        out_li = np.array(out_li).reshape(-1)
        if epoch % 100 == 0:
            r2 = r2_score(out_li, label_li)
        #print(out_li)
        print('Epoch: {} \tTraining Loss: {:.6f} \tR2: {:.6f}'.format(epoch, val_loss, r2))
        

In [21]:
for epoch in tqdm(range(1, 1000+1)):
    train(epoch)
    val(epoch)
    if epoch % 2 ==0:
         clear()


 24%|██▎       | 237/1000 [03:06<11:43,  1.08it/s]

Epoch: 237 	Training Loss: 0.586746
Epoch: 237 	Training Loss: 0.502367 	R2: 0.334928


 24%|██▎       | 237/1000 [03:07<10:03,  1.26it/s]

Epoch: 238 	Training Loss: 0.598425





KeyboardInterrupt: 

In [53]:
len(test_loader.dataset)

1321