In [1]:
import os
import random
from Mydataset import MYDataset
import math
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import r2_score
from IPython.display import clear_output as clear
import statsmodels.api as sm
from SWNN import SWNN

In [2]:
# Column 0 is the regression target; the remaining columns are the predictors.
varName = ['fCO2', 'Chl', 'Temp', 'Salt']

# Location of the raw observations and the seed that fixes the train/test split.
DATA_PATH = "D://CO2_data5.csv"
SPLIT_SEED = 42

dataset = pd.read_csv(DATA_PATH, encoding="utf-8")
dataset = dataset.dropna()
# Keep the rows whose original index label is a multiple of 4. The explicit
# .copy() makes the result an independent frame, so adding the 'month' column
# below does not raise SettingWithCopyWarning.
dataset = dataset[dataset.index % 4 == 0].copy()

# 'date' is split on "/" into three fields labelled year / month / date.
df0 = dataset['date'].str.split("/", expand=True)
df0.columns = ['year', 'month', 'date']

dataset['month'] = df0['month']
# The month field is compared as a string: only rows where it is exactly '7' stay.
dataset = dataset[dataset.month == '7']

# 80/20 positional split. A dedicated seeded generator makes the split the same
# after every kernel restart without touching the global `random` state.
n_rows = dataset.shape[0]
split_rng = random.Random(SPLIT_SEED)
train_li = sorted(split_rng.sample(range(n_rows), int(0.8 * n_rows)))
test_li = sorted(set(range(n_rows)) - set(train_li))

train_set = dataset.iloc[train_li, :].copy()
test_set = dataset.iloc[test_li, :].copy()

# Scaling statistics are computed on the training split only and then applied
# to both splits, so no information from the test rows leaks into the scaling.
mean_li = [train_set[name].mean() for name in varName]
std_li = [train_set[name].std() for name in varName]

for i, name in enumerate(varName):
    # NOTE(review): the "+ 1.0" shift before dividing by the standard deviation
    # is kept exactly as in the original; confirm it is intentional, since plain
    # z-scoring would not include it.
    # Plain column assignment replaces the column outright, whereas
    # `.loc[:, name] = ...` tries to write floats into the existing column dtype.
    train_set[name] = (train_set[name] - mean_li[i] + 1.0) / std_li[i]
    test_set[name] = (test_set[name] - mean_li[i] + 1.0) / std_li[i]

del mean_li, std_li


In [3]:
def compute_distances(P, C):
    """Return the matrix of Euclidean distances between two sets of points.

    Parameters
    ----------
    P : numpy.ndarray of shape (n, d)
        One point per row.
    C : numpy.ndarray of shape (m, d)
        One point per row, same number of columns as ``P``.

    Returns
    -------
    numpy.ndarray of shape (n, m)
        Entry ``[i, j]`` is the distance between ``P[i]`` and ``C[j]``.
    """
    A = (P**2).sum(axis=1, keepdims=True)
    B = (C**2).sum(axis=1, keepdims=True).T

    # Expansion ||p - c||^2 = ||p||^2 + ||c||^2 - 2 p.c, evaluated for all pairs.
    squared = A + B - 2 * np.dot(P, C.T)
    # Floating-point cancellation can leave a tiny negative value when p and c
    # (nearly) coincide; clamp at zero so the square root never yields NaN.
    return np.sqrt(np.maximum(squared, 0))

In [6]:
# Number of row positions assigned to the test split (those not sampled for training).
len(test_li)

1621

In [4]:
def process_df(my_set, varName):
    """Assemble the model-input table for one data split.

    Parameters
    ----------
    my_set : pandas.DataFrame
        Split to convert; must provide ``lon``, ``lat`` and the columns
        named in ``varName``.
    varName : list of str
        ``varName[0]`` is copied into the ``label`` column and
        ``varName[1:4]`` are copied under their own names.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``beta`` (all ones), the ``varName[1:4]`` columns,
        followed by four columns ``0..3`` holding each row's distance to the
        four fixed reference points.
    """
    frame = my_set.reset_index(drop=True)

    features = pd.DataFrame({
        'label': frame[varName[0]],
        'beta': np.ones(frame.shape[0]),
    })
    features[varName[1:4]] = frame[varName[1:4]]

    # Negative longitudes are shifted by 360; all other values pass through unchanged.
    lon = frame.lon
    coords = pd.DataFrame({
        'xcor': lon.mask(lon < 0, lon + 360),
        'ycor': frame.lat,
    }).to_numpy()

    # Fixed reference points given as (x, y) pairs.
    anchors = np.array([[110.0, 0.0], [290.0, 0.0], [110.0, 70.0], [290.0, 70.0]])

    distances = pd.DataFrame(compute_distances(coords, anchors))
    return features.join(distances)



# Wrap each processed split in the project dataset class, then batch it.
train_data = MYDataset(
    process_df(train_set, varName),
    len(varName),
)
test_data = MYDataset(
    process_df(test_set, varName),
    len(varName),
)

# Training batches are reshuffled every epoch and a trailing partial batch is dropped.
train_loader = DataLoader(
    train_data,
    batch_size=128,
    shuffle=True,
    num_workers=0,
    drop_last=True,
)
# Evaluation keeps the stored order and includes every sample.
test_loader = DataLoader(
    test_data,
    batch_size=128,
    shuffle=False,
    num_workers=0,
)

In [5]:
# Formula of the form "<target>~<predictor>+<predictor>+...", built from varName:
# the first entry is the response, the remaining entries are the regressors.
relation = "{}~{}".format(varName[0], "+".join(varName[1:]))
# Ordinary least squares on the (already scaled) training split.
fit = sm.formula.ols(relation, data=train_set).fit()

In [6]:
# SWNN is a project-local network class; outsize=4 equals the input width of the
# nn.Linear(4, 1) layer that the training cell applies to the network output.
model = SWNN(outsize=4)
# Mean-squared-error objective, minimised with plain SGD at a learning rate of 0.1.
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [27]:
# Scratch cell: the string is only displayed, never assigned or used.
# NOTE(review): "+" and "~" are swapped here compared with the formula stored in
# `relation` ("fCO2~Chl+Temp+Salt"); this cell looks like a leftover experiment.
varName[0]+"+" + "~".join(varName[1:len(varName)])

'fCO2+Chl~Temp~Salt'

In [7]:
# Latest R2 value; train() and val() update it through `global r2`.
r2 = 0
temp = []
# A single row holding the fitted OLS parameters, in the order fit.params yields them.
weightlist = [[param for param in fit.params]]
# Bias-free linear layer whose weights are replaced by the OLS parameters and
# excluded from gradient updates (requires_grad=False).
out = nn.Linear(4, 1, bias=False)
out.weight = nn.Parameter(torch.tensor(weightlist), requires_grad=False)

def train(epoch):
    """Run one optimisation pass over ``train_loader`` and print the mean loss.

    For every batch the network output is multiplied element-wise by ``coef``
    and passed through the fixed linear layer ``out``; the result is compared
    with ``label`` using ``criterion`` and the optimiser takes one step.

    Parameters
    ----------
    epoch : int
        Current epoch number. When it is a multiple of 100 the global ``r2``
        is refreshed from each batch, so it ends up holding the value of the
        last batch processed.
    """
    global r2
    global out
    model.train()
    train_loss = 0
    seen = 0
    for data, coef, label in train_loader:
        data = data.view(data.shape[0], -1)
        label = label.view(data.shape[0], -1)
        optimizer.zero_grad()

        output = model(data)
        # NOTE(review): `coef` presumably holds the [1, predictors...] row for each
        # sample - confirm against MYDataset.
        output = output.mul(coef)
        output = out(output)

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        if epoch % 100 == 0:
            # The numpy conversion is only needed when R2 is actually computed.
            a = output.view(-1).detach().numpy()
            b = label.view(-1).numpy()
            # r2_score expects (y_true, y_pred): labels first, predictions second.
            r2 = r2_score(b, a)

        train_loss += loss.item()*data.size(0)
        seen += data.size(0)

    # The loader drops the last partial batch, so average over the samples that
    # were actually processed rather than over the whole dataset.
    train_loss = train_loss / max(seen, 1)
    print('\r Epoch: {} \tTraining Loss: {:.6f}'.format(epoch, train_loss))

def val(epoch):
    """Evaluate the model on ``test_loader`` and print loss and R2.

    Predictions are produced the same way as in training (network output
    multiplied by ``coef``, then passed through ``out``) but without gradient
    tracking and without parameter updates.

    Parameters
    ----------
    epoch : int
        Current epoch number. The global ``r2`` is recomputed over the whole
        test set when ``epoch`` is a multiple of 100; on other epochs the
        previously stored value is printed. On even epochs the cell output is
        cleared after printing.
    """
    global out
    global r2
    model.eval()
    val_loss = 0

    # Per-batch arrays are collected in lists and joined once, instead of
    # re-allocating a growing array with np.append on every batch.
    out_chunks = []
    label_chunks = []

    with torch.no_grad():
        for data, coef, label in test_loader:
            data = data.view(data.shape[0], -1)
            label = label.view(data.shape[0], -1)

            output = model(data)
            output = output.mul(coef)
            output = out(output)

            loss = criterion(output, label)

            out_chunks.append(output.view(-1).detach().numpy())
            label_chunks.append(label.view(-1).numpy())

            val_loss += loss.item()*data.size(0)

    val_loss = val_loss/len(test_loader.dataset)

    if epoch % 100 == 0:
        out_li = np.concatenate(out_chunks).astype(np.float64)
        label_li = np.concatenate(label_chunks).astype(np.float64)
        # r2_score expects (y_true, y_pred): labels first, predictions second.
        r2 = r2_score(label_li, out_li)

    # This is the loss on the held-out split, so it is reported as validation loss.
    print('\r Epoch: {} \tValidation Loss: {:.6f} \tR2: {:.6f}'.format(epoch, val_loss, r2))
    if epoch % 2 == 0:
        # Wipe the accumulated cell output every second epoch to keep it short.
        clear()
        

In [9]:
# 1000 epochs, numbered from 1: one training pass followed by one evaluation pass each.
for epoch in tqdm(range(1, 1001)):
    train(epoch)
    val(epoch)

100%|██████████| 1000/1000 [12:31<00:00,  1.33it/s]


In [None]:
# Debug peek at the label tensors coming out of the training loader.
# The row count is read from each batch rather than hard-coded as 50, which does
# not match the loader's batch_size of 128; this mirrors the reshape in train().
for data, coef, label in train_loader:
    print(label.view(label.shape[0], -1))