In [40]:
import numpy as np
import pandas as pd
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils import data
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch.optim import lr_scheduler
from torch.optim import lr_scheduler
import time
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.data import random_split

global cr
cr = 0.96


def normalization(x: list):
    M, m = np.max(x), np.min(x)
    for i in range(len(x)):
        x[i] = (x[i] - (M + m) / 2) / ((M - m) / 2)
    # x in [-1, 1]
    return M, m, x

def ArrNorm(x: np.ndarray):
    assert isinstance(x, np.ndarray), "We need a list"
    M_list, m_list, res = [], [], []
    for i in range(x.shape[0]):
        u = x[i].tolist()
        M, m, t = normalization(u)
        res.append(t)
        M_list.append(M)
        m_list.append(m)
    return M_list, m_list, np.array(res)


def df2arr(x) -> np.ndarray:
    return np.array(x, dtype=np.float32)



# one train demo

class Try(nn.Module):
    def __init__(self, seq, batch_size, scale=0):
        super(Try, self).__init__()
        self.scale = scale
        self.seq = seq
        self.batch_size = batch_size
        self.linear = nn.Sequential(
            nn.Linear((self.seq+1)*12, (self.seq+1)*6),
            nn.Dropout(0.5),
            nn.Sigmoid(),

            nn.Linear((self.seq+1)*6, (self.seq+1)*6),
            nn.Dropout(0.5),
            nn.ReLU(inplace=True),

        )
        self.conv1 = nn.Sequential(
            # seq * 5 
            nn.Conv2d(1, 2, kernel_size=(3,3), padding=2, bias=False), # (seq+2) * 7
            nn.BatchNorm2d(2),
            nn.ReLU(inplace=True),

            nn.Conv2d(2, 2, kernel_size=(3,3), padding=1, bias=False), # (seq+2) * 7
            nn.BatchNorm2d(2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2,2), stride=1), # (seq+1) * 6
            
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(1, 30, kernel_size=(2,2), padding=0, bias=False), # seq * 5
            nn.BatchNorm2d(30),
            nn.ReLU(inplace=True),

            nn.Conv2d(30, 1, kernel_size=(1,1), padding=0, bias=True), # seq * 5
            nn.ReLU(inplace=True)

        )

    def forward(self, x):
        out = self.conv1(x)
        out = out.view(out.size()[0], -1)
        out = self.linear(out)
        # print(out.shape)
        with torch.no_grad():
            out = out.reshape(self.batch_size, 1, self.seq+1, 6)
        out = self.conv2(out)
        assert out.shape==x.shape, "Shape Unequal Error."
        return out + x
    
    

import math
import numpy as np
from scipy import stats
def rsquared(x, y): 
    assert x.shape==y.shape, "Unequal Shape Error"
    r, x, y = [], x.detach(), y.detach()
    x, y = x.mean(dim=[0,1], keepdim=False), y.mean(dim=[0,1], keepdim=False)
    for i in range(x.shape[-1]):
        _, _, r_value, _, _ = stats.linregress(x[:][i].detach().numpy(), y[:][i].detach().numpy()) 
        r.append(r_value ** 2)
    return r


In [41]:
excel = pd.read_excel('/kaggle/input/a32-data/A32.xlsx', header=None)
excel.shape

(4459, 18)

In [42]:
sp = [1486, 2972, 4458]
station_1 = excel.iloc[1:sp[0]+1,1:6]
station_2 = excel.iloc[sp[0]+1:sp[1]+1,1:6]
standard = excel.iloc[sp[1]+1:sp[2]+1,1:6]
standard.shape

(1486, 5)

In [43]:
station_1 = df2arr(station_1)
station_2 = df2arr(station_2)
standard = df2arr(standard)
station_1.shape, station_2.shape, standard.shape

((1486, 5), (1486, 5), (1486, 5))

In [44]:
s1_minus_sd = station_1 - standard
s2_minus_sd = station_2 - standard
s1_div_sd = station_1 / standard
s2_div_sd = station_2 / standard

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [45]:
s1_minus_sd.shape, s2_minus_sd.shape

((1486, 5), (1486, 5))

In [46]:
s1_M, s1_m, s1 = ArrNorm(station_1)
s2_M, s2_m, s2 = ArrNorm(station_2)

In [47]:
def GetDataset(input_arr: list, output_arr: list, seq: int):
    assert(len(input_arr)==len(output_arr)), "Different size of input and output!"
    Input = []
    Output = []
    for i in range(input_arr.shape[0]-seq):
        Input.append(input_arr[i:i+seq][:])
        Output.append(output_arr[i:i+seq][:])
    return torch.tensor(Input, dtype=torch.float32), torch.tensor(Output, dtype=torch.float32)

        
def load_array(data_arrays, batch_size, is_train=True):
    # data-iter
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

s1_minus_sd.shape

(1486, 5)

In [48]:
# global sequence, batch_size
# batch_size = 4
# sequence = 6

# global lr, num_epoch
# lr, num_epoch = 0.000001, 100
# status_list = ['MINUS', 'DIVIDE']
status_list = ["MINUS"]
global record
record_R = []
record_con = []

def train(status, batch_size, sequence, lr, num_epoch, ga):
    string = "&"+str(status)+"&batch_size="+str(batch_size)+"&sequence="+str(sequence)+"&lr="+str(lr)+"&num_epoch"+str(num_epoch)+"&gama="+str(ga)+"&"
    
    Input_Data_1, Output_Data_1 = GetDataset(s1, s1_minus_sd, sequence) if status=="MINUS" else GetDataset(s1,s1_div_sd, sequence)
    Input_Data_2, Output_Data_2 = GetDataset(s2, s2_minus_sd, sequence) if status=="MINUS" else GetDataset(s2,s2_div_sd, sequence)
    Input_Data_1.shape, Input_Data_2.shape

    Input_Data_1 = Input_Data_1.unsqueeze(1)
    Output_Data_1 = Output_Data_1.unsqueeze(1)
    Input_Data_2 = Input_Data_2.unsqueeze(1)
    Output_Data_2 = Output_Data_2.unsqueeze(1)
    Input_Data_1.shape, Input_Data_1.shape


    data_tot_1 = torch.utils.data.TensorDataset(Input_Data_1, Output_Data_1)
    data_tot_2 = torch.utils.data.TensorDataset(Input_Data_2, Output_Data_2)
    train_size = int(Input_Data_1.shape[0] * cr)
    test_size = Input_Data_1.shape[0] - train_size
    train_set_1, test_set_1 = random_split(data_tot_1,[train_size,test_size],
                                        torch.Generator().manual_seed(0))
    train_set_2, test_set_2 = random_split(data_tot_2,[train_size,test_size],
                                        torch.Generator().manual_seed(0))
    # DataIter = load_array((Input_Data_1, Output_Data_1), batch_size=8)




    Data_Iter_1 = DataLoader(dataset=train_set_1, batch_size=batch_size, shuffle=True, drop_last=True)
    Data_Iter_2 = DataLoader(dataset=train_set_2, batch_size=batch_size, shuffle=True, drop_last=True)

    for i, dt in enumerate(Data_Iter_1):
        if dt[0].shape[0]!=batch_size:
            print(dt[0].shape)
            print(i, batch_size, dt[0].shape[0], dt)

    for i, dt in enumerate(Data_Iter_2):
        if dt[0].shape[0]!=batch_size:
            print(dt[0].shape)
            print(i, batch_size, dt[0].shape[0], dt)

    x_plt, train_loss_plt = [], []


    net = Try(batch_size=batch_size, seq=sequence).cuda()
    Loss = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr)
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=ga)

    def Iter(num_epoch):
        cnt = 0
        while cnt < num_epoch:
            yield cnt
            cnt += 1

    print("\nStart Training with condition: "+string)
    print("Round 1...")
    for epoch in range(num_epoch):
        epoch_start_time = time.time()
        train_loss = 0.0
        net.train()
        for i, use in enumerate(tqdm(Data_Iter_1)):
            optimizer.zero_grad()
            # if use[0].shape[0]==2:
            #     print(use[0])
            train_pred = net(use[0].cuda())    # use[0].cuda()

            batch_loss = Loss(train_pred, use[1].cuda())   # use[1].cuda()
            batch_loss.backward()
            optimizer.step()
#             R2 = R_square(train_pred.cpu(), use[1].cpu())

            train_loss += batch_loss.item()

        train_loss = train_loss / train_size
        x_plt.append(epoch+1)
        train_loss_plt.append(train_loss)
        print("Round-1 --- [%2d|%2d] %.2f(s) Train_Loss=%.6f "%\
                (epoch+1,num_epoch,time.time()-epoch_start_time,train_loss),end='')
    #   epoch_start_time = time.time()
        scheduler.step()  

    plt.figure(1)
    plt.plot(x_plt,train_loss_plt,'rs-',label='all_train_loss')
    plt.show()

    print("Round 2...")
    for epoch in range(num_epoch):
        epoch_start_time = time.time()
        train_loss = 0.0
        net.train()
        for i, use in enumerate(tqdm(Data_Iter_2)):
            optimizer.zero_grad()
            # if use[0].shape[0]==2:
            #     print(use[0])
            train_pred = net(use[0].cuda())    # use[0].cuda()

            batch_loss = Loss(train_pred, use[1].cuda())   # use[1].cuda()
            batch_loss.backward()
            optimizer.step()
            # R2 = rsquared(train_pred.cpu(), use[1].cpu())

            train_loss += batch_loss.item()

        train_loss = train_loss / train_size
        x_plt.append(epoch+num_epoch+1)
        train_loss_plt.append(train_loss)
        print("Round 2 --- [%2d|%2d] %.2f(s) Train_Loss=%.6f "%\
                (epoch+1,num_epoch,time.time()-epoch_start_time,train_loss),end='')
    #   epoch_start_time = time.time()
        scheduler.step() 


    plt.figure(2)
    plt.plot(x_plt,train_loss_plt,'rs-',label='all_train_loss')
    plt.show()

    torch.save(net.state_dict(), '/kaggle/working/'+string+".pt")
    print("Parameters Saved.")


    Test_Iter_1 = DataLoader(dataset=test_set_1, batch_size=1, shuffle=False, drop_last=True)
    model = Try(batch_size=1, seq=sequence)
    model.load_state_dict(torch.load('/kaggle/working/'+string+".pt"))
    model = model.cuda()
    model.eval()
    R_list = [[],[],[],[],[]]
    def app(R: list)-> list:
        assert len(R)==5
        for i in range(len(R_list)):
            R_list[i].append(R[i])

    with torch.no_grad():
        for i, use in enumerate(Test_Iter_1):
            pred = model(use[0].cuda())
            R = rsquared(pred.cpu(), use[1])
            print(i, R)
            for i in range(5):
                R_list[0].append(R)
                           
                           
    Test_Iter_2 = DataLoader(dataset=test_set_2, batch_size=1, shuffle=False, drop_last=True)

    with torch.no_grad():
        for i, use in enumerate(Test_Iter_2):
            pred = model(use[0].cuda())
    #         print(pred.shape, use[1].shape)
            R = rsquared(pred.cpu(), use[1])
            print(i, R)
            for i in range(5):
                R_list[0].append(R)
    
    record_R.append(R_list)
    record_con.append(string)




In [49]:
# train('MINUS',10,6,0.0001,5,0.9)

In [50]:
# record_R, record_con

In [51]:
for status in status_list:
    for num_epoch in [500,1000,1500]:
        for ga in [0.80,0.85,0.90]:
            for batch_size in [2,4,8,16,32,64]:
                for lr in [0.00001,0.0001,0.0001,0.001,0.03]:
                    for sequence in [4,5,6,7,8,9,10]:
                        train(status, batch_size, sequence, lr, num_epoch, ga)


Start Training with condition: &MINUS&batch_size=2&sequence=4&lr=1e-05&num_epoch500&gama=0.8&
Round 1...


100%|██████████| 711/711 [00:01<00:00, 381.90it/s]


Round-1 --- [ 1|500] 1.87(s) Train_Loss=111.250899 

 75%|███████▍  | 531/711 [00:01<00:00, 380.45it/s]


KeyboardInterrupt: 