In [None]:
!pip install gdown

!if [ ! -e "stocks" ]; then \
  gdown --id '1g5c7kmbf2Xqp8O6ghPxmx0RaOjWc6SBu' --output "stocks.zip" ;\
  unzip -q "stocks.zip" ;\
fi

In [None]:
# Show the GPU(s) visible to this runtime; the command fails on machines without the NVIDIA tools.
!nvidia-smi

In [None]:
!if [ ! -e "stock_num.txt" ]; then \
  gdown --id '18jgYbCxvOzHWVwwFK3G-o1yU34GUnuj1' --output "stock_num.txt" ;\
fi

In [None]:
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset, Subset, SubsetRandomSampler

# KFold
# KFOLD Reference: https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-k-fold-cross-validation-with-pytorch.md
from sklearn.model_selection import KFold

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

In [None]:
# Read the stock IDs, one per line, dropping surrounding whitespace/newlines.
with open('stock_num.txt') as f:
    stock_num = [raw_line.strip() for raw_line in f.readlines()]

In [None]:
# Keep only the stocks whose CSV can be read and that have at least 500 rows of history.
stock_num_mod = []
for stock_id in stock_num:
    path = 'stocks/' + stock_id + '.csv'
    try:
        stock = pd.read_csv(path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Missing, empty or malformed files are skipped, but the actual reason is reported
        # instead of assuming the file was empty (a bare `except` also hid Ctrl-C).
        print(path, " has been skipped:", err)
        continue
    if len(stock) >= 500:
        stock_num_mod.append(stock_id)

print(len(stock_num_mod))

In [None]:
def same_seed(seed):
    '''Seed NumPy and PyTorch (CPU and, if present, every CUDA device) and
    switch cuDNN to deterministic, non-benchmarking mode.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

def train_valid_split(data_set, valid_ratio, seed):
    '''Randomly partition `data_set` into a training part and a validation part.

    The validation part holds int(valid_ratio * len(data_set)) items; the split is
    driven by a dedicated torch.Generator seeded with `seed`. Both parts are
    returned as NumPy arrays.
    '''
    n_total = len(data_set)
    n_valid = int(valid_ratio * n_total)
    lengths = [n_total - n_valid, n_valid]
    rng = torch.Generator().manual_seed(seed)
    train_part, valid_part = random_split(data_set, lengths, generator=rng)
    return np.array(train_part), np.array(valid_part)

def predict(test_loader, model, device):
    '''Run `model` over every batch of `test_loader` and return the stacked outputs.

    The model is put in eval mode, gradients are disabled, and each batch output is
    moved to the CPU before all of them are concatenated along dim 0 into a NumPy array.
    '''
    model.eval()
    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            output = model(batch.to(device))
            batch_outputs.append(output.detach().cpu())
    return torch.cat(batch_outputs, dim=0).numpy()

In [None]:
class My_Model(nn.Module):
    '''Small fully connected network: input_dim -> 8 -> 4 -> 1 with ReLU in between.

    The last layer has no activation, so forward() returns one raw score per row
    with shape (batch, 1).
    '''

    def __init__(self, input_dim):
        super().__init__()
        # Module order (and therefore the state_dict keys layers.0/2/4) must stay
        # stable, because checkpoints are saved and reloaded by key.
        stack = [
            nn.Linear(input_dim, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        return self.layers(x)

In [None]:
class Stock:
    '''Loads 'stocks/<id>.csv' files and appends technical-indicator columns.

    The CSV is expected to provide at least the columns 'max', 'min' and 'close'
    (those are the only ones read here).
    '''
    def __init__(self, *stock_numbers):
        # Stock IDs as strings; each one is turned into the path 'stocks/<id>.csv'.
        self.stock_numbers = stock_numbers    
    def scrape(self):
        '''Compute indicator columns for each stock and return the frame from row 40 on.

        Added columns: inc, max9, min9, rsv, k, d, j, avg, bias, k1, d1, j1, k2, d2, j2,
        and, only when the file has at least 160 rows, ema12, ema26, dif, macd,
        dif-macd and pre.

        Warm-up positions are filled with the string "0" (or "50" for the first k/d/j
        value), so the new columns mix strings and floats until they are cast.

        NOTE(review): the `return` sits after the loop, so when several stock numbers
        were given only the frame of the last one is returned. Files with fewer than
        9 rows would produce lists whose length does not match the frame.
        '''

        for stock_number in self.stock_numbers:
            path = 'stocks/' + stock_number + '.csv'           
            data = pd.read_csv(path)
            
            # One list per derived column; each is grown to len(data) entries below.
            incl = list()
            prel = list()
            max9 = list()
            min9 = list()            
            rsvl = list()
            kl = list()
            dl = list()
            jl = list()
            avgl = list()
            biasl = list()
            # Rows 0..7 have no complete 9-row window yet: pad with "0".
            for i in range(8):
                max9.append("0")
                min9.append("0")                
                rsvl.append("0")
                
            # k/d/j get 7 placeholders plus a seed value of "50" at index 7 (appended below).
            for i in range(7):
                kl.append("0")
                dl.append("0")
                jl.append("0")
            
            # inc, avg and bias have no value for the very first row.
            incl.append("0")
            avgl.append("0")
            biasl.append("0")
                
            kl.append("50")
            dl.append("50")
            jl.append("50")
                
            # Rolling 9-row high/low: the value for rows i..i+8 lands at list index i+8.
            for i in range(len(data)-8):
                dmax = data[['max']].iloc[i:i+9].max().to_string(index=False)
                max9.append(dmax)
                    
                dmin = data[['min']].iloc[i:i+9].min().to_string(index=False)
                min9.append(dmin)
            
            # rsv = (close - min9) / (max9 - min9); k, d are 2/3-1/3 smoothed, j = 3d - 2k.
            for i in range(8, len(data)):
                close = data[['close']].iloc[i].to_string(index=False)
                if float(max9[i])-float(min9[i]) != 0.0:
                    rsv = ((float(close)-float(min9[i]))/(float(max9[i])-float(min9[i])))                  
                # Flat window (max9 == min9): fall back to the previous k scaled to 0..1.
                else: rsv = float(kl[i-1])/100
                k = ((2/3)*float(kl[i-1]))+((100/3)*rsv)
                d = ((2/3)*float(dl[i-1]))+((1/3)*k)
                j = ((3*d)-(2*k))
                rsvl.append(rsv)
                kl.append(k)
                dl.append(d)
                jl.append(j)
            
            # Percent change between consecutive closes. incl has a leading "0", so the
            # change from row i to i+1 is stored at row i+1; prel has no leading entry,
            # so the same value is stored at row i (i.e. the change to the following row).
            for i in range(len(data)-1):
                close1 = data[['close']].iloc[i].to_string(index=False)
                close2 = data[['close']].iloc[i+1].to_string(index=False)
                if float(close1) != 0.0:
                    inc = (100*((float(close2)-float(close1))/float(close1)))
                    incl.append(inc)
                    prel.append(inc)
                else:
                    incl.append("0")
                    prel.append("0")
                # Mean of the closes of rows i and i+1, stored at row i+1.
                avg = data[['close']].iloc[i:i+2].mean().to_string(index=False)

                
                avgl.append(avg)
            
            # bias = percent deviation of the close from the 2-row average of the same row.
            for i in range(1, len(data)):
                close = data[['close']].iloc[i].to_string(index=False)
                if float(avgl[i]) != 0.0:
                    bias = (100*((float(close)-float(avgl[i]))/float(avgl[i])))
                    biasl.append(bias)
                else: biasl.append("0")
            
            # The last row has no following row, so its 'pre' value is "0".
            prel.append("0")
            
            # k1/d1/j1 are k/d/j shifted down by one row; k2/d2/j2 by two rows.
            k1l = kl.copy()
            d1l = dl.copy()
            j1l = jl.copy()
            k2l = kl.copy()
            d2l = dl.copy()
            j2l = jl.copy()         
            
            k1l.insert(0, "0")
            d1l.insert(0, "0")
            j1l.insert(0, "0")
            k1l.pop()
            d1l.pop()
            j1l.pop()
            
            for i in range(2):
                k2l.insert(0, "0")
                d2l.insert(0, "0")
                j2l.insert(0, "0")
                k2l.pop()
                d2l.pop()
                j2l.pop()
                        
            data['inc'] = incl            
            data['max9'] = max9
            data['min9'] = min9
            data['rsv'] = rsvl
            data['k'] = kl
            data['d'] = dl
            data['j'] = jl
            data['avg'] = avgl
            data['bias'] = biasl
            data['k1'] = k1l
            data['d1'] = d1l
            data['j1'] = j1l
            data['k2'] = k2l
            data['d2'] = d2l
            data['j2'] = j2l
            
            # The EMA / MACD family (and 'pre') is only added for files with >= 160 rows.
            if len(data)>=160:
                ema12l = list()
                ema26l = list()
                difl = list()
                macdl = list()
                dif_macdl = list()
            
                # Placeholders up to the seed positions (index 11 for ema12, 25 for ema26/dif).
                for i in range(11):
                    ema12l.append("0")
                
                for i in range(25):
                    ema26l.append("0")
                    difl.append("0")                    
                
                # Seeds: (mean max + mean min + 2 * mean close) / 4 over the first 12 / 26 rows.
                di12 = (float(data[['max']].iloc[:12].mean().to_string(index=False))+float(data[['min']].iloc[:12].mean().to_string(index=False))+float(data[['close']].iloc[:12].mean().to_string(index=False))*2)/4
                di26 = (float(data[['max']].iloc[:26].mean().to_string(index=False))+float(data[['min']].iloc[:26].mean().to_string(index=False))+float(data[['close']].iloc[:26].mean().to_string(index=False))*2)/4

                
                ema12l.append(di12)
                ema26l.append(di26)
                
                # Recursive update on di = (2*close + max + min) / 4 with weights 11/13 and 2/13.
                for i in range(12, len(data)):
                    close = data[['close']].iloc[i].to_string(index=False)
                    max1 = data[['max']].iloc[i].to_string(index=False)
                    min1 = data[['min']].iloc[i].to_string(index=False)
                    di = (float(close)*2+float(max1)+float(min1))/4
                    ema12today = (float(ema12l[i-1])*11 + di*2)/13
                    ema12l.append(ema12today)
                
                # Same recursion with weights 25/27 and 2/27.
                for i in range(26, len(data)):
                    close = data[['close']].iloc[i].to_string(index=False)
                    max1 = data[['max']].iloc[i].to_string(index=False)
                    min1 = data[['min']].iloc[i].to_string(index=False)
                    di = (float(close)*2+float(max1)+float(min1))/4
                    ema26today = (float(ema26l[i-1])*25 + di*2)/27
                    ema26l.append(ema26today)
                
                # dif = ema12 - ema26, available from row 25 on.
                for i in range(25, len(data)):
                    difl.append(float(ema12l[i])-float(ema26l[i]))
                    
                for i in range(33):
                    macdl.append("0")
                    
                # macd is seeded at row 33 with the mean of dif over rows 25..33 ...
                macdl.append(sum(difl[25:34])/9)
                
                # ... and then smoothed with weights 8/10 and 2/10.
                for i in range(34, len(data)):
                    macdtoday = (float(macdl[i-1])*8 + float(difl[i])*2)/10
                    macdl.append(macdtoday)
                    
                # dif - macd for every row; placeholder rows evaluate to 0.0 - 0.0 or dif - 0.0.
                for i in range(len(data)):
                    dif_macdl.append(float(difl[i])-float(macdl[i]))
                 
                    
                data['ema12'] = ema12l
                data['ema26'] = ema26l
                data['dif'] = difl
                data['macd'] = macdl
                data['dif-macd'] = dif_macdl
                
                data['pre'] = prel
            
        # Drop the first 40 rows, which covers every placeholder position written above.
        return data.iloc[40:]

In [None]:
# Build the indicator frame for a single stock (ID '2330') to explore the features.
stock = Stock('2330')
data = stock.scrape()

In [None]:
# Row-to-row change of K: current K minus the previous row's K (kdif), and the
# previous row's K minus the one two rows back (kdif2).
kdif = [float(cur) - float(prev) for cur, prev in zip(data['k'], data['k1'])]
kdif2 = [float(prev) - float(older) for prev, older in zip(data['k1'], data['k2'])]
data['kdif'] = kdif
data['kdif2'] = kdif2

In [None]:
# Preview the first rows of the engineered indicator frame.
data.head()

In [None]:
# List every column now present in the frame (original CSV columns plus derived ones).
column_names = list(data.columns.values)
column_names

In [None]:
# (rows, columns) of the frame after the first 40 rows were dropped by scrape().
data.shape

In [None]:
# Inspect dtypes: derived columns were built from lists mixing strings and floats.
data.dtypes

In [None]:
# Remove the identifier column before the correlation analysis. Rebinding instead of
# dropping in place, with errors='ignore', keeps the cell safe to re-run: the in-place
# version raised KeyError on the second execution.
data = data.drop(columns='stock_id', errors='ignore')

In [None]:
# Cast the derived indicator columns (built from mixed str/float lists) to float.
float_columns = [
    'inc', 'pre', 'max9', 'min9', 'rsv',
    'k', 'd', 'j', 'avg', 'bias',
    'ema12', 'ema26', 'dif', 'macd',
    'k1', 'd1', 'j1', 'k2', 'd2', 'j2',
]
data = data.astype({name: 'float' for name in float_columns})

In [None]:
# Pairwise correlation matrix of the frame's columns.
data.corr()

In [None]:
# Draw the same correlation matrix as an annotated heatmap on a large canvas.
plt.figure(figsize=(20, 19))
p = sns.heatmap(data.corr(), annot=True)

In [None]:
# Build fixed-size train / validation / test matrices from every usable stock.
#
# For each stock, the last SAMPLES_VALID rows are validation targets, the SAMPLES_TRAIN
# rows before them are training targets, and one test row describes the most recent
# WINDOW rows. Each sample has 5 indicators (k, d, j, bias, dif - macd) x WINDOW
# look-back rows, most recent first. The target is the relative change of the close
# between the row before the target row and the target row itself.
SAMPLES_TRAIN = 200
SAMPLES_VALID = 50
WINDOW = 5
N_FEATURES = 5 * WINDOW


def _window_features(indicators, n):
    '''Return the feature vector built from the WINDOW rows preceding position `n`.

    indicators: list of 1-D float arrays, one per indicator.
    Indicator g fills slots g*WINDOW .. g*WINDOW + WINDOW - 1 with rows n-1, n-2, ...
    '''
    row = np.empty(N_FEATURES, dtype=float)
    for g, values in enumerate(indicators):
        row[g * WINDOW:(g + 1) * WINDOW] = values[n - WINDOW:n][::-1]
    return row


# For a quick debugging run, replace len(stock_num_mod) with a small number such as 10.
n_stocks = len(stock_num_mod)

# Zero-initialised (not np.empty) so that a label skipped because the previous close
# is 0 becomes a well-defined 0 instead of uninitialised memory.
x = np.zeros([n_stocks * SAMPLES_TRAIN, N_FEATURES], dtype = float)
y = np.zeros([n_stocks * SAMPLES_TRAIN, 1], dtype = float)
xv = np.zeros([n_stocks * SAMPLES_VALID, N_FEATURES], dtype = float)
yv = np.zeros([n_stocks * SAMPLES_VALID, 1], dtype = float)
test = np.zeros([n_stocks, N_FEATURES], dtype = float)

for i in range(n_stocks):
    stock = Stock(stock_num_mod[i])
    data = stock.scrape()
    n_rows = len(data)
    if n_rows < SAMPLES_TRAIN + SAMPLES_VALID + WINDOW:
        # Fewer rows would make the look-back window wrap around to the end of the frame.
        raise ValueError(
            'stock {} has only {} usable rows'.format(stock_num_mod[i], n_rows))

    # Convert each column once; scrape() leaves them as mixed str/float objects.
    close = data['close'].astype(float).to_numpy()
    indicators = [
        data['k'].astype(float).to_numpy(),
        data['d'].astype(float).to_numpy(),
        data['j'].astype(float).to_numpy(),
        data['bias'].astype(float).to_numpy(),
        data['dif'].astype(float).to_numpy() - data['macd'].astype(float).to_numpy(),
    ]

    # Training samples: target rows n_rows-51, n_rows-52, ..., n_rows-250.
    for offset in range(SAMPLES_TRAIN):
        n = n_rows - SAMPLES_VALID - 1 - offset
        row = i * SAMPLES_TRAIN + offset
        if close[n - 1] != 0.0:
            y[row] = (close[n] - close[n - 1]) / close[n - 1]
        x[row] = _window_features(indicators, n)

    # Validation samples: target rows n_rows-1, ..., n_rows-50.
    for offset in range(SAMPLES_VALID):
        n = n_rows - 1 - offset
        row = i * SAMPLES_VALID + offset
        if close[n - 1] != 0.0:
            yv[row] = (close[n] - close[n - 1]) / close[n - 1]
        xv[row] = _window_features(indicators, n)

    # Test sample: the most recent WINDOW rows for every indicator. The dif - macd
    # slots previously used the stale loop variable `n` and were taken from about
    # 50 rows earlier than the other four indicators.
    test[i] = _window_features(indicators, n_rows)

In [None]:
# Give the matrices their conventional names (these are aliases, not copies).
X_train, Y_train = x, y
X_dev, Y_dev = xv, yv
X_test = test

# Turn the relative price change into a 0/1 label in place: 1 if strictly positive, else 0.
for labels in (Y_train, Y_dev):
    labels[...] = labels > 0

train_size = X_train.shape[0]
dev_size = X_dev.shape[0]
test_size = X_test.shape[0]
data_dim = X_train.shape[1]
for template, value in (
    ('Size of training set: {}', train_size),
    ('Size of development set: {}', dev_size),
    ('Size of testing set: {}', test_size),
    ('Dimension of data: {}', data_dim),
):
    print(template.format(value))

In [None]:
#y1 = 100*y
#yv1 = 100*yv

In [None]:
def _normalize(X, train = True, specified_column = None, X_mean = None, X_std = None):
    # This function normalizes specific columns of X.
    # The mean and standard variance of training data will be reused when processing testing data.
    #
    # Arguments:
    #     X: data to be processed
    #     train: 'True' when processing training data, 'False' for testing data
    #     specific_column: indexes of the columns that will be normalized. If 'None', all columns
    #         will be normalized.
    #     X_mean: mean value of training data, used when train = 'False'
    #     X_std: standard deviation of training data, used when train = 'False'
    # Outputs:
    #     X: normalized data
    #     X_mean: computed mean value of training data
    #     X_std: computed standard deviation of training data

    #if specified_column == None:
    #    specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column] ,0).reshape(1, -1)
        X_std  = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:,specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
     
    return X, X_mean, X_std

def _train_dev_split(X, Y, dev_ratio = 0.25):
    # This function spilts data into training set and development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]

# Normalize every split with the statistics of the TRAINING data.
# _normalize works per column, so a single call over all columns is equivalent to
# the previous five calls over groups of five columns. The development set used to be
# standardized with its own mean/std, which put it on a different scale than the
# training and test sets; it now reuses the training statistics like the test set does.
all_columns = np.arange(X_train.shape[1])
X_train, X_mean, X_std = _normalize(X_train, train = True, specified_column = all_columns)
X_dev, _, _ = _normalize(X_dev, train = False, specified_column = all_columns, X_mean = X_mean, X_std = X_std)
X_test, _, _ = _normalize(X_test, train = False, specified_column = all_columns, X_mean = X_mean, X_std = X_std)

train_size = X_train.shape[0]
dev_size = X_dev.shape[0]
test_size = X_test.shape[0]
data_dim = X_train.shape[1]
print('Size of training set: {}'.format(train_size))
print('Size of development set: {}'.format(dev_size))
print('Size of testing set: {}'.format(test_size))
print('Dimension of data: {}'.format(data_dim))

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hyper-parameters and paths shared by the training cells below.
config = {
    'seed': 1314,      # Passed to same_seed() for reproducibility.
    'select_all': False,   # Only referenced by commented-out feature-selection code.
    'valid_ratio': 0.2,   # Only referenced by the commented-out train_valid_split call.
    'n_epochs': 2000,     # Maximum number of epochs.            
    'batch_size': 64, 
    'learning_rate': 5e-4,
    'weight_decay': 5e-5,              
    'early_stop': 200,    # Stop after this many consecutive epochs without a better validation loss.     
    'k_folds': 5,         # Number of splits for the K-fold run.
    # 'save_path': './models/model.ckpt'  # Earlier single-file variant, kept for reference.
    'save_path': './models/'   # Prefix for checkpoints; '<fold>_model.ckpt' is appended to it.
}

In [None]:
class StockDataset(Dataset):
    '''
    Tensor-backed dataset of feature rows with optional targets.

    x: Features, converted to a FloatTensor.
    y: Targets, converted to a FloatTensor; pass None for prediction-only data,
       in which case indexing yields the features alone.
    '''
    def __init__(self, x, y=None):
        self.x = torch.FloatTensor(x)
        self.y = None if y is None else torch.FloatTensor(y)

    def __getitem__(self, idx):
        features = self.x[idx]
        if self.y is None:
            return features
        return features, self.y[idx]

    def __len__(self):
        return len(self.x)

In [None]:
# Fix the RNG seeds before any shuffling loader is created.
same_seed(config['seed'])

# Report the split shapes and the feature count.
print(f"train_data size: {X_train.shape} \nvalid_data size: {X_dev.shape} \ntest_data size: {X_test.shape}")
print(f'number of features: {X_train.shape[1]}')

# Wrap the arrays as datasets; the test set has no targets.
train_dataset = StockDataset(X_train, Y_train)
valid_dataset = StockDataset(X_dev, Y_dev)
test_dataset = StockDataset(X_test)

# Train and validation data combined, used by the K-fold run.
dataset = ConcatDataset([train_dataset, valid_dataset])

# Batch loaders for the plain (non K-fold) run; only the test loader keeps its order.
loader_options = dict(batch_size=config['batch_size'], pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
valid_loader = DataLoader(valid_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [None]:
def trainer(fold, train_loader, valid_loader, model, config, device):
    '''Train `model` as a binary classifier and checkpoint the best validation loss.

    fold: label used in the checkpoint name, saved as
        config['save_path'] + str(fold) + '_model.ckpt'.
    train_loader / valid_loader: iterables yielding (features, 0/1 targets) batches.
    model: module returning one raw logit per sample (no sigmoid applied).
    config: dict providing 'learning_rate', 'weight_decay', 'n_epochs',
        'early_stop' and 'save_path'.
    device: device the batches are moved to; the model must already be on it.

    Training stops after config['n_epochs'] epochs or once the validation loss has
    not improved for config['early_stop'] consecutive epochs. Returns None.
    '''
    # BCEWithLogitsLoss applies the sigmoid internally, so the model outputs are logits.
    criterion = nn.BCEWithLogitsLoss(reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])

    writer = SummaryWriter() # Writer of tensorboard.

    # Create the directory the checkpoints are actually written to.
    os.makedirs(config['save_path'], exist_ok=True)
    checkpoint_path = config['save_path'] + str(fold) + '_model.ckpt'

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    try:
        for epoch in range(n_epochs):
            model.train() # Set your model to train mode.
            loss_record = []
            correct, seen = 0, 0
            # tqdm is a package to visualize your training progress.
            train_pbar = tqdm(train_loader, position=0, leave=True)

            for x, y in train_pbar:
                optimizer.zero_grad()               # Set gradient to zero.
                x, y = x.to(device), y.to(device)   # Move your data to device.
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()                     # Compute gradient(backpropagation).
                optimizer.step()                    # Update parameters.
                step += 1
                loss_record.append(loss.detach().item())

                # A logit > 0 is a probability > 0.5; thresholding the raw logit at 0.5
                # (as before) under-counted positive predictions.
                correct += ((pred.detach() > 0).float() == y).sum().item()
                seen += y.numel()

                # Display current epoch number and loss on tqdm progress bar.
                train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': loss.detach().item()})

            mean_train_loss = sum(loss_record)/len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)
            # Divide by the number of samples actually seen rather than a hard-coded
            # dataset size, so the figure is also right for K-fold subsets.
            accuracy = 100 * correct / max(seen, 1)

            model.eval() # Set your model to evaluation mode.
            loss_record = []
            val_correct, val_seen = 0, 0
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                with torch.no_grad():
                    pred = model(x)
                    loss = criterion(pred, y)
                    val_correct += ((pred > 0).float() == y).sum().item()
                    val_seen += y.numel()

                loss_record.append(loss.item())

            mean_valid_loss = sum(loss_record)/len(loss_record)
            val_accuracy = 100 * val_correct / max(val_seen, 1)

            print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            print(f'Epoch [{epoch+1}/{n_epochs}]: Train Acc: {accuracy:.4f}, Valid Acc: {val_accuracy:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), checkpoint_path) # Save your best model
                print('Saving model with loss {:.3f}...'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                print('\nBest Loss = ' + str(best_loss))
                return
    finally:
        # Flush and release the TensorBoard event file on every exit path.
        writer.close()

In [None]:
# Plain (non K-fold) run: train on the training loader, validate on the dev loader.
# The best checkpoint is written under config['save_path'] with the 'none' prefix.
model = My_Model(input_dim=X_train.shape[1]).to(device) # put your model and data on the same computation device.
trainer('none', train_loader, valid_loader, model, config, device)

In [None]:
def save_pred(preds, file, ids=None):
    ''' Save predictions to specified file

    preds: iterable of predictions, one per stock; one-element arrays are unwrapped so
        the CSV holds a plain value (e.g. True) instead of an array repr like "[ True]".
    file: path of the CSV to write; it gets the header row id,raise.
    ids: identifiers written in the first column, in the same order as preds.
        Defaults to the notebook-level `stock_num_mod` list.
    '''
    if ids is None:
        ids = stock_num_mod
    # newline='' is what the csv module requires to avoid blank lines on Windows.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'raise'])
        for i, p in enumerate(preds):
            value = np.asarray(p)
            writer.writerow([ids[i], value.item() if value.size == 1 else p])

# Reload the best checkpoint of the plain run and predict the test rows.
model = My_Model(input_dim=X_test.shape[1]).to(device)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime.
model.load_state_dict(torch.load(config['save_path'] + 'none_model.ckpt', map_location=device))
preds = predict(test_loader, model, device)
# The model emits logits (it was trained with BCEWithLogitsLoss), so probability > 0.5
# corresponds to logit > 0, not logit > 0.5.
preds = preds > 0
save_pred(preds, 'none_pred.csv')

In [None]:
# K-fold cross-validation over the combined train + validation data.
# The split is seeded so the folds are reproducible between runs.
kfold = KFold(n_splits=config['k_folds'], shuffle=True, random_state=config['seed'])

for fold, (train_ids, valid_ids) in enumerate(kfold.split(dataset)):
    print(f'FOLD {fold}')
    print('--------------------------------')
    train_subsampler = SubsetRandomSampler(train_ids)
    valid_subsampler = SubsetRandomSampler(valid_ids)
    train_loader = DataLoader(dataset, batch_size=config['batch_size'], sampler=train_subsampler,
                              num_workers=0, pin_memory=True)
    valid_loader = DataLoader(dataset, batch_size=config['batch_size'], sampler=valid_subsampler,
                              num_workers=0, pin_memory=True)

    # Start every fold from freshly initialised weights. Carrying the previous fold's
    # model over would let it train on samples that are validation data in this fold.
    model = My_Model(input_dim=X_train.shape[1]).to(device) # put your model and data on the same computation device.
    trainer('Fold_' + str(fold), train_loader, valid_loader, model, config, device)

    # Predict the test rows with this fold's best checkpoint.
    model = My_Model(input_dim=X_test.shape[1]).to(device)
    model.load_state_dict(torch.load(config['save_path'] + 'Fold_' + str(fold) + '_model.ckpt', map_location=device))
    preds = predict(test_loader, model, device)
    # Outputs are logits: probability > 0.5 corresponds to logit > 0.
    preds = preds > 0
    save_pred(preds, 'Fold_' + str(fold) + '_pred.csv')

In [None]:
def _shuffle(X, Y):
    # This function shuffles two equal-length list/array, X and Y, together.
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def _sigmoid(z):
    # Sigmoid function can be used to calculate probability.
    # To avoid overflow, minimum/maximum output value is set.
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-8))

def _f(X, w, b):
    # Logistic regression on the features and their squares, parameterized by w and b.
    #
    # Arguments:
    #     X: input data, shape = [batch_size, data_dimension]
    #     w: weights, shape = [data_dimension, 2]; column 0 multiplies the squared
    #        features, column 1 the features themselves
    #     b: bias
    # Output:
    #     predicted probability of each row of X being positively labeled, shape = [batch_size, ]
    squared_term = np.matmul(np.multiply(X, X), w[:, 0])
    linear_term = np.matmul(X, w[:, 1])
    return _sigmoid(squared_term + linear_term + b)

def _predict(X, w, b):
    # Return a 0/1 integer prediction for each row of X by rounding the
    # probability produced by the logistic regression function _f.
    # Uses the builtin int: the np.int alias no longer exists in current NumPy.
    return np.round(_f(X, w, b)).astype(int)
    
def _accuracy(Y_pred, Y_label):
    # This function calculates prediction accuracy    
    acc = 1 - np.mean(np.abs(Y_pred - Y_label.reshape(1, -1)))
    print(acc)
    return acc

In [None]:
def _cross_entropy_loss(y_pred, Y_label):
    # This function computes the cross entropy.
    #
    # Arguements:
    #     y_pred: probabilistic predictions, float vector
    #     Y_label: ground truth labels, bool vector
    # Output:
    #     cross entropy, scalar
    cross_entropy = -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
    return cross_entropy

def _gradient(X, Y_label, w, b):
    # Gradient of the summed cross entropy loss for the model in _f.
    # Returns (gradient for w[:, 0], gradient for w[:, 1], gradient for b).
    residual = Y_label - _f(X, w, b)
    squared_features_t = np.multiply(X, X).T
    grad_squared = -np.sum(residual * squared_features_t, 1)
    grad_linear = -np.sum(residual * X.T, 1)
    grad_bias = -np.sum(residual)
    return grad_squared, grad_linear, grad_bias

In [None]:
# Logistic regression baseline trained with mini-batch gradient descent.
# Column 0 of w weights the squared features, column 1 the raw features.
w = np.zeros((data_dim, 2))
b = np.zeros((1,))

# Training hyper-parameters
max_iter = 100
batch_size = 64
learning_rate = 0.1

# Per-epoch history of loss and accuracy, kept for plotting
train_loss, dev_loss = [], []
train_acc, dev_acc = [], []

# Number of parameter updates so far (1-based); drives the learning-rate decay
step = 1

n_batches = int(np.floor(train_size / batch_size))

for epoch in range(max_iter):
    # Reshuffle the training rows at the beginning of each epoch
    X_train, Y_train = _shuffle(X_train, Y_train)

    for batch_idx in range(n_batches):
        start = batch_idx * batch_size
        stop = (batch_idx + 1) * batch_size
        Xb = X_train[start:stop]
        Yb = Y_train[start:stop].reshape(1, -1)

        w_grad1, w_grad2, b_grad = _gradient(Xb, Yb, w, b)

        # Gradient descent update with a 1/sqrt(step) learning-rate decay
        step_size = learning_rate/np.sqrt(step)
        w[:, 0] = w[:, 0] - step_size * w_grad1
        w[:, 1] = w[:, 1] - step_size * w_grad2
        b = b - step_size * b_grad

        step = step + 1

    # Loss and accuracy on the training set after this epoch
    y_train_pred = _f(X_train, w, b)
    Y_train_pred = np.round(y_train_pred)
    train_acc.append(_accuracy(Y_train_pred, Y_train))
    train_loss.append(_cross_entropy_loss(y_train_pred, Y_train.reshape(1, -1)) / train_size)

    # Loss and accuracy on the development set after this epoch
    y_dev_pred = _f(X_dev, w, b)
    Y_dev_pred = np.round(y_dev_pred)
    dev_acc.append(_accuracy(Y_dev_pred, Y_dev))
    dev_loss.append(_cross_entropy_loss(y_dev_pred, Y_dev.reshape(1, -1)) / dev_size)


print('Training loss: {}'.format(train_loss[-1]))
print('Development loss: {}'.format(dev_loss[-1]))
print('Training accuracy: {}'.format(train_acc[-1]))
print('Development accuracy: {}'.format(dev_acc[-1]))

In [None]:
# Predict 0/1 labels for the test rows with the trained logistic regression and show them.
# (Uncomment the two prints below to inspect the learned parameters.)
#print(w)
#print(b)
ans_y = _predict(X_test, w, b)
print(ans_y)