In [1]:
import argparse
import copy
import csv
import datetime
import logging
import os
import re
import sys

import numpy as np
import pandas as pd
import pymysql
from matplotlib import pyplot as plt
from sklearn.metrics import matthews_corrcoef, confusion_matrix, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook, trange

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset, Subset)
from torchvision import datasets, models, transforms



# Configure the root logger at INFO level, then create the logger that the
# rest of this notebook uses for its messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(name=__name__)

# Model

In [2]:
class LSTM_Line_Regression(nn.Module):
    """Two-branch network: an MLP over static features plus an LSTM over a sequence.

    The ``basic`` vector is embedded by a small MLP, the ``dynamic`` sequence is
    encoded by a stacked LSTM (only the last time step is kept), and the two
    embeddings are concatenated and passed through a fully-connected head that
    emits ``output_size`` values per sample.

    Args:
        basic_input_size: Number of features in each ``basic`` vector.
        dynamic_input_size: Number of features per time step of ``dynamic``.
        output_size: Width of the final linear layer.
        hidden_size: Hidden width of the LSTM.
        num_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability used in the MLP branch and in the head.
        lstm_dropout: Dropout probability applied between stacked LSTM layers.
            Forced to 0 when ``num_layers`` is 1, because ``nn.LSTM`` only
            applies dropout between layers and warns otherwise.
    """

    def __init__(self, basic_input_size, dynamic_input_size, output_size=2, hidden_size=32,
                 num_layers=3, drop_prob=0.4, lstm_dropout=0.4):
        super(LSTM_Line_Regression, self).__init__()

        self.basic_input_size = basic_input_size
        self.basic_output_size = 32

        self.dynamic_input_size = dynamic_input_size
        self.dynamic_output_size = hidden_size
        self.drop_prob = drop_prob
        self.hidden_size = hidden_size

        # The head consumes the two branch embeddings side by side.
        self.concate_size = self.dynamic_output_size + self.basic_output_size
        self.final_hidden_size = 16

        # Static-feature branch.
        self.line_basic = nn.Sequential(
            nn.Linear(self.basic_input_size, self.basic_output_size),
            nn.ReLU(),
            nn.Dropout(p=self.drop_prob),
            nn.Linear(self.basic_output_size, self.basic_output_size))

        # Sequence branch. Inter-layer dropout is meaningless for a single layer
        # and makes nn.LSTM emit a UserWarning, so it is disabled in that case.
        effective_lstm_dropout = lstm_dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(dynamic_input_size, hidden_size, num_layers,
                            batch_first=True, dropout=effective_lstm_dropout)

        # Fully-connected head over the concatenated embeddings.
        self.line_1 = nn.Linear(self.concate_size, self.final_hidden_size)
        self.line_final = nn.Linear(self.final_hidden_size, output_size)
        self.dropout = nn.Dropout(self.drop_prob)

    def forward(self, basic, dynamic):
        """Run both branches and the head.

        Args:
            basic: Tensor of shape ``(batch, basic_input_size)``.
            dynamic: Tensor of shape ``(batch, seq_len, dynamic_input_size)``
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Sequence branch: keep only the top-layer output at the last time step.
        d_out, (_, _) = self.lstm(dynamic)
        d_out = d_out[:, -1, :]

        # Static branch.
        b_out = self.line_basic(basic)

        # Concatenate along the feature axis, then apply the head.
        cat = torch.cat((b_out, d_out), 1)
        out = F.relu(cat)

        out = F.relu(self.dropout(self.line_1(out)))
        out = self.line_final(out)
        return out

In [3]:
class LSTM_Line_Regression_2(nn.Module):
    """LSTM-only variant: the prediction is a linear read-out of the last LSTM state.

    ``forward`` accepts ``basic`` for signature parity with
    ``LSTM_Line_Regression`` but does not use it. The ``line_basic`` and
    ``line_1`` sub-modules are still registered (so parameter names are
    unchanged) but take no part in the forward pass.

    Args:
        basic_input_size: Number of features in each ``basic`` vector (only used
            to size the currently unused ``line_basic`` branch).
        dynamic_input_size: Number of features per time step of ``dynamic``.
        output_size: Width of the final linear layer.
        hidden_size: Hidden width of the LSTM.
        num_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability for ``dropout`` / ``line_basic``.
        lstm_dropout: Dropout probability applied between stacked LSTM layers.
            Forced to 0 when ``num_layers`` is 1, because ``nn.LSTM`` only
            applies dropout between layers and warns otherwise.
    """

    def __init__(self, basic_input_size, dynamic_input_size, output_size=1, hidden_size=32,
                 num_layers=3, drop_prob=0.4, lstm_dropout=0.2):
        super(LSTM_Line_Regression_2, self).__init__()
        self.basic_input_size = basic_input_size
        self.basic_output_size = 32

        self.dynamic_input_size = dynamic_input_size
        self.dynamic_output_size = hidden_size
        self.drop_prob = drop_prob
        self.hidden_size = hidden_size

        self.concate_size = self.dynamic_output_size + self.basic_output_size

        effective_lstm_dropout = lstm_dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(dynamic_input_size, hidden_size, num_layers,
                            batch_first=True, dropout=effective_lstm_dropout)

        # Unused by forward(); kept so existing parameter names stay stable.
        self.line_1 = nn.Linear(self.concate_size, 32)
        nn.init.kaiming_normal_(self.line_1.weight, mode='fan_out')

        # forward() feeds this layer the LSTM state only, so its input width must
        # be the LSTM hidden size. It was previously sized for the concatenated
        # embedding, which made every forward pass fail with a shape mismatch.
        self.line_final = nn.Linear(self.dynamic_output_size, output_size)
        self.dropout = nn.Dropout(self.drop_prob)

        # Unused by forward(); kept so existing parameter names stay stable.
        self.line_basic = nn.Sequential(
            nn.Linear(self.basic_input_size, self.basic_output_size),
            nn.ReLU(),
            nn.Dropout(p=self.drop_prob),
            nn.Linear(self.basic_output_size, self.basic_output_size))

    def forward(self, basic, dynamic):
        """Encode ``dynamic`` with the LSTM and project its last step.

        Args:
            basic: Ignored; accepted for signature parity with
                ``LSTM_Line_Regression``.
            dynamic: Tensor of shape ``(batch, seq_len, dynamic_input_size)``
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Keep only the top-layer output at the last time step.
        d_out, (_, _) = self.lstm(dynamic)
        d_out = d_out[:, -1, :]

        out = F.relu(d_out)
        out = self.line_final(out)
        return out

In [4]:
class TrainModel():
    """Training/evaluation harness around a two-input (``basic``, ``dynamic``) classifier.

    Owns the model, loss, optimizer and scheduler, runs epochs, keeps per-epoch
    metrics, and remembers the best models by dev accuracy and by dev loss.

    Args:
        model: ``nn.Module`` called as ``model(basic, dynamic)``. When ``None`` a
            fresh ``LSTM_Line_Regression(5, 5)`` is created per instance.
        train_batch_size: Mini-batch size used by ``train_func``.
        data_class: Object exposing ``t_all_output`` and ``d_all_output``;
            only needed by ``show_statistic``.
        device: Device the model, loss and input tensors are moved to.
        train_dataset: Dataset whose items are ``(basic, dynamic, label)``.
        dev_dataset: Tuple ``(basic, dynamic, labels)`` of whole-set tensors.
        num_classes: Number of classes iterated by ``show_statistic`` and
            ``show_precision``. It should match the number of per-class
            matrices produced by ``get_eval_report``.
    """

    def __init__(self,
                 model=None,
                 train_batch_size=128,
                 data_class=None,
                 device=torch.device("cpu"),
                 train_dataset=None,
                 dev_dataset=None,
                 num_classes=2
                 ):

        # Build the default model here rather than in the signature: a default
        # argument is evaluated once, so every instance would otherwise share
        # (and keep training) the very same module.
        if model is None:
            model = LSTM_Line_Regression(5, 5)

        self.model = model
        self.train_batch_size = train_batch_size
        self.data_class = data_class
        self.num_classes = num_classes
        self.device = device
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.optimizer = None
        self.scheduler = None
        self.default_setting()

    def default_setting(self):
        """(Re)create the loss, optimizer and scheduler and reset all recorded metrics."""
        self.loss_function = torch.nn.CrossEntropyLoss()

        LEARNING_RATE = 1e-3
        # NOTE: this method used to build a weight-decayed parameter grouping
        # first, but it was immediately overwritten by this plain Adam
        # optimizer. The effective configuration (Adam, no weight decay) is kept.
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        # gamma=1 leaves the learning rate unchanged at every step.
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=1)

        # Best-so-far trackers and the corresponding model snapshots.
        self.min_dev_loss = float('inf')
        self.best_acc = -float('inf')
        self.acc_model = None
        self.loss_model = None

        # Per-epoch history.
        self.train_results = []
        self.dev_results = []
        self.lr_list = []

        self.model.to(self.device)
        self.loss_function.to(self.device)

    def train_epoch(self, N_EPOCHS, fpath='./buffer.bin'):
        """Train for ``N_EPOCHS`` epochs, evaluating on the dev set after each one.

        Whenever dev accuracy or dev loss improves, the model is written to
        ``fpath`` and an independent in-memory copy is stored in
        ``self.acc_model`` / ``self.loss_model`` respectively.

        Returns:
            ``(train_results, dev_results, preds)`` where ``preds`` are the dev
            predictions of the last epoch (empty when ``N_EPOCHS`` is 0).
        """
        preds = []
        for epoch in trange(N_EPOCHS):
            res, _, _ = self.train_func(self.train_dataset, self.model)
            self.train_results.append(res)

            res, preds, _ = self.dev_func(self.dev_dataset, self.model)
            self.dev_results.append(res)

            self.lr_list.append(self.optimizer.param_groups[0]['lr'])

            dev_acc = res['acc']
            dev_loss = res['loss']
            improved_acc = dev_acc > self.best_acc
            improved_loss = dev_loss < self.min_dev_loss

            if improved_acc or improved_loss:
                torch.save(self.model, fpath)

            # Snapshot with deepcopy instead of reloading the file: torch.load
            # refuses fully pickled modules when weights_only defaults to True.
            if improved_acc:
                self.best_acc = dev_acc
                self.acc_model = copy.deepcopy(self.model)

            if improved_loss:
                self.min_dev_loss = dev_loss
                self.loss_model = copy.deepcopy(self.model)

        logger.info("_____ Train finish _____")

        return self.train_results, self.dev_results, preds

    def save_model(self, output_model_file=None):
        """Pickle the current model to ``output_model_file``.

        Raises:
            ValueError: If no output path is given.
        """
        if output_model_file is None:
            raise ValueError("output_model_file must be provided")
        logger.info("@@@@@ Save best model @@@@@")
        # '%S' is not a valid format character; use lazy '%s' formatting.
        logger.info('file_path = %s', output_model_file)
        torch.save(self.model, output_model_file)
        return True

    def show_lr(self):
        """Plot the learning rate recorded after each epoch."""
        plt.plot(range(len(self.lr_list)), self.lr_list, color='r')
        plt.show()

    def show_statistic(self):
        """Log class ratios and plot accuracy, loss and per-class metric curves.

        Raises:
            ValueError: If the instance was created without ``data_class``.
        """
        data_class = self.data_class
        if data_class is None:
            raise ValueError("show_statistic requires data_class to be set")

        train_results = self.train_results
        dev_results = self.dev_results
        t_all_output = data_class.t_all_output
        d_all_output = data_class.d_all_output

        logger.info(' TRAIN\n' + str(get_class_ratio(t_all_output)))
        logger.info(' DEV\n' + str(get_class_ratio(d_all_output)))

        show_map(train_results, dev_results, 'acc', t_all_output, d_all_output)
        show_map(train_results, dev_results, 'loss', t_all_output, d_all_output)

        for i in range(self.num_classes):
            plt_class(train_results, dev_results, t_all_output, d_all_output, i)

    def show_precision(self, results):
        """Log the accuracy and per-class precision of the last entry in ``results``."""
        logger.info('dev')
        logger.info('acc = %.3f' % results[-1]['acc'])
        for i in range(self.num_classes):
            logger.info('class %s Precision = %.3f' % (i, get_precision(results[-1]['mcm'][i].ravel())))

    def train_func(self, sub_dataset, model):
        """Run one training epoch over ``sub_dataset`` and step the scheduler once.

        ``self.optimizer`` is built from ``self.model``, so ``model`` is expected
        to be that same module.

        Returns:
            ``(result, preds, model_output)``: the metric dict (``task``, ``mcm``,
            ``loss`` averaged over batches, ``acc`` over samples), the predicted
            labels for every sample seen, and the raw outputs of the last batch
            as a NumPy array.
        """
        model.train()

        train_loss = 0
        train_acc = 0
        labels = []
        preds = []

        data_loader = DataLoader(sub_dataset, sampler=RandomSampler(sub_dataset),
                                 batch_size=self.train_batch_size)

        for batch in data_loader:
            # The model lives on self.device, so the inputs must as well.
            basic, dynamic, _y = (t.to(self.device) for t in batch)

            self.optimizer.zero_grad()
            out = model(basic, dynamic)
            loss = self.loss_function(out, _y)
            loss.backward()
            self.optimizer.step()

            batch_preds = out.argmax(1)
            train_loss += loss.item()
            train_acc += (batch_preds == _y).sum().item()
            labels += _y.cpu().numpy().tolist()
            preds += batch_preds.cpu().numpy().tolist()

        self.scheduler.step()

        result = get_eval_report('train', labels, preds)
        result['loss'] = train_loss / len(data_loader)
        result['acc'] = train_acc / len(sub_dataset)

        model_output = out.detach().cpu().numpy()
        return result, preds, model_output

    def dev_func(self, sub_dataset, model):
        """Evaluate ``model`` on the whole dev set in a single forward pass.

        Args:
            sub_dataset: Tuple ``(basic, dynamic, labels)`` of tensors.
            model: Module to evaluate.

        Returns:
            ``(result, preds, model_output)`` with the same layout as ``train_func``.
        """
        model.eval()

        basic, dynamic, _y = (t.to(self.device) for t in sub_dataset)

        # No gradients are needed for evaluation; skipping them saves memory.
        with torch.no_grad():
            out = model(basic, dynamic)
            loss = self.loss_function(out, _y)

        batch_preds = out.argmax(1)
        labels = _y.cpu().numpy().tolist()
        preds = batch_preds.cpu().numpy().tolist()

        result = get_eval_report('dev', labels, preds)
        result['loss'] = loss.item()
        result['acc'] = (batch_preds == _y).sum().item() / len(_y)

        model_output = out.detach().cpu().numpy()
        return result, preds, model_output






In [None]:

def train_func(sub_dataset, model):
    """Run one training epoch over ``sub_dataset`` (module-level twin of ``TrainModel.train_func``).

    Relies on the module-level names ``train_batch_size``, ``optimizer``,
    ``loss_function`` and ``scheduler``, which must exist before this is called.

    Returns:
        ``(result, preds, model_output)``: the metric dict from
        ``get_eval_report`` extended with ``loss`` (mean over batches) and
        ``acc`` (over samples), the predicted labels of every sample seen, and
        the last batch's raw outputs as a NumPy array.
    """
    model.train()

    loss_sum = 0
    n_correct = 0
    seen_labels = []
    seen_preds = []

    loader = DataLoader(
        sub_dataset,
        sampler=RandomSampler(sub_dataset),
        batch_size=train_batch_size,
    )

    for batch in loader:
        optimizer.zero_grad()
        basic, dynamic, _y = batch
        out = model(basic, dynamic)
        loss = loss_function(out, _y)

        loss_sum += loss.item()
        batch_preds = out.argmax(1)
        n_correct += (batch_preds == _y).sum().item()

        loss.backward()
        optimizer.step()

        seen_labels.extend(_y.cpu().numpy().tolist())
        seen_preds.extend(batch_preds.cpu().numpy().tolist())

    # The scheduler advances once per epoch, not once per batch.
    scheduler.step()

    result = get_eval_report('train', seen_labels, seen_preds)
    result['loss'] = loss_sum / len(loader)
    result['acc'] = n_correct / len(sub_dataset)

    model_output = out.data.cpu().numpy()
    return result, seen_preds, model_output

def get_eval_report(task_name, labels, preds, num_classes=2):
    """Build the evaluation report for one pass over a dataset.

    Args:
        task_name: Label stored under ``"task"`` (e.g. ``'train'`` or ``'dev'``).
        labels: Ground-truth class ids.
        preds: Predicted class ids; must be the same length as ``labels``.
        num_classes: Number of classes; confusion matrices are computed for the
            class ids ``0 .. num_classes - 1``. Defaults to 2 (binary).

    Returns:
        Dict with ``"task"`` and ``"mcm"``, the result of
        ``multilabel_confusion_matrix`` (one 2x2 matrix per class id).
    """
    assert len(preds) == len(labels)
    mcm = multilabel_confusion_matrix(labels, preds, labels=list(range(num_classes)))
    return {
        "task": task_name,
        "mcm": mcm
    }

def show_eval_report(result, name=''):
    """Log a banner followed by every ``key = value`` pair of ``result``."""
    banner = "***** Eval %s results *****" % name
    logger.info(banner)
    for key, value in result.items():
        logger.info("  %s = %s", key, str(value))
        

def dev_func(sub_dataset, model):
    """Evaluate ``model`` on a whole dev set in one forward pass.

    Module-level twin of ``TrainModel.dev_func``; relies on the module-level
    name ``loss_function``, which must exist before this is called.

    Args:
        sub_dataset: Tuple ``(basic, dynamic, labels)`` of tensors.
        model: Module called as ``model(basic, dynamic)``.

    Returns:
        ``(result, preds, model_output)``: the metric dict from
        ``get_eval_report`` extended with ``loss`` and ``acc``, the predicted
        labels, and the raw model outputs as a NumPy array.
    """
    model.eval()

    basic, dynamic, _y = sub_dataset

    # Evaluation needs no gradients; disabling autograd avoids building (and
    # holding on to) a graph for the whole dev set.
    with torch.no_grad():
        out = model(basic, dynamic)
        loss = loss_function(out, _y)

    batch_preds = out.argmax(1)
    labels = _y.cpu().numpy().tolist()
    preds = batch_preds.cpu().numpy().tolist()

    result = get_eval_report('dev', labels, preds)
    result['loss'] = loss.item()
    result['acc'] = (batch_preds == _y).sum().item() / len(_y)

    model_output = out.detach().cpu().numpy()
    return result, preds, model_output

   
    

In [None]:
def show_map(train_results, dev_results, indicator, t_all_output=None, d_all_output=None):
    """Plot one per-epoch metric for the train and dev sets.

    Args:
        train_results: Per-epoch result dicts for the training set.
        dev_results: Per-epoch result dicts for the dev set.
        indicator: Key of the metric to plot (e.g. ``'acc'`` or ``'loss'``).
        t_all_output: Training labels. Only used when ``indicator == 'acc'``,
            to draw the majority-class ratio as a horizontal baseline; the
            baseline is skipped when omitted.
        d_all_output: Dev labels, used like ``t_all_output``.
    """
    x = np.arange(0, len(train_results))
    y_train = [res[indicator] for res in train_results]
    y_dev = [res[indicator] for res in dev_results]

    plt.title("%s trend" % indicator)
    plt.xlabel("Epoch")
    plt.ylabel(indicator)

    if indicator == 'acc':
        plt.ylim((0, 1.0))
        # Baselines: the accuracy of always predicting the most frequent class.
        if t_all_output is not None:
            plt.axhline(y=get_max_class_ratio(t_all_output), color='r', linewidth=0.5)
        if d_all_output is not None:
            plt.axhline(y=get_max_class_ratio(d_all_output), color='b', linewidth=0.5)

    plt.plot(x, y_train, color='red', label='train')
    plt.plot(x, y_dev, color='blue', label='dev')
    plt.legend()
    plt.show()


def get_precision(np):
    """Return precision ``tp / (tp + fp)`` for a flattened 2x2 confusion matrix.

    Args:
        np: Sequence ``(tn, fp, fn, tp)``. The parameter name is kept for
            compatibility; inside this function it shadows the ``numpy`` alias.

    Returns:
        The precision, or NaN when nothing was predicted positive
        (``tp + fp == 0``) and precision is therefore undefined.
    """
    tn, fp, fn, tp = np
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # Explicit NaN instead of ZeroDivisionError (plain ints) or a
        # RuntimeWarning (NumPy ints).
        return float('nan')
    return tp / predicted_positive


def get_recall(np):
    """Return recall ``tp / (tp + fn)`` for a flattened 2x2 confusion matrix.

    Args:
        np: Sequence ``(tn, fp, fn, tp)``. The parameter name is kept for
            compatibility; inside this function it shadows the ``numpy`` alias.

    Returns:
        The recall, or NaN when there are no actual positives
        (``tp + fn == 0``) and recall is therefore undefined.
    """
    tn, fp, fn, tp = np
    actual_positive = tp + fn
    if actual_positive == 0:
        # Explicit NaN instead of ZeroDivisionError (plain ints) or a
        # RuntimeWarning (NumPy ints).
        return float('nan')
    return tp / actual_positive


def get_F1_score(np):
    """Return the F1 score (harmonic mean of precision and recall).

    Args:
        np: Sequence ``(tn, fp, fn, tp)``. The parameter name is kept for
            compatibility; inside this function it shadows the ``numpy`` alias.

    Returns:
        ``2 * P * R / (P + R)``, or NaN whenever precision, recall or their sum
        would require a division by zero.
    """
    tn, fp, fn, tp = np
    predicted_positive = tp + fp
    actual_positive = tp + fn
    # Return NaN explicitly rather than raising ZeroDivisionError (plain ints)
    # or emitting a RuntimeWarning (NumPy ints).
    if predicted_positive == 0 or actual_positive == 0:
        return float('nan')

    P_precision = tp / predicted_positive
    P_recall = tp / actual_positive
    if P_precision + P_recall == 0:
        return float('nan')
    return 2 * (P_precision * P_recall) / (P_precision + P_recall)


# Metric name -> function taking a flattened (tn, fp, fn, tp) confusion matrix.
# Iteration order (precision, recall, F1_score) is the order plots are drawn in.
eval_functions = dict(
    precision=get_precision,
    recall=get_recall,
    F1_score=get_F1_score,
)


def plt_class(train_results, dev_results, t_all_output, d_all_output, k):
    """Plot every metric in ``eval_functions`` for class ``k``, one figure per metric.

    Each figure shows the train (red) and dev (blue) curve over epochs. The
    precision figure additionally shows the share of class ``k`` in the train
    and dev labels as horizontal lines.
    """
    epochs = np.arange(0, len(train_results))

    for metric_name, metric_fn in eval_functions.items():
        def curve(results):
            # Metric value of class k for each epoch's confusion matrix.
            return [metric_fn(res['mcm'][k].ravel()) for res in results]

        train_curve = curve(train_results)
        dev_curve = curve(dev_results)

        plt.title(f"CLASS {k} {metric_name}")
        plt.xlabel("Epoch")
        plt.ylabel(f'{metric_name}')
        plt.ylim((0, 1.0))
        plt.plot(epochs, train_curve, color='red', label='train')
        plt.plot(epochs, dev_curve, color='blue', label='dev')

        if metric_name == 'precision':
            train_class_ratio = get_class_ratio(t_all_output)[k]
            dev_class_ratio = get_class_ratio(d_all_output)[k]
            plt.axhline(y=train_class_ratio, color='r', linewidth=1)
            plt.axhline(y=dev_class_ratio, color='b', linewidth=1)

        plt.show()


def get_class_ratio(all_output):
    """Return the relative frequency of each distinct value in ``all_output``.

    ``all_output`` is loaded into a DataFrame and only column 0 is counted.
    The result is a Series indexed by value, sorted by that index.
    """
    first_column = pd.DataFrame(all_output)[0]
    return first_column.value_counts(sort=False, normalize=True).sort_index()


def get_max_class_ratio(all_output):
    """Return the share of the most frequent value in column 0 of ``all_output``."""
    return get_class_ratio(all_output).max()






In [6]:
if __name__ == '__main__':
    # Smoke check: both architectures can be constructed with 5 static and
    # 5 per-step features.
    model = LSTM_Line_Regression(5, 5)
    LSTM_Line_Regression_2(5, 5)

    # TrainModel expects a model *instance* and has no `days_for_train`
    # parameter; passing the class and that keyword raised a TypeError.
    model_class = TrainModel(model=model,
                             train_batch_size=128)