In [17]:
import torch 
import time 
from sklearn.model_selection import KFold,StratifiedKFold

import torch.nn as nn
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter
from gensim.models.phrases import Phrases
from gensim.models import Word2Vec

from src.train_utils import set_seed, ModelSave, get_torch_device, EarlyStop, TrainParams
from src.metric import  multi_cls_metrics,multi_cls_log
from src.dataset.tokenizer import GensimTokenizer
from src.enhancement.consistency import  create_ema_model, MeanTeacher

from iflytek_app.dataset import MixDataset
from iflytek_app.process import train_process, test_process, result_process,kfold_inference
from iflytek_app.models import Textcnn, TextcnnAugment

device = get_torch_device()
set_seed()

No GPU available, using the CPU instead.


In [18]:
phraser = Phrases.load('./checkpoint/phrase_tokenizer')
c2v = GensimTokenizer( Word2Vec.load('./checkpoint/char_min1_win5_sg_d100'))
w2v = GensimTokenizer(Word2Vec.load('./checkpoint/phrase_min1_win5_sg_d100'), phraser)
w2v.init_vocab()
c2v.init_vocab()

df, label2idx = train_process()
test = test_process()
label2idx.update({'unlabel':-1})

                id           l1           l2          len
count  4199.000000  4199.000000  4199.000000  4199.000000
mean   2099.000000     8.969278    37.087878    46.057156
std    1212.291219     4.576621    79.204914    79.332999
min       0.000000     2.000000     1.000000     4.000000
25%    1049.500000     5.000000     6.000000    15.000000
50%    2099.000000     8.000000    12.000000    22.000000
75%    3148.500000    12.000000    26.000000    36.000000
max    4198.000000    32.000000   946.000000   961.000000
{'14784131 14858934 14784131 14845064': 0, '14852788 14717848 15639958 15632020': 1, '14844856 14724258 14925237 14854807': 2, '14925756 15639967 14853254 14728639': 3, '14844593 14924945': 4, '15709098 14716590 14924703 14779559': 5, '14726332 14728344 14854542 14844591': 6, '14858934 15636660 15704193 14849963': 7, '15710359 14847407 14845602 14859696': 8, '14794687 14782344': 9, '15630486 15702410 14718849 15709093': 10, '15632285 15706536 14721977 14925219': 11, '147829

In [41]:
# -*-coding:utf-8 -*-
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F


def ramp_up(cur_epoch, max_epoch, method):
    """
    根据训练epoch来调整无标注loss部分的权重，初始epoch无标注loss权重为0
    """

    def linear(cur_epoch, max_epoch):
        return cur_epoch / max_epoch

    def sigmoid(cur_epoch, max_epoch):
        p = 1.0 - cur_epoch / max_epoch
        return np.exp(-5.0 * p ** 2)

    def cosine(cur_epoch, max_epoch):
        p = cur_epoch / max_epoch
        return 0.5 * (np.cos(np.pi * p) + 1)

    if cur_epoch == 0:
        weight = 0.0
    else:
        if method == 'linear':
            weight = linear(cur_epoch, max_epoch)
        elif method == 'sigmoid':
            weight = sigmoid(cur_epoch, max_epoch)
        elif method == 'cosine':
            weight = cosine(cur_epoch, max_epoch)
        else:
            raise ValueError('Only linear, sigmoid, cosine method are supported')
    return weight

def create_ema_model(model):
    for param in model.parameters():
        param.detach_()
    return model


class MeanTeacher(object):
    def __init__(self, model, ema_model, tp, tb):
        self.model = model
        self.ema_model = ema_model
        self.tp = tp
        self.tb = tb
        self.global_step = 0
        self.epoch = 0
        self.log_step = self.tp.log_steps
        self.num_train_steps = tp.num_train_steps
        self.loss_fn = tp.loss_fn
        self.epoch_size = tp.epoch_size
        self.ramp_up_method = tp.ramp_up_method
        self.wmax = tp.max_unsupervised * tp.labeled_size / tp.num_train_steps
        self.alpha = tp.alpha
        self.decay = 0.02 * tp.lr

        # use state dict instead of named_parameters when batch norm exits
        for param, ema_param in zip(self.model.state_dict().values(), self.ema_model.state_dict().values()):
            param.data.copy_(ema_param.data)

    def step(self):
        self.global_step +=1
        self.epoch = int(self.global_step//self.num_train_steps)
        # alpha = min(1 - 1 /(self.step+1), self.alpha)
        for name in self.model.state_dict():
            param = self.model.state_dict()[name]
            ema_param = self.ema_model.state_dict()[name]
            if ema_param.dtype==torch.float32:
                ema_param.mul_(self.alpha)
                ema_param.add_(param * (1-self.alpha))
                param.mul_(1 - self.decay)
                if self.global_step % self.log_step==0:
                    self.tb.add_histogram(name, param, global_step=self.global_step)
                    self.tb.add_histogram(name+'_ema', ema_param, global_step=self.global_step)

    def compute_loss(self, features, logits):
        weight = ramp_up(self.epoch, self.epoch_size, self.ramp_up_method) * self.wmax

        labels = features['label']
        cond = labels >= 0
        # supervised_loss
        self.supervised_loss = self.loss_fn(logits[cond], labels[cond])

        # consistency loss
        with torch.no_grad():
            teacher_logits = self.ema_model(features)
            self.teacher_loss = self.loss_fn(teacher_logits[cond], labels[cond])

        self.consistency_loss = torch.mean((F.softmax(logits, dim=1) - F.softmax(teacher_logits, dim=1)) ** 2)

        self.tb.add_scalars('loss/sup_loss', {
            'student': self.supervised_loss,
            'teacher': self.teacher_loss
        }, global_step=self.global_step)

        self.tb.add_scalar('loss/consistency', self.consistency_loss, global_step=self.global_step)
        self.tb.add_scalar('loss/weight', weight, global_step=self.global_step)
        loss = self.supervised_loss + weight * self.consistency_loss
        return loss

In [43]:
log_steps = 10
save_steps = 20
kfold=5
tp = TrainParams(
    log_steps = log_steps,
    save_steps = save_steps,
    epoch_size=30,
    lr=1e-3,
    loss_fn=nn.CrossEntropyLoss(),
    max_seq_len=1000,
    batch_size=64,
    dropout_rate=0.5,
    label_size = len(label2idx),
    vocab_size = w2v.vocab_size,
    embedding_dim = w2v.embedding_size + c2v.embedding_size,
    embedding1=c2v.embedding, 
    embedding2 =w2v.embedding,
    filter_size=70,
    kernel_size_list = [2,3,4,5],
    hidden_size = 100,
    early_stop_params = {
        'monitor':'f1_micro',
        'mode':'max',
        'min_delta': 0,
        'patience':5,
        'verbose':False
    },
    scheduler_params={'mode': 'max',
                     'factor': 0.3,
                     'patience': 1,
                     'verbose': True,
                     'threshold':0.0001,
                     'threshold_mode':'rel',
                     'cooldown':0,
                     'min_lr':1e-6},
    alpha=0.99,
    max_unsupervised=50,
    labeled_size = int(df.shape[0]/kfold *(kfold-1)),
    ramp_up_method='sigmoid',
    spatial_dropout_rate=0.5
)

In [9]:
minor_id = df.loc[df['label'].isin(['14784131 14858934 14784131 14845064',
                                    '14852788 14717848 15639958 15632020']),'id'].values.tolist()

In [49]:
kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=24)
for fold,(train_index, valid_index) in enumerate(kf.split(df.values, df[['label']].values)):
    train, valid = df.iloc[train_index], df.iloc[valid_index]

    # combine label and unlabel data
    train_dataset = MixDataset(tp.max_seq_len, w2v, c2v, phraser, label2idx, 
                               train['name'].values.tolist(),train['description'].values.tolist() ,
                               train['label'].values.tolist())

    valid_dataset = MixDataset(tp.max_seq_len, w2v, c2v, phraser, label2idx, 
                               valid['name'].values, valid['description'].values, valid['label'].values)
    train_sampler = RandomSampler(train_dataset)

    valid_sampler = SequentialSampler(valid_dataset)
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=tp.batch_size)
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=int(tp.batch_size*2))
    unlabel_sampler = RandomSampler(unlabel_dataset)
    unlabel_dataset = MixDataset(tp.max_seq_len, w2v, c2v, phraser, label2idx,
                             test['name'].values, test['description'].values, ['unlabel'] *test.shape[0])
    unlabel_loader = DataLoader(unlabel_dataset, sampler=unlabel_sampler, batch_size=tp.batch_size)
    
    tp.update({'num_train_steps': len(train_loader)})
    CKPT = './checkpoint/textcnn_mean_teacher/k{}'.format(fold)
    saver = ModelSave(CKPT, continue_train=False)
    saver.init()
    
    tb = SummaryWriter(CKPT)
    es = EarlyStop(**tp.early_stop_params)

    global_step = 0
    model = TextcnnAugment(tp)
    model_ema = create_ema_model( TextcnnAugment(tp))
    mean_teacher = MeanTeacher(model, model_ema, tp, tb)
    optimizer, scheduler = model.get_optimizer()
    
    for epoch_i in range(tp['epoch_size']):
        if global_step==1:
            print(model)
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10}  | {'Elapsed':^9}")
        print("-"*60)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()
        unlabel_iter = iter(unlabel_loader)
        for step, batch in enumerate(train_loader):
            try:
                unlabel_batch = next(unlabel_iter)
            except:
                unlabel_iter = iter(unlabel_loader)
                unlabel_batch = next(unlabel_iter)
            global_step +=1
            batch_counts +=1

            #Forward propogate
            model.zero_grad()
            feature = {k:v.to(device) for k,v in batch.items()}
            #eature = {k:torch.cat([v,unlabel_batch[k]],dim=0).to(device) for k,v in batch.items()}
            logits = model(feature)
            loss = mean_teacher.compute_loss(feature, logits)
            tb.add_scalar('loss/avg_loss', loss, global_step=global_step)
            
            batch_loss += loss.item()
            total_loss += loss.item()
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            mean_teacher.step()
            
            # Log steps for train loss logging
            if (step % tp.log_steps == 0 and step != 0) or (step == len(train_loader) - 1):
                time_elapsed = time.time() - t0_batch
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^9} | {time_elapsed:^9.2f}")
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

            # Save steps for ckpt saving and dev evaluation
            if (step % tp.save_steps == 0 and step != 0) or (step == len(train_loader) - 1):
                val_metrics = multi_cls_metrics(model_ema, valid_loader, device)

                for key, val in val_metrics.items():
                    tb.add_scalar(f'metric/{key}', val, global_step=global_step)
                avg_train_loss = total_loss / step
                tb.add_scalars('loss/train_valid',{'train': avg_train_loss,
                                                    'valid': val_metrics['val_loss']}, global_step=global_step)
                
                saver(total_loss / step, val_metrics['val_loss'], epoch_i, global_step, model_ema, optimizer, scheduler)
        # On Epoch End: calcualte train & valid loss and log overall metrics
        time_elapsed = time.time() - t0_epoch
        val_metrics = multi_cls_metrics(model_ema, valid_loader, device)
        avg_train_loss = total_loss / step
        scheduler.step(val_metrics['f1_micro'])
        print("-"*70)
        print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_metrics['val_loss']:^10.6f} | {time_elapsed:^9.2f}")
        multi_cls_log(epoch_i, val_metrics)
        print("\n")
        if es.check(val_metrics):
            break



./checkpoint/textcnn_mean_teacher/k0 model cleaned
 Epoch  |  Batch  |  Train Loss  |  Val Loss   |  Elapsed 
------------------------------------------------------------
   1    |   10    |   5.151386   |     -     |   12.41  
   1    |   20    |   2.990473   |     -     |   10.66  




   1    |   30    |   2.663738   |     -     |   14.00  
   1    |   40    |   2.516852   |     -     |   10.51  
   1    |   50    |   2.356057   |     -     |   13.87  
   1    |   52    |   2.164567   |     -     |   1.56   
----------------------------------------------------------------------
   1    |    -    |   3.197415   |  2.780704  |   66.36  


 Epoch  | Macro Acc | Macro AUC | Macro AP  | Macro Precision | Macro Recall | Macro F1 
------------------------------------------------------------------------------------------
   1    |  12.311%  |  58.950%  |  18.401%  |     20.033%     |   12.311%    |  30.833%  
 Epoch  | Micro Acc | Micro AUC | Micro AP  | Micro Precision | Micro Recall | Micro F1 
------------------------------------------------------------------------------------------
   1    |  30.833%  |     -     |     -     |     30.833%     |   30.833%    |  30.833%  




 Epoch  |  Batch  |  Train Loss  |  Val Loss   |  Elapsed 
--------------------------------------

   9    |   10    |   2.189242   |     -     |   11.38  
   9    |   20    |   2.148754   |     -     |   10.41  
   9    |   30    |   2.191928   |     -     |   13.65  
   9    |   40    |   2.196785   |     -     |   10.30  
   9    |   50    |   2.161627   |     -     |   13.69  
   9    |   52    |   2.235143   |     -     |   1.54   
----------------------------------------------------------------------
   9    |    -    |   2.221979   |  2.182949  |   64.27  


 Epoch  | Macro Acc | Macro AUC | Macro AP  | Macro Precision | Macro Recall | Macro F1 
------------------------------------------------------------------------------------------
   9    |  39.910%  |  75.018%  |  37.795%  |     51.286%     |   39.910%    |  63.452%  
 Epoch  | Micro Acc | Micro AUC | Micro AP  | Micro Precision | Micro Recall | Micro F1 
------------------------------------------------------------------------------------------
   9    |  63.452%  |     -     |     -     |     63.452%     |   63.452%    

   3    |   30    |   2.213475   |     -     |   14.48  
   3    |   40    |   2.186375   |     -     |   12.45  
   3    |   50    |   2.150411   |     -     |   17.99  
   3    |   52    |   2.234103   |     -     |   2.08   
----------------------------------------------------------------------
   3    |    -    |   2.260416   |  2.253907  |   73.43  


 Epoch  | Macro Acc | Macro AUC | Macro AP  | Macro Precision | Macro Recall | Macro F1 
------------------------------------------------------------------------------------------
   3    |  31.791%  |  71.183%  |  32.356%  |     39.203%     |   31.791%    |  55.833%  
 Epoch  | Micro Acc | Micro AUC | Micro AP  | Micro Precision | Micro Recall | Micro F1 
------------------------------------------------------------------------------------------
   3    |  55.833%  |     -     |     -     |     55.833%     |   55.833%    |  55.833%  




 Epoch  |  Batch  |  Train Loss  |  Val Loss   |  Elapsed 
--------------------------------------

KeyboardInterrupt: 

In [51]:
for i in model.named_parameters():
    print(i)

('embedding1.weight', Parameter containing:
tensor([[-0.3901,  0.2356,  0.2300,  ..., -0.1065,  0.2393, -0.0179],
        [-0.3489,  0.1862,  0.0886,  ..., -0.0276, -0.0638,  0.1627],
        [-0.5828,  0.0614,  0.0103,  ..., -0.2199,  0.5395, -0.1665],
        ...,
        [ 0.7524, -0.9165,  0.8643,  ..., -1.2435,  0.9184, -0.1838],
        [ 0.0174, -0.0267,  0.0235,  ..., -0.0126,  0.0152, -0.0129],
        [-0.5238,  1.0917, -0.7439,  ...,  0.6906,  0.0059,  0.0552]],
       requires_grad=True))
('embedding2.weight', Parameter containing:
tensor([[-0.1464,  0.1063,  0.1207,  ...,  0.0991,  0.3511,  0.1225],
        [-0.1102,  0.1911,  0.1718,  ...,  0.1894, -0.2257,  0.1983],
        [-0.1472,  0.4720, -0.2492,  ...,  0.2005,  0.2968, -0.0251],
        ...,
        [-1.5587, -0.5849, -0.0201,  ..., -0.2185, -0.8727, -1.3916],
        [-0.0181, -0.0282,  0.0131,  ...,  0.0384,  0.0255,  0.0107],
        [ 0.9340,  1.8907, -1.3842,  ..., -1.0192, -0.4836,  0.3365]],
       requires_

## K-Fold Evalutaion

In [11]:
test = test_process()
test_dataset = MixDataset(tp.max_seq_len, w2v, c2v, phraser, label2idx,  test['name'].values, test['description'].values)
test_sampler = SequentialSampler(test_dataset)
test_loader = DataLoader(test_dataset, sampler=test_sampler, batch_size=int(tp.batch_size*2))

In [12]:
result = kfold_inference(test_loader, tp, Textcnn, './checkpoint/textcnn_mean_teacher', 5, device)
result['pred'] = result['pred_avg']
result_process(result, label2idx, './submit/textcnn_mean_teacher_5fold_avg.csv')