In [41]:
import sys
import re
import pymysql
import pandas as pd
import numpy as np
import datetime
import argparse
import csv
import logging
import os
from tqdm import tqdm,tqdm_notebook, trange
from sklearn.model_selection import train_test_split
from pyhanlp import *

from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig 
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

import torch
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,TensorDataset)
from torchvision import datasets, models, transforms

import sklearn
print(sklearn.__version__)

import platform 
print(platform.python_version())

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)


In [44]:
def setup_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed) 
    torch.cuda.manual_seed_all(seed)
setup_seed(5)

In [45]:
# The name of the task to train.
TASK_NAME = '2c_wash'
fname = '2c_dataset.tsv'
fromtime = '2016-01-01'
endtime = '2019-01-10'

# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
MAX_SEQ_LENGTH = 128
# The input data dir. Should contain the .tsv files (or other data files) for the task.
#datapath = f'/home/wy506wd/data/{TASK_NAME}/'
datapath = os.path.join('/home/wy506wd/data/',TASK_NAME)
data_dir = datapath

# Bert pre-trained model selected in the list: bert-base-uncased, 
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
BERT_MODEL = 'chinese_wwm_ext_pytorch'
#model_dir = f'/home/wy506wd/download/{BERT_MODEL}/'
model_dir = os.path.join('/home/wy506wd/download/',BERT_MODEL)


WEIGHTS_NAME = "pytorch_model.bin"
output_model_file = os.path.join(data_dir, WEIGHTS_NAME)
# if not os.path.exists(output_model_file):
#     os.mkdir(output_model_file)

# construct dataset

In [6]:

sys.path.append('./dataquery')
import DataQuery.MysqlAPI as MysqlAPI
import DataQuery.DataToolkit as DataToolkit 

#load database data
m = MysqlAPI.MysqlAPI()
table_column_dict = {'lc_news': 
                     ['InfoPublDate',
                      'InfoTitle', 
                      'Content',
                      'RecordDate',
                      'XGRQ']}

raw = m.query_jy_news_data(date_list=[[fromtime,endtime]],
                            table_column_dict=table_column_dict)
raw

KeyboardInterrupt: 

In [None]:
# extract InvolvedStock (code & name)
print(raw.shape)

def get_stockcode_from_text(text):
    i = re.search('\(\d+',text)
    if not i: return None
    num = i.group(0)[1:]
    return num

def get_InvolvedStock(df):
    df['code_num'] = df['Content'].map(get_stockcode_from_text)
    # drop None data
    df = DataToolkit.fillna_data(df,fillna_drop_narows=['code_num'])
    info_df = m.query_one_table(database='news',
                  table_name='stock_info', 
                  columns=['stock_code','display_name'])
    info_df['code_num'] = info_df['stock_code'].map(lambda x: x[:-3])
    df = pd.merge(df,info_df,how='inner')
    return df

data = get_InvolvedStock(raw)
# print(get_stockcode_from_text(text))
data

# label data

In [None]:
d = DataToolkit.fillna_data(data,fillna_drop_narows=['stock_code'])

# align_date    
def align_date(mydatetime):
    if(mydatetime.time() <= datetime.time(9,30)):
        return mydatetime.date()
    if(mydatetime.time() >= datetime.time(15,0)):
        return mydatetime.date() + datetime.timedelta(days=1)
    return None

def test_date_distribution(mydatetime):
    if(mydatetime.time() <= datetime.time(9,30)):
        return 1
    if(mydatetime.time() >= datetime.time(15,0)):
        return 2
    return 0

_ = d.shape[0]
print(d.shape)
d['RecordDate'] = d['RecordDate'].map(align_date)
d = DataToolkit.fillna_data(d,fillna_drop_narows=['RecordDate'])
print('num_news_during_marketsOpen = ',_ - d.shape[0])
d['RecordDate'].value_counts()


In [None]:
# d['RecordDate']

In [None]:
# set index
d.rename(columns={'RecordDate':'trade_date','InfoTitle':'title','Content':'content'},inplace=True)
d['trade_date'] = d['trade_date'].map(lambda x : x.strftime("%Y-%m-%d"))
d.set_index(['stock_code','trade_date'],inplace=True)
d

In [None]:
# capm.index.values

In [None]:
# d.index.values

In [None]:
# 方向标记
# merge data together
dataset = pd.DataFrame([])

for index in ['000001.SH','399107.SZ']:
    table_column_dict = {'capm': 
                         ['return_adj_d001',
                          'car_hs300_b30_d001']}

    capm = m.query_trading_data(index_code=index,
                                trade_date_list=[[fromtime,endtime]],
                                table_column_dict=table_column_dict)
    
    tmp = pd.merge(d,capm,left_index=True,right_index=True,how='inner')
    dataset = pd.concat([dataset,tmp]).drop_duplicates()
    print(tmp.shape)
    
print(dataset.shape)
dataset = dataset.dropna()

In [None]:
dataset

In [None]:
# label news according to markets data
def label_direction_3(trend):
    if abs(trend) <= 0.02:
        return 1
    return 2 if trend > 0 else 0

def label_direction_wy(trend):
    for index, i in enumerate([-0.005,0.005]):
        if trend < i:
            return index
    return index+1

def label_direction_5(trend):
    for index, i in enumerate([-0.02,-0.005,0.005,0.02]):
        if trend < i:
            return index
    #print(trend)
    return index+1
    


def label_direction_2(trend):
    return 0 if trend <= 0 else 1



dataset['label'] = dataset['return_adj_d001'].map(label_direction_2)
dataset

In [None]:
dataset['label'].value_counts()

# raw data preprocess

In [None]:
tmp = dataset[['ID','label','content']]

In [None]:
def get_Kth_paragraph(content,k=0):
    '''
    return Kth paragraph
    '''
    return content.split('\n')[k]

def clean_bracket(text):
    text =  text.replace('\n','')
    if len(text) <= MAX_SEQ_LENGTH:
        return text
    return re.sub('[\(\（]\S*?[\)\）]','',text)

def _getSummary(text):
    return '。'.join(HanLP.extractSummary(clean_bracket(text), 5))

tmp['content'] = tmp['content'].map(_getSummary)
tmp.shape

In [None]:
# uncheck_key_words('我买的股票都跌停了')

In [None]:
def uncheck_key_words(txt):
    key_words = ['跌停','跌幅','涨幅','涨停']
    #print(type(txt))
    for k in key_words:
        if txt.find(k) != -1:
            return False
    return True

def _k(x):
    return uncheck_key_words(x['content'])

logger.info(tmp.shape)
tmp = tmp[tmp['content'].apply(uncheck_key_words)]
logger.info(tmp.shape)

In [None]:
# balance train data
# 下采样平衡数据
# down sample
def lower_sample_data_by_sample(df,percent=1):
    most_data = df[df['label'] == 0]  # 多数类别的样本
    minority_data = df[df['label'] == 1]  # 少数类别的样本   
    #random sample most_data
    lower_data=most_data.sample(n=int(percent*len(minority_data)))   
    return (pd.concat([lower_data,minority_data]))
# tmp = lower_sample_data_by_sample(tmp)
# tmp

In [None]:
tmp.shape

In [None]:
a = tmp['label'].value_counts()
logger.info('\n'+ str(a))
acc_1class = a.max()/a.sum()
logger.info(acc_1class)

In [None]:
tmp['label'].mean()

In [None]:
# save raw training data
if not os.path.exists(data_dir):
    os.mkdir(data_dir)
tmp.to_csv(os.path.join(data_dir,fname),sep='\t',index = False)
tmp.to_csv(os.path.join(data_dir,"train.tsv"),sep='\t',index = False,header=None)
tmp.shape

## convert  .csv  to  examples  to  features

In [16]:


# The output directory where the fine-tuned model and checkpoints will be written.
# OUTPUT_DIR = f'{datapath}/{TASK_NAME}/'
OUTPUT_DIR = os.path.join(datapath,TASK_NAME)

# The directory where the evaluation reports will be written to.
# REPORTS_DIR = f'reports/{TASK_NAME}_evaluation_report/'

# This is where BERT will look for pre-trained models to load parameters from.
# CACHE_DIR = 'cache/'



TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 5
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1
OUTPUT_MODE = 'classification'
CONFIG_NAME = "config.json"


In [17]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines

class MultiClassificationProcessor(DataProcessor):
    """Processor for binary classification dataset."""
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
    
    def get_test_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[2]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

In [18]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if OUTPUT_MODE == "classification":
            label_id = label_map[example.label]
        elif OUTPUT_MODE == "regression":
            label_id = float(example.label)
        else:
            raise KeyError(OUTPUT_MODE)
            
        if ex_index < 5: # show examples like dataframe.head(5)
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


In [19]:
# load .tsv data
# dat = pd.read_csv(f'{datapath}/{fname}',delimiter="\t")
# dat

Unnamed: 0,ID,label,content
0,505245187020,1,浦发银行5日傍晚发布其2015年度业绩快报。2015年浦发银行实现营业收入1。上一年度每股收...
1,505245187037,1,同比增长18.97%。公司实现营业收入1465.43亿元。
2,505245187012,1,较上年末同比增长21.14%。 中国证券网讯 浦发银行1月4日晚间领衔披露沪市两市2015...
3,505251238389,1,较上年末同比增长21.14%。 【财经网讯】浦发银行1月4日晚间领衔披露沪深两市2015年...
4,505257374309,1,浦发银行实现归属于母公司股东的净利润505.98亿元。资产规模较2014年末增长20.19%...
5,505274641771,1,2015年浦发银行的资产规模和营业收入均达到两位数增长。资产规模较2014年末增长20.19...
6,505370423700,0,国金证券银行业分析师马鲲鹏则表示。同比增长7.6%。这也是该行为2016年供给侧改革将驱...
7,505533632486,0,资产规模较2014年末增长20.19%。浦发银行在支持实体经济增长的过程中。金融市场业务实现...
8,505533632480,0,为用户的移动支付带来全新的便捷体验与更高的金融安全。浦发银行长期以来专注于移动支付创新的服务...
9,506138464264,1,浦发银行第九次全行志愿者日活动。 本次志愿者活动以。 作为业内率先开展全行志愿者日活动并...


In [20]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained(os.path.join('/home/wy506wd/download/',BERT_MODEL))
processor = MultiClassificationProcessor()

# convert examples 2 features
train_examples = processor.get_train_examples(data_dir)
train_examples_len = len(train_examples)
label_list = processor.get_labels() # [0, 1] for binary classification
num_labels = len(label_list)

num_train_optimization_steps = int(
    train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

train_features = convert_examples_to_features(train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

11/04/2019 21:46:00 - INFO - pytorch_transformers.tokenization_utils -   Model name '/home/wy506wd/download/chinese_wwm_ext_pytorch' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc). Assuming '/home/wy506wd/download/chinese_wwm_ext_pytorch' is a path or url to a directory containing tokenizer files.
11/04/2019 21:46:00 - INFO - pytorch_transformers.tokenization_utils -   Didn't find file /home/wy506wd/download/chinese_wwm_ext_pytorch/added_tokens.json. We won't load it.
11/04/2019 21:46:00 - INFO - pytorch_transformers.tokenization_utils -   Didn't find file /home/wy506wd/download/chinese_wwm_ext_pytorch/spe

11/04/2019 21:46:01 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11/04/2019 21:46:01 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11/04/2019 21:46:01 - INFO - __main__ -   label: 1 (id = 1)


# create  model

In [21]:
class BertLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root).
        """
        super(BertLayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias

class BertForSequenceClassification(nn.Module):
    def __init__(self, num_labels=2):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained(model_dir)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits
    
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True

In [22]:
config = BertConfig(os.path.join(model_dir, "config.json"))
#os.path.join(model_dir, "train.tsv")
model = BertForSequenceClassification(num_labels)

11/04/2019 21:46:33 - INFO - pytorch_transformers.modeling_utils -   loading configuration file /home/wy506wd/download/chinese_wwm_ext_pytorch/config.json
11/04/2019 21:46:33 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 21128
}

11/04/2019 21:46:33 - INFO - pytorch_transformers.modeling_utils -   loading

In [23]:
device = torch.device("cuda:5" if torch.cuda.is_available() else "cpu")
model.to(device)

#model.freeze_bert_encoder()
#model.classifier.weight.requires_grad = True

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_aff

In [24]:
learning_rate = 5e-5
adam_epsilon = 1e-8
warmup_steps = 1
weight_decay = 0.01


In [25]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0

In [26]:
logger.info("***** Running training *****")
logger.info("  Num examples = %d", train_examples_len)
logger.info("  Batch size = %d", TRAIN_BATCH_SIZE)
logger.info("  Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)


11/04/2019 21:46:47 - INFO - __main__ -   ***** Running training *****
11/04/2019 21:46:47 - INFO - __main__ -     Num examples = 44091
11/04/2019 21:46:47 - INFO - __main__ -     Batch size = 16
11/04/2019 21:46:47 - INFO - __main__ -     Num steps = 13775


# Training

In [27]:
# train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

In [28]:
def train_func(sub_train_,model):
    model.train()
    # Train the model
    train_loss = 0
    train_acc = 0
    labels = []
    preds = []
    
    train_sampler = RandomSampler(sub_train_)
    data = DataLoader(sub_train_, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

    for i, batch in enumerate(tqdm(data,desc='TRAIN')):
        optimizer.zero_grad()
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        #print(input_ids)
        output = model(input_ids, segment_ids, input_mask, labels=None)
        loss = criterion(output, label_ids)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == label_ids).sum().item()
        #print(f'\r{i*TRAIN_BATCH_SIZE/len(sub_train_)}')
        
        labels += label_ids.cpu().numpy().tolist()
        preds += output.argmax(1).cpu().numpy().tolist()
        #break
    # Adjust the learning rate
    scheduler.step()
    # eval
    result = get_eval_report(TASK_NAME, labels, preds)
    result['loss'] = train_loss / len(sub_train_)
    result['acc'] = train_acc / len(sub_train_)
    show_eval_report(result,'train')
    return result


from sklearn.metrics import matthews_corrcoef, confusion_matrix, multilabel_confusion_matrix

def get_eval_report(task_name, labels, preds):
    assert len(preds) == len(labels)
    # processor.get_labels()
    mcm = multilabel_confusion_matrix(labels, preds, labels=list(range(num_labels)))
    return {
        "task": task_name,
        "mcm": mcm
    }

def show_eval_report(result,name = ''):
    logger.info("***** Eval %s results *****" %name )
    for key in (result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
        
def test(data_, model):
    model.eval()
    total_loss = 0
    total_acc = 0
    labels = []
    preds = []
    sampler = SequentialSampler(data_)
    data = DataLoader(data_, sampler=sampler, batch_size=TRAIN_BATCH_SIZE)
    for batch in data:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        with torch.no_grad():
            output = model(input_ids, segment_ids, input_mask, labels=None)
            loss = criterion(output, label_ids)
            total_loss += loss.item()
            total_acc += (output.argmax(1) == label_ids).sum().item()
        labels += label_ids.cpu().numpy().tolist()
        #print(output.argmax(1).cpu().numpy().tolist())
        preds += output.argmax(1).cpu().numpy().tolist()
    
    result = get_eval_report(TASK_NAME, labels, preds)
    result['loss'] = total_loss / len(data_)
    result['acc'] = total_acc / len(data_)
    show_eval_report(result,'test')      
    return result

In [29]:
import time
from torch.utils.data.dataset import random_split



train_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

train_len = int(len(train_dataset) * 0.9)
test_len = len(train_dataset) - train_len
sub_train_, sub_test_ = random_split(train_dataset, [train_len, test_len])

min_test_loss = float('inf')
best_acc = -float('inf')

train_results = []
test_results = []

In [30]:
pd.DataFrame(all_label_ids.cpu().numpy())[0].value_counts(sort=False,normalize=True)

0    0.604069
1    0.395931
Name: 0, dtype: float64

In [31]:
# LOSS function
criterion = torch.nn.CrossEntropyLoss(weight=torch.Tensor([4,6])).to(device)
# criterion = torch.nn.CrossEntropyLoss().to(device)
# Prepare optimizer and schedule (linear warmup and decay)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)

#print(optimizer)
#model.freeze_bert_encoder()


In [32]:
#print([param.requires_grad for param in model.bert.parameters()])

In [33]:
N_EPOCHS = 3
for epoch in range(N_EPOCHS):
    logger.info('_____ Epoch %s begin _____'% epoch)
    res = train_func(sub_train_,model)
    train_results.append(res)
    
    res = test(sub_test_,model)
    test_results.append(res)
    test_acc = res['acc']
    if test_acc > best_acc:
        logger.info("@@@@@ Save best model @@@@@")
        logger.info('file_path = %s'% output_model_file)
        best_acc = test_acc
        acc_idx = len(test_results)-1
        torch.save(model, output_model_file)
        #model.config.to_json_file(output_config_file)
        #tokenizer.save_vocabulary(OUTPUT_DIR)
logger.info("_____ Train finish _____")


11/04/2019 21:46:48 - INFO - __main__ -   _____ Epoch 0 begin _____
TRAIN: 100%|██████████| 2481/2481 [09:32<00:00,  3.22it/s]
11/04/2019 21:56:21 - INFO - __main__ -   ***** Eval train results *****
11/04/2019 21:56:21 - INFO - __main__ -     task = 2c_wash
11/04/2019 21:56:21 - INFO - __main__ -     mcm = [[[15652    28]
  [23957    44]]

 [[   44 23957]
  [   28 15652]]]
11/04/2019 21:56:21 - INFO - __main__ -     loss = 0.07767115867680766
11/04/2019 21:56:21 - INFO - __main__ -     acc = 0.39555454751644364
11/04/2019 21:56:39 - INFO - __main__ -   ***** Eval test results *****
11/04/2019 21:56:39 - INFO - __main__ -     task = 2c_wash
11/04/2019 21:56:39 - INFO - __main__ -     mcm = [[[1776    1]
  [2633    0]]

 [[   0 2633]
  [   1 1776]]]
11/04/2019 21:56:39 - INFO - __main__ -     loss = 0.07555383591154535
11/04/2019 21:56:39 - INFO - __main__ -     acc = 0.40272108843537413
11/04/2019 21:56:39 - INFO - __main__ -   @@@@@ Save best model @@@@@
11/04/2019 21:56:39 - INFO - _

In [None]:
from matplotlib import pyplot as plt 
#%matplotlib notebook 

  
def show_map(indicator):
    x = np.arange(0,len(train_results)) 
    y_train = [res[indicator] for res in train_results]
    y_test = [res[indicator] for res in test_results]
    plt.title(f"{indicator} trend") 
    plt.xlabel("Epoch") 
    plt.ylabel(indicator) 
    if indicator != 'loss':
        plt.ylim((0,1.0))
    plt.plot(x,y_train,color='red',label='train') 
    plt.plot(x,y_test,color='blue',label='test')
    plt.legend()
    plt.show()


def get_precision(np):
    tn, fp, fn, tp = np
    return tp/(tp+fp)

def get_recall(np):
    tn, fp, fn, tp = np
    return tp/(tp+fn)

def get_F1_score(np):
    tn, fp, fn, tp = np
    P_precision = tp/(tp+fp)
    P_recall = tp/(tp+fn)
    return 2*(P_precision*P_recall)/(P_precision+P_recall)

eval_functions = {
    'precision' : get_precision,
    'recall' : get_recall,
    'F1_score' : get_F1_score
}
def plt_class(k):
    x = np.arange(0,len(train_results)) 
    for name, func in eval_functions.items():
        y_train = [func(res['mcm'][k].ravel()) for res in train_results]
        y_test = [func(res['mcm'][k].ravel()) for res in test_results]
        plt.title(f"CLASS {k} {name}") 
        plt.xlabel("Epoch") 
        plt.ylabel(f'{name}') 
        plt.ylim((0,1.0))
        plt.plot(x,y_train,color='red',label='train') 
        plt.plot(x,y_test,color='blue',label='test')
        plt.legend()
        plt.show()
        

# logger.info(acc_1class)
show_map('acc')
show_map('loss')



for i in range(2):
    plt_class(i)

In [40]:

# print('true_label')
# _, _, o = dev_dataset
# b = pd.DataFrame(o.cpu().numpy())
# print(b[0].value_counts(sort=False, normalize=True).sort_index())

# print('model_preds')
# a = pd.DataFrame(preds)
# print(a[0].value_counts(sort=False,normalize=True).sort_index())

# acc_idx
idx = acc_idx
print('dev')
print('acc = %.3f'% test_results[idx]['acc'])
for i in range(2):
    print('class %s Precision = %.3f'%(i,get_precision(test_results[idx]['mcm'][i].ravel())))


dev
acc = 53.1%
class 0 Precision = 55.8%
class 1 Precision = 51.6%


# save trained model

In [None]:
savename = 'unfreeeze_summary'
assert False

# save model
# model_to_save = model  # Only save the model it-self

# If we save using the predefined names, we can load using `from_pretrained`
# output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
# output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)

torch.save(model, os.path.join(OUTPUT_DIR, f'{savename}.bin'))


In [None]:
# load model 
swing_model = torch.load(output_model_file)
#swing_model = torch.load(os.path.join(OUTPUT_DIR, '8E_W.bin'))
swing_model.to(device)

# # test model
# valid_loss, valid_acc = test(sub_valid_, model)
# print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

# output result

In [None]:
def tsv_2_dataset():
    # Load pre-trained model tokenizer (vocabulary)
    logger.info('Load pre-trained model tokenizer (vocabulary)')
    tokenizer = BertTokenizer.from_pretrained(f'/home/wy506wd/download/{BERT_MODEL}/')
    processor = MultiClassificationProcessor()

    # convert examples 2 features
    logger.info('convert examples 2 features')
    dev_examples = processor.get_train_examples(data_dir)
    dev_examples_len = len(dev_examples)
    label_list = processor.get_labels() # [0, 1] for binary classification
    num_labels = len(label_list)
    dev_features = convert_examples_to_features(dev_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)

    if OUTPUT_MODE == "classification":
        all_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)
    elif OUTPUT_MODE == "regression":
        all_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.float)



In [None]:

# def old_predict(model):
#     all_predict_ids = []
#     #model.cpu()
#     model.to(device)
#     model.eval()
#     all_input_ids1 = all_input_ids.to(device)
#     all_segment_ids1 = all_segment_ids.to(device)
#     all_input_mask1 = all_input_mask.to(device)
#     all_label_ids1 = all_label_ids.to(device)
#     #print(all_input_ids1.is_cuda)
#     with torch.no_grad():
#         output = model(all_input_ids1, all_segment_ids1, all_input_mask1, labels=None)
#     return output.argmax(1)

def predict(data_, model):
    model.eval()

    preds = []
    sampler = SequentialSampler(data_)
    data = DataLoader(data_, sampler=sampler, batch_size=TRAIN_BATCH_SIZE)
    for batch in tqdm(data):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        with torch.no_grad():
            output = model(input_ids, segment_ids, input_mask, labels=None)
            
        preds += output.argmax(1).cpu().numpy().tolist()
    return preds

train_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
all_text_a = [e.text_a for e in train_examples]
all_predict_ids = predict(train_dataset,model)
ans = pd.DataFrame({'text':all_text_a, 'label':all_label_ids, 'predict': all_predict_ids})
ans






In [None]:
print(type(ans['label'][0]))

In [None]:
from pyhanlp import *
import collections

NShortSegment = JClass("com.hankcs.hanlp.seg.NShort.NShortSegment")
ViterbiSegment = JClass("com.hankcs.hanlp.seg.Viterbi.ViterbiSegment")

nshort_segment = NShortSegment().enableCustomDictionary(False).enablePlaceRecognize(
    True).enableOrganizationRecognize(True)
shortest_segment = ViterbiSegment().enableCustomDictionary(
    False).enablePlaceRecognize(True).enableOrganizationRecognize(True)

# a = nshort_segment.seg(sentence)
# print([i.word for i in a])

CoreStopWordDictionary = JClass("com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary")
# CoreStopWordDictionary.apply(a)
# print([i.word for i in a])

# counts = [collections.Counter() for _ in range(2)]
counts = collections.Counter()

def count_words(text):
    words = nshort_segment.seg(text)
    CoreStopWordDictionary.apply(words)
    word_list = [w.word for w in words]
    counts.update(word_list)
    return word_list

topN = 128

#ans.apply(count_words)
ans['text'].map(count_words)
print(len(counts))
total_counts = counts
common_words = {i[0] for i in counts.most_common(topN)}
print('common_words:')
print(common_words)
predict_words = []
for i in range(2):
    counts = collections.Counter()
    ans[ans['predict']==i]['text'].map(count_words)
    predict_words.append({i[0] for i in counts.most_common(topN)})
    print(f'predict = {i}')
    print(predict_words[i] - common_words)

In [None]:
# cross
print(predict_words[0] - predict_words[1])
print(predict_words[1] - predict_words[0])

In [None]:
common_words = {i[0] for i in counts.most_common(64)}
common_words

In [None]:
# from pprint import pprint
print(counts.most_common(120))

In [None]:
wrong = ans[ans['label']!=ans['predict']]
wrong.to_csv(os.path.join(data_dir, f"{savename}_wrong.tsv"),sep='\t')
wrong

In [None]:
swing = ans[ans['predict']==1]
swing.to_csv(os.path.join(data_dir, f"{savename}_swing.tsv"),sep='\t')
swing

In [None]:
ans.to_csv(os.path.join(data_dir, f"{savename}_predict.tsv"),sep='\t')

# 实验

In [None]:
list_of_labels = [torch.from_numpy(np.array(1))]
list_of_labels