In [None]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 4.4 MB/s 
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9


In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import re
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import random

In [None]:
# Locations of every file the notebook reads, keyed by a short name.
# All paths are relative to the working directory; the 'drive/MyDrive' prefix
# suggests a mounted Google Drive in Colab -- confirm for your environment.
path = {
    # Excel keyword tables (chemicals, crops, pests) fed to make_keywords_dict().
    'keyword_chem':'drive/MyDrive/AI-Cup/Train data/Keywords/02chem.list.xlsx',
    'keyword_crop':'drive/MyDrive/AI-Cup/Train data/Keywords/02crop.list.xlsx',
    'keyword_pest':'drive/MyDrive/AI-Cup/Train data/Keywords/02pest.list.xlsx',
    # Directory of training documents (one file per document) and the CSV of
    # labelled (Test, Reference) pairs.
    'train_dataset':'drive/MyDrive/AI-Cup/Train data/dataTrainComplete/',
    'train_label':'drive/MyDrive/AI-Cup/Train data/TrainLabel.csv',
    # Directory of public (unlabelled) documents.
    'public_dataset':'drive/MyDrive/AI-Cup/public data/dataPublicComplete/',
    # The three entries below are not referenced by any cell visible in this notebook.
    'public_BM25':'drive/MyDrive/AI-Cup/public_data_BM25.csv',
    'process_train_data':'drive/MyDrive/AI-Cup/train_data.csv',
    'process_public_data':'drive/MyDrive/AI-Cup/public_data.csv',
}
# Pretrained Chinese BERT tokenizer and sequence-classification model; the
# classification head is freshly initialised (see the warning printed below).
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
bert_model = BertForSequenceClassification.from_pretrained("bert-base-chinese")
# Token budget for one encoded document pair; get_keys_clean() also derives its
# per-document character limit from this value (max_length//2-2).
max_length = 512

Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Data

In [None]:
def make_keywords_dict(df):
    """Register one keyword table in the module-level keyword stores.

    Every row of ``df`` is read strictly by position: the first cell is the
    canonical keyword and the cells after it are its aliases. The canonical
    keyword is appended to the global ``keywords`` list, and every cell of the
    row up to (not including) the first missing value is mapped to the
    canonical keyword in the global ``keywords_dict``. Because the first cell
    is part of that scan, the canonical keyword also maps to itself.

    Rows whose first cell is missing are skipped, so blank spreadsheet rows do
    not put NaN into ``keywords`` (a NaN entry would make the later
    ``k in txt`` substring test raise TypeError).

    Args:
        df: pandas DataFrame with one keyword group per row. Index and column
            labels are ignored.

    Returns:
        None. ``keywords`` and ``keywords_dict`` are mutated in place.
    """
    n_rows, n_cols = df.shape
    if n_cols == 0:
        return

    for row in range(n_rows):
        canonical = df.iloc[row, 0]
        if pd.isna(canonical):
            # Blank row: nothing to register.
            continue
        keywords.append(canonical)

        for col in range(n_cols):
            value = df.iloc[row, col]
            # Aliases are left-packed; the first empty cell ends the row.
            if pd.isna(value):
                break
            keywords_dict[value] = canonical

# Start from empty stores, then fold the chemical, crop and pest tables into
# them in that order (make_keywords_dict mutates both globals in place).
keywords = []
keywords_dict = {}
for keyword_table in ('keyword_chem', 'keyword_crop', 'keyword_pest'):
    make_keywords_dict(pd.read_excel(path[keyword_table], index_col=None, header=None))

In [None]:
# Load the labelled (Test, Reference) pairs and keep them as a list of
# two-element lists, one per CSV row.
label_table = pd.read_csv(path['train_label'])
test_ids = label_table['Test']
reference_ids = label_table['Reference']
train_label = [[test_ids[row], reference_ids[row]] for row in range(len(test_ids))]
print(train_label[:5], '\n')

[[3, 415], [3, 649], [9, 5], [25, 32], [25, 41]] 



In [None]:
def get_keys_clean(txt):
    """Clean one raw document and condense it around the known keywords.

    Steps, in order:
      1. Remove URLs / domain-like tokens and phone-number-like digit groups.
      2. Collapse every remaining run of digits to the single character "1".
      3. Remove a fixed download-link boilerplate string and punctuation.
      4. Replace every alias found in ``keywords_dict`` by its canonical
         keyword, in a single left-to-right pass that prefers the longest
         alias at each position.
      5. Collect the canonical ``keywords`` that occur in the text.
      6. Keep only the text segments that contain at least one of them.

    Reads the module-level ``keywords``, ``keywords_dict`` and ``max_length``.

    Args:
        txt: raw document text.

    Returns:
        A tuple ``(keys, text)``. ``keys`` is the list of canonical keywords
        found, in ``keywords`` order. ``text`` is the keywords joined by "、",
        then "。", then the kept segments concatenated, truncated to
        ``max_length//2-2`` characters.
    """
    # URLs and bare domain-like tokens.
    txt = re.sub(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*", "", txt)
    # Phone-number-like digit groups, e.g. 02-1234-5678 or (02)12345678.
    txt = re.sub(r"(\d{2,3}-?|\(\d{2,3}\))\d{3,4}-?\d{4}", "", txt)
    # Every remaining number becomes "1".
    txt = re.sub(r"\d+", "1", txt)
    # Boilerplate left over from a download-link block (digits already collapsed).
    txt = txt.replace('/fontdivid=filelink1fontsize=1face="新細明體"請點選檔名開啟或下載(依檔案大小開啟速度不同)：', "")
    # Bracketed / hash-delimited spans and punctuation.
    # NOTE(review): the first character class previously contained the literal
    # text of an e-mail-obfuscation placeholder, which closed the class early
    # and made it delete several Latin letters. The characters restored here
    # (curly quotes, '?' and '@') are a best-effort reconstruction -- confirm
    # against the original notebook.
    txt = re.sub(r"\【.*?】+|\《.*?》+|\#.*?#+|[.!/_,$&%^*()<>+‘’?@|:~{}#]+|[——！\，。=？、：“”‘’￥……（）《》【】]", "", txt)

    # Alias -> canonical keyword, in ONE pass. Replacing alias by alias would
    # re-expand an alias that is a substring of a canonical keyword already in
    # the text, duplicating characters. Only aliases present in the text are
    # put in the pattern, which keeps it small.
    present = [alias for alias in keywords_dict if alias and alias in txt]
    if present:
        # Longest first, so the alternation prefers the longest alias.
        present.sort(key=len, reverse=True)
        alias_pattern = "|".join(re.escape(alias) for alias in present)
        txt = re.sub(alias_pattern, lambda match: keywords_dict[match.group(0)], txt)

    keys = [k for k in keywords if k in txt]

    # '，' and '。' were already removed by the punctuation pass above, so in
    # practice this splits on newlines only.
    segments = re.split('，|。|\n', txt)
    kept = [segment for segment in segments if any(k in segment for k in keys)]

    condensed = "、".join(keys) + "。" + "".join(kept)

    # Two documents are later encoded as one pair within max_length tokens.
    return keys, condensed[:max_length//2-2]

def get_data(datapath):
    """Load and preprocess every document file in one dataset directory.

    Args:
        datapath: key into the module-level ``path`` dict naming a directory
            whose files are called ``<integer id>.<extension>``.

    Returns:
        dict mapping the integer document id to the ``(keys, text)`` tuple
        returned by ``get_keys_clean``. Entries are inserted in lexicographic
        file-name order.

    Raises:
        ValueError: if a file name without its extension is not an integer.
    """
    folder = path[datapath]
    data = {}
    for name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, name)
        # Skip sub-directories (e.g. notebook checkpoint folders).
        if not os.path.isfile(file_path):
            continue
        stem, _extension = os.path.splitext(name)
        doc_id = int(stem)
        # Close the handle deterministically; decode explicitly so the result
        # does not depend on the platform's default encoding.
        # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
        with open(file_path, encoding='utf-8') as handle:
            raw_text = handle.read()
        data[doc_id] = get_keys_clean(raw_text)
    return data

# Parse both corpora into {document id: (keywords, condensed text)} dictionaries.
train_data = get_data('train_dataset')
public_data = get_data('public_dataset')

In [None]:
# Spot-check test document 3 and the two references it is labelled with
# (415 and 649) in the training labels printed earlier.
for doc_id in (3, 415, 649):
    print(train_data[doc_id])

(['性費洛蒙', '蘇力菌', '微生物製劑', '青蔥', '蔬菜', '斜紋夜蛾', '甜菜夜蛾', '蛾類', '夜蛾類'], '性費洛蒙、蘇力菌、微生物製劑、青蔥、蔬菜、斜紋夜蛾、甜菜夜蛾、蛾類、夜蛾類。夏季為斜紋夜盜及甜菜夜蛾發生盛期請加強注意及時防治避免損失夏日乾燥炎熱的氣候正是許多農作物害蟲好發的時機花蓮區農業改良場籲請農民注意斜紋夜盜及甜菜夜蛾的防治以免造成作物損失隨著氣溫增高許多農作物害蟲也隨之變得活躍由監測數據顯示宜蘭地區開始出現夜蛾類害蟲的高峰其中包括斜紋夜蛾和甜菜夜蛾斜紋夜蛾一般人稱「斜紋夜蛾」其食性幾乎是見綠就吃是種植蔬菜的農友最傷腦筋的害蟲之一另外青青蔥產區除了斜紋夜蛾之外還有甜菜夜蛾危害孵化的幼蟲藏匿於青蔥')
(['性費洛蒙', '蘇力菌', '窄域油', '青蔥', '甘藷', '玉米', '大蒜', '茼蒿', '落花生', '田菁', '毛豆', '豌豆', '蓮花', '花卉', '葉菜類', '綠肥作物', '斜紋夜蛾', '甜菜夜蛾', '白粉病', '蛾類', '夜蛾類'], '性費洛蒙、蘇力菌、窄域油、青蔥、甘藷、玉米、大蒜、茼蒿、落花生、田菁、毛豆、豌豆、蓮花、花卉、葉菜類、綠肥作物、斜紋夜蛾、甜菜夜蛾、白粉病、蛾類、夜蛾類。目前氣候乾燥及正值季節轉換期間田間夜蛾類害蟲密度逐漸上升尤以落落花生田區發生嚴重同時天氣漸轉冷涼早晚溫差變大白粉病發生機率提高為確保作物正常生長及良好收成臺南區農業改良場籲請農友注意夜蛾類害蟲及白粉病防治常見危害夜蛾有斜紋夜蛾及甜菜夜蛾兩者幼蟲皆晝伏夜出雜食性可危害作物如葉菜類茼蒿落落花生田菁毛豆豌豆大蒜青青蔥甘藷玉米蓮花及花卉等數十種作物幼蟲食量極大')
(['性費洛蒙', '蘇力菌', '青蔥', '甘藷', '玉米', '大蒜', '茼蒿', '落花生', '田菁', '毛豆', '豌豆', '蓮花', '番茄', '花卉', '葉菜類', '雜糧', '園藝作物', '斜紋夜蛾', '甜菜夜蛾', '螟蛾', '斑螟蛾', '蛾類', '番茄夜蛾', '夜蛾類'], '性費洛蒙、蘇力菌、青蔥、甘藷、玉米、大蒜、茼蒿、落花生、田菁、毛豆、豌豆、蓮花、番茄、花卉、葉菜類、雜糧、園藝作物、斜紋夜蛾、甜菜夜蛾、螟蛾、斑螟蛾、蛾類、番茄夜蛾、夜蛾類。斜紋夜蛾密度升高為確保作物收成台南區農改場籲請

In [None]:
def get_coarse_label(data):
    """Generate candidate document pairs from keyword overlap.

    For every ordered pair of distinct documents ``(i, j)``, count how many
    entries of ``i``'s keyword list also occur in ``j``'s keyword list. The
    pair is kept when that count is at least
    ``min(len(keys_i), len(keys_j)) * 9 // 10``.

    The threshold uses floor division, so it is 0 whenever the shorter keyword
    list has fewer than two entries; such pairs are always kept. This is the
    existing behaviour and is preserved.
    NOTE(review): confirm whether a strict 90% overlap was intended.

    Args:
        data: dict mapping a document id to a tuple whose first element is the
            document's list of keywords.

    Returns:
        List of ``[i, j]`` pairs, ordered by ``data`` iteration order.
    """
    # One set per document makes each membership test O(1) inside the
    # all-pairs loop.
    key_sets = {doc_id: set(entry[0]) for doc_id, entry in data.items()}

    label = []
    for i in data:
        keys_i = data[i][0]
        for j in data:
            # Compare ids by value, not identity.
            if i == j:
                continue
            other_keys = key_sets[j]
            # Iterate the list (not the set) so repeated keywords still count
            # once per occurrence.
            shared = sum(1 for k in keys_i if k in other_keys)
            if shared >= min(len(keys_i), len(data[j][0])) * 9 // 10:
                label.append([i, j])
    return label

# Build the keyword-overlap candidate pairs for each corpus; only these pairs
# are later scored by the classifier.
train_clabel = get_coarse_label(train_data)
public_clabel = get_coarse_label(public_data)
print('train_clabel: ', len(train_clabel), ', public_clabel: ', len(public_clabel))

train_clabel:  16460 , public_clabel:  7744


In [None]:
# Candidate pairs that are also labelled pairs, i.e. the positives the
# candidate generator managed to keep. Pairs are looked up as tuples in a set
# so each test is O(1) rather than a scan of every labelled pair.
train_label_lookup = {tuple(pair) for pair in train_label}
true_label = [L for L in train_clabel if tuple(L) in train_label_lookup]
print('train_clabel: ', len(train_clabel), ', train_label: ', len(train_label), ', true_label: ', len(true_label))

train_clabel:  16460 , train_label:  1383 , true_label:  1343


In [None]:
class Dataset(torch.utils.data.Dataset):
    """Candidate document pairs encoded for BERT sequence-pair classification.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset``. Inheriting from the bare name would make
    the class subclass its own previous definition whenever the cell is re-run.

    Args:
        data: dict mapping a document id to ``(keywords, text)``; only the
            text (index 1) is used.
        label: iterable of labelled ``[test, reference]`` pairs, or ``None``
            for unlabelled data.
        clabel: list of candidate ``[test, reference]`` pairs; one item each.
        tokenizer: object exposing ``encode_plus`` (a BERT tokenizer).

    Each item is ``(input_ids, token_type_ids, y)``. ``y`` is 1 when the
    candidate pair is a labelled pair, otherwise 0; it is always 0 when
    ``label`` is ``None``. Reads the module-level ``max_length``.
    """

    def __init__(self, data, label, clabel, tokenizer):
        self.data = data
        self.label = label
        self.clabel = clabel
        self.tokenizer = tokenizer
        self.len = len(self.clabel)
        # Labelled pairs as a set of tuples, for O(1) lookup per item.
        self._labelled_pairs = None if label is None else {tuple(pair) for pair in label}

    def __getitem__(self, idx):
        test, refer = self.clabel[idx]

        # Use the tokenizer given to this instance, not a module global.
        x_token = self.tokenizer.encode_plus(
            self.data[test][1], self.data[refer][1],
            add_special_tokens=True, max_length=max_length,
            truncation=True, padding='max_length')
        x_input = x_token['input_ids']
        x_type = x_token['token_type_ids']

        if self._labelled_pairs is None:
            # Unlabelled data: a dummy label keeps the item shape uniform.
            return (x_input, x_type, 0)

        y = 1 if (test, refer) in self._labelled_pairs else 0
        return (x_input, x_type, y)

    def __len__(self):
        return self.len

# Shuffle the training candidates once, in place (no seed is set, so the order
# differs between runs). public_clabel is left in its original order.
random.shuffle(train_clabel)
# Training items get a 0/1 label from train_label; label=None makes every
# public item carry the dummy label 0.
train_dataset = Dataset(data=train_data, label=train_label, clabel=train_clabel, tokenizer=tokenizer)
public_dataset = Dataset(data=public_data, label=None, clabel=public_clabel, tokenizer=tokenizer)

In [None]:
def get_batch(sample):
    """Collate dataset items into three batch tensors.

    Args:
        sample: sequence of ``(input_ids, token_type_ids, label)`` items.

    Returns:
        Tuple ``(x_input, x_type, y)`` of tensors, each built from the
        corresponding field of every item in ``sample``.
    """
    def stack_field(position):
        # Gather one field across all items and turn it into a tensor.
        return torch.tensor([item[position] for item in sample])

    return stack_field(0), stack_field(1), stack_field(2)

# Batches of 16 items, collated into tensors by get_batch. No shuffle argument
# is passed; the training cell maps prediction i back to public_clabel[i], so
# the public loader must keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=get_batch)
public_loader = DataLoader(public_dataset, batch_size=16, collate_fn=get_batch)

# Train

In [None]:
def predict(model, dataloader):
    """Run the classifier over a loader and count positive-class outcomes.

    The model is switched to evaluation mode for the duration of the call, so
    dropout does not perturb the predictions. Its previous train/eval mode is
    restored before returning.

    Args:
        model: module called as ``model(input_ids=..., token_type_ids=...)``
            that returns an object with a ``logits`` attribute of one row per
            example.
        dataloader: iterable of ``(x_input, x_type, y)`` tensor batches.

    Returns:
        Tuple ``(predictions, TP, FP)``:
          * ``predictions``: 1-D tensor of predicted class indices in loader
            order, on the model's device (an empty tensor for an empty loader).
          * ``TP``: examples predicted 1 whose label is 1.
          * ``FP``: examples predicted 1 whose label is not 1. The previous
            code counted missed positives (false negatives) under this name,
            which made ``TP / (TP + FP)`` something other than precision.
    """
    # Follow the model's device rather than assuming CUDA whenever available.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")

    was_training = model.training
    model.eval()

    batch_predictions = []
    TP = 0
    FP = 0
    try:
        with torch.no_grad():
            for data in dataloader:
                x_input, x_type, y = [t.to(device) for t in data[:3]]

                logits = model(input_ids=x_input, token_type_ids=x_type).logits
                # One vectorised argmax over the class dimension.
                pre = logits.argmax(dim=1)

                predicted_positive = pre == 1
                TP += int((predicted_positive & (y == 1)).sum())
                FP += int((predicted_positive & (y != 1)).sum())

                batch_predictions.append(pre)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    if batch_predictions:
        predictions = torch.cat(batch_predictions)
    else:
        predictions = torch.empty(0, dtype=torch.long, device=device)

    return predictions, TP, FP

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

optimizer = torch.optim.Adam(bert_model.parameters(), lr=1e-5)

EPOCHS = 3
for epoch in range(EPOCHS):
    # ---- one pass over the training candidates ----
    bert_model.train()
    running_loss = 0.0
    for data in train_loader:
        x_input, x_type, y = [t.to(device) for t in data]
        optimizer.zero_grad()

        outputs = bert_model(input_ids=x_input, token_type_ids=x_type, labels=y)

        # backward
        outputs.loss.backward()
        optimizer.step()

        # Sum (not mean) of the per-batch losses over the epoch.
        running_loss += outputs.loss.item()

    # ---- classification metrics on the training candidates ----
    # Switch to eval mode BEFORE measuring, so dropout is off for both the
    # training-set metrics and the public predictions below.
    bert_model.eval()
    _, TP, FP = predict(bert_model, train_loader)
    # Guard each ratio so an epoch with no positive predictions cannot crash.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    # Recall is measured against ALL labelled pairs, including those the
    # candidate generator never proposed.
    recall = TP / len(train_label) if len(train_label) else 0.0
    F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
    print('[epoch %d] loss: %.3f, precision: %.3f, recall: %.3f, F1: %.3f' %
          (epoch+1, running_loss, precision, recall, F1))

    # ---- predictions for the public candidates ----
    pre, _, _ = predict(bert_model, public_loader)
    # Convert once to a Python list instead of indexing the tensor per element.
    # Prediction i belongs to public_clabel[i] because the loader is unshuffled.
    accepted = [public_clabel[i] for i, flag in enumerate(pre.tolist()) if flag == 1]
    test = [pair[0] for pair in accepted]
    refer = [pair[1] for pair in accepted]
    print('test_len: ', len(test))
    output_dict = {'Test': test, 'Reference': refer}
    output_df = pd.DataFrame(output_dict)
    output_df.to_csv("submit" + str(epoch+1) + ".csv", index=False, sep=",")

    # Pickles the whole module; the same file is overwritten every epoch.
    torch.save(bert_model, 'model')

[epoch 1] loss: 129.856, precision: 0.841, recall: 0.817, F1: 0.829
test_len:  615
[epoch 2] loss: 69.328, precision: 0.948, recall: 0.920, F1: 0.934
test_len:  728
[epoch 3] loss: 50.442, precision: 0.953, recall: 0.926, F1: 0.939
test_len:  687
