In [1]:
import json
import os
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
import random
import re

In [2]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# 数据预处理

In [4]:
tokenizer = T5Tokenizer.from_pretrained('./t5-base/')

In [5]:
# 获取字典
CV_topic_dic = {}
with open("./LectureBank/CV/CV.topics.tsv",'r',encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip().split("\t")
        CV_topic_dic[line[0]] = line[1]
CV_topic_dic

{'1': 'Artificial Intelligence',
 '2': 'Cognitive Science and Neuroscience',
 '3': 'Image Processing',
 '4': 'Computer graphics',
 '5': 'Pattern Recognition or Machine Learning',
 '6': 'Image Representation',
 '7': 'Stereo Matching and 3D Reconstruction',
 '8': 'Motion Detection and Tracking',
 '9': 'edge detection',
 '10': 'segmentation',
 '11': 'contour and silhouette',
 '12': 'texture',
 '13': 'feature extraction',
 '14': 'local features or blob',
 '15': 'Camera calibration or resectioning',
 '16': 'Image Registration',
 '17': 'feature matching',
 '18': 'Background Subtraction',
 '19': 'background modeling and update',
 '20': 'color space',
 '21': 'Hue',
 '22': 'Saturation',
 '23': 'Color Constancy',
 '24': 'illumination',
 '25': 'Reflectance Model',
 '26': 'Shading Analysis',
 '27': 'Imaging Geometry and Physics',
 '28': 'Perspective projection',
 '29': 'radiance',
 '30': 'irradiance',
 '31': 'intensity',
 '32': 'diffuse surface',
 '33': 'Specular Surfaces',
 '34': 'interreflection

In [6]:
# 获取字典
BIO_topic_dic = {}
with open("./LectureBank/BIO/BIO.topics.tsv",'r',encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip().split("\t")
        BIO_topic_dic[line[0]] = line[1]
BIO_topic_dic

{'1': 'conservation',
 '2': 'central dogma',
 '3': 'transcription',
 '4': 'translation',
 '5': 'DNA',
 '6': 'RNA',
 '7': 'protein',
 '8': 'single nucleotide polymorphism',
 '9': 'quantitative trait loci',
 '10': 'isoform',
 '11': 'CpG island',
 '12': 'transcription factor',
 '13': 'sequence alignment',
 '14': 'genome assembly',
 '15': 'motif discovery',
 '16': 'gene finding',
 '17': 'molecular evolution',
 '18': 'genome-wide association studies',
 '19': 'protein secondary structure',
 '20': 'protein tertriary structure',
 '21': 'RNA secondary structure',
 '22': 'differential expression',
 '23': 'binding site',
 '24': 'regulatory network',
 '25': 'protein-protein interaction',
 '26': 'microarray',
 '27': 'RNA-seq',
 '28': 'ChIP-seq',
 '29': 'yeast 2-hybrid',
 '30': 'shotgun sequencing',
 '31': 'position weight matrix',
 '32': 'BLAST',
 '33': 'de Bruijin graph',
 '34': 'irreproducible discovery rate',
 '35': 'DESeq',
 '36': 'phylogenetic tree',
 '37': 'bayesian inference',
 '38': 'multiv

In [7]:
# with open("./wiki80/wiki80_train.txt",'r',encoding='utf-8') as f:
#         lines = f.readlines()
#         for line in lines:
#             data = json.loads(line)
#             h_entity_replace = randomLetter()
#             t_entity_replace = randomLetter()
#             plus_text = data['token'][0:data['h']['pos'][0]] + [h_entity_replace] +  data['token'][data['h']['pos'][1]:data['t']['pos'][0]] + [t_entity_replace] + data['token'][data['h']['pos'][1]:] 
#             plus_text = " ".join(plus_text)
#             plus_text = "extract relation: " + plus_text.lower() + " </s>"
#             rel_text = h_entity_replace.lower() + " - " +  data['relation'].lower() + ' - ' + t_entity_replace.lower() + " </s>"
#             print(plus_text)
#             print(rel_text)
#             break

In [8]:
def extract_data(filepath,tokenizer):
    origin_texts = []
    rel_texts = []
    max_input_len = 0
    max_output_len = 0
    with open(filepath,'r',encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip().split(",")
            lecture1 = CV_topic_dic[line[0]]
            lecture2 = CV_topic_dic[line[1]]
            text = "judge prerequisite: " + lecture1 + " " + lecture2 + " </s>"
            if line[2] == "0":
                rel_text = "0"
            elif line[2] == "1":
                rel_text = "1"
            #rel_text = lecture1 + " - " +  rel + ' - ' + lecture2 + " </s>"
            origin_texts.append(text)
            rel_texts.append(rel_text)
            tokenized_inp = tokenizer.encode_plus(text, return_tensors="pt")
            tokenized_output = tokenizer.encode_plus(rel_text,return_tensors="pt")
            input_ids  = tokenized_inp["input_ids"]
            output_ids  = tokenized_output["input_ids"]
            max_input_len = max(max_input_len, input_ids.shape[1])
            max_output_len = max(max_output_len, output_ids.shape[1])
            
            # 数据增强
            # 用两个随机字母替换两个实体
#             h_entity_replace = randomLetter()
#             t_entity_replace = randomLetter()
#             first_entity_pos = []
#             second_entity_pos = []
#             if data['h']['pos'][0] < data['t']['pos'][0]:
#                 first_entity_pos = data['h']['pos']
#                 second_entity_pos = data['t']['pos']
#             else:
#                 first_entity_pos = data['t']['pos']
#                 second_entity_pos = data['h']['pos']
#             plus_text = data['token'][0:first_entity_pos[0]] + [h_entity_replace] +  data['token'][first_entity_pos[1]:second_entity_pos[0]] + [t_entity_replace] + data['token'][second_entity_pos[1]:] 
#             plus_text = " ".join(plus_text)
#             plus_text = "extract relation: " + plus_text.lower() + " </s>"
#             rel_text = h_entity_replace.lower() + " - " +  data['relation'].lower() + ' - ' + t_entity_replace.lower() + " </s>"
#             print(plus_text)
#             print(rel_text)
#             origin_texts.append(plus_text)
#             rel_texts.append(rel_text)
    print(max_input_len)
    print(max_output_len)
    return origin_texts,rel_texts

In [9]:
def extract_val_data(filepath,tokenizer):
    origin_texts = []
    rel_texts = []
    max_input_len = 0
    max_output_len = 0
    with open(filepath,'r',encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip().split(",")
            lecture1 = BIO_topic_dic[line[0]]
            lecture2 = BIO_topic_dic[line[1]]
            text = "judge prerequisite: " + lecture1 + " " + lecture2 + " </s>"
            if line[2] == "0":
                rel_text = "0"
            elif line[2] == "1":
                rel_text = "1"
            # rel_text = lecture1 + " - " +  rel + ' - ' + lecture2
            origin_texts.append(text)
            rel_texts.append(rel_text)
    return origin_texts,rel_texts

In [10]:
class Extract_Dataset(Dataset):
    def __init__(self, filepaths, tokenizer,max_input_len,max_output_len):
        self.origin_texts, self.rel_texts = [],[]
        for filepath in filepaths:
            o,r = extract_data(filepath,tokenizer)
            self.origin_texts.extend(o)
            self.rel_texts.extend(r)
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len
        
    def __len__(self):
        return len(self.origin_texts)
    
    def __getitem__(self, index):
        tokenized_input = tokenizer.encode_plus(self.origin_texts[index], max_length=self.max_input_len, pad_to_max_length=True, return_tensors="pt")
        tokenized_output = tokenizer.encode_plus(self.rel_texts[index], max_length=self.max_output_len, pad_to_max_length=True, return_tensors="pt")
        
        input_ids  = tokenized_input["input_ids"].squeeze()
        attention_mask = tokenized_input["attention_mask"].squeeze()

        output_ids = tokenized_output["input_ids"].squeeze()
        decoder_attention_mask=  tokenized_output["attention_mask"].squeeze()
        
        data = {
            'input_ids':input_ids,
            'attention_mask':attention_mask,
            'output_ids':output_ids,
            'decoder_attention_mask':decoder_attention_mask
        }
        return data

# 训练

In [11]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model = T5ForConditionalGeneration.from_pretrained('./t5-base/').to(DEVICE)

In [12]:
# optimizer
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in t5_model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in t5_model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4)

In [13]:
def train(model, device, train_loader, optimizer):   # 训练模型
    model.train()
    with tqdm(total=len(train_loader)) as bar:
        for idx,data in enumerate(train_loader):
            input_ids, attention_mask, output_ids, decoder_attention_mask = data['input_ids'].to(device), data['attention_mask'].to(device), data['output_ids'].to(device), data['decoder_attention_mask'].to(device)
            output = model(input_ids=input_ids, labels=output_ids,decoder_attention_mask=decoder_attention_mask,attention_mask=attention_mask)
            loss = output[0]
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            bar.set_postfix(loss=loss.item())
            bar.update(1)

In [19]:
def val(model, tokenizer,device, filepaths):
    origin_texts, rel_texts = [], []
    for filepath in filepaths:
        o,r= extract_val_data(filepath,tokenizer)
        origin_texts.extend(o)
        rel_texts.extend(r)
    model.eval()
    TP = 0
    FP = 0
    FN = 0
    with tqdm(total=len(origin_texts)) as bar:
        for idx, text in enumerate(origin_texts):
            input_ids = tokenizer(text, return_tensors='pt').input_ids.to(DEVICE)
            outputs = t5_model.generate(input_ids)
            result = tokenizer.decode(outputs[0], skip_special_tokens=True)
            rel_text = rel_texts[idx]
#             print(rel_text)
#             print(result)
            if result == "1":
#                 print(1)
                if result == rel_text:
                    TP += 1
                else:
                    FP += 1
            else:
                if result != rel_text:
                    FN += 1
            bar.update(1)
    print(TP)
    print(FP)
    print(FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1 = 2 * precision * recall / (precision + recall)

    return precision,recall,F1

In [15]:
filepaths = ["./LectureBank/CV/train.0.csv","./LectureBank/CV/train.1.csv","./LectureBank/CV/train.2.csv","./LectureBank/CV/train.3.csv","./LectureBank/CV/train.4.csv"]
train_dataset = Extract_Dataset(filepaths=filepaths,tokenizer=tokenizer,max_input_len=40,max_output_len=40)
train_loader = DataLoader(train_dataset,shuffle=True,batch_size=32)



20
3
22
3
20
3
22
3
21
3


In [16]:
NUM_EPOCHES = 8
with open("./log(lecturebank-nonlabel).txt",'w',encoding='utf-8') as f:
    for epoch in range(NUM_EPOCHES):
        train(t5_model,DEVICE,train_loader,optimizer)
        precision,recall,F1 = val(t5_model,tokenizer,DEVICE,["./LectureBank/CV/val.0.csv","./LectureBank/CV/val.1.csv","./LectureBank/CV/val.2.csv","./LectureBank/CV/val.3.csv","./LectureBank/CV/val.4.csv"])
        f.write("EPOCH:" + str(epoch) + "\n")
        f.write("precision：" + str(precision) + "\n")
        f.write("recall：" + str(recall) + "\n")
        f.write("F1：" + str(F1) + "\n")

  0%|          | 0/232 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 232/232 [00:35<00:00,  6.54it/s, loss=0.0181]


TypeError: expected str, bytes or os.PathLike object, not list

In [20]:
precision,recall,F1 = val(t5_model,tokenizer,DEVICE,["./LectureBank/CV/val.0.csv","./LectureBank/CV/val.1.csv","./LectureBank/CV/val.2.csv","./LectureBank/CV/val.3.csv","./LectureBank/CV/val.4.csv"])
f.write("EPOCH:" + str(epoch) + "\n")
f.write("precision：" + str(precision) + "\n")
f.write("recall：" + str(recall) + "\n")
f.write("F1：" + str(F1) + "\n")

KeyError: '118'

In [None]:
# input_ids = tokenizer(t, return_tensors='pt').input_ids.to(DEVICE)
# outputs = t5_model.generate(input_ids)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# result = tokenizer.decode(outputs[0], skip_special_tokens=True)
# result.split(" - ")

In [None]:
# print(val(t5_model,tokenizer,DEVICE,"./semeval/semeval_val.txt"))

In [None]:
# acc_rel,acc_entity,acc_all = val(t5_model,tokenizer,DEVICE,"./LectureBank/val.0.csv")

In [None]:
# acc_rel

In [None]:
# acc_all