In [62]:
from utils import load,save
import pandas as pd
import numpy as np
from utils import MDFeat
from typing import List
from transformers import pipeline,AutoModelForTokenClassification,AutoTokenizer
from tqdm import tqdm
from dataclasses import dataclass
import random
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F

### CLoss

In [56]:

class SupConLoss(nn.Module):
    def __init__(self, temperature=0.07, contrast_mode='all', base_temperature=0.07, device=torch.device('cpu')):
        """
        :param temperature:  t
        :param contrast_mode:
        :param base_temperature:
        """
        super(SupConLoss, self).__init__()
        self.device = device
        self.temperature = temperature
        self.contrast_mode = contrast_mode
        self.base_temperature = base_temperature

    def forward(self, features, labels=None, mask=None):
        """
        both `labels` and `mask` are None, it degenerates to SimCLR unsupervised loss
        :param features:  B^alpha 中的2个augment sentence拼接而成,[Batch_size,2*S_L,d_model] 三维
        :return:
        """
        batch_size = features.shape[0]
        contrast_count = features.shape[1]
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)  # (Batch_size+2*S_L,)

        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32).to(self.device)
        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            if labels.shape[0] != batch_size:
                raise ValueError('Num of labels does not match num of features')
            mask = torch.eq(labels, labels.T).float().to(self.device)
        else:
            print(mask)
            mask = mask.float().to(self.device)

        anchor_dot_contrast = torch.div(
            torch.matmul(anchor_feature, contrast_feature.T),
            self.temperature)

        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count)
        # mask-out self-contrast cases
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count).view(-1, 1).to(self.device),
            0
        )
        mask = mask * logits_mask

        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positive
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)

        # loss
        loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
        # loss = - mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()

        return loss


In [77]:
CLoss = SupConLoss(temperature=1,contrast_mode='one',base_temperature=1)

In [78]:
x = torch.FloatTensor(torch.rand([3,10])).reshape(1,-1)
y = torch.FloatTensor(torch.rand([3,10])).reshape(1,-1)
# [Batch_size,2*S_L,d_model]
feat1 = F.normalize(x, dim=1)  # head embed_size->128 线性层
feat2 = F.normalize(y, dim=1)
features = torch.cat([feat1.unsqueeze(1), feat2.unsqueeze(1)], dim=1)

In [79]:
CLoss(features)

tensor(-1.4901e-08)

In [110]:
x = torch.FloatTensor(torch.rand([4,10]))
y = torch.FloatTensor(torch.rand([4,10]))
 
similarity = torch.cosine_similarity(x, x, dim=1)
 
loss = 1 - similarity


In [112]:
1-loss

tensor([1., 1., 1., 1.])

### MLM

In [47]:
data = load("/home/tywang/MD/data/VUA/vua_train.pkl")

In [16]:
len(data[88].addinfo1)

9

### multi-fold 数据

In [6]:
from dataclasses import dataclass
import pickle 
def load(path_name: object) -> object:
    with open(path_name, 'rb') as file:
        return pickle.load(file)
@dataclass
class MDFeat:
    sentence: str
    verb_idx: int
    verb:str
    label: int


In [7]:
vua_pos = load("/home/tywang/MD/data/VUA_ALL_POS/vua_pos_train.pkl")

In [8]:
vua_pos

AttributeError: 'MDFeat' object has no attribute 'addinfo1'

In [14]:
data:List[MDFeat] = load("/home/tywang/MD/data/VUA/vua_train.pkl")

In [15]:
data[0]

MDFeat(sentence="Ca n't fail to be entertaining .", verb_idx=2, verb='fail', label=0, addinfo1=["Ca n't fails to be entertaining .", "Ca n't failure to be entertaining .", "Ca n't failed to be entertaining .", "Ca n't succeed to be entertaining .", "Ca n't failing to be entertaining .", "Ca n't try to be entertaining .", "Ca n't attempt to be entertaining .", "Ca n't have to be entertaining .", "Ca n't cease to be entertaining ."], addinfo2=None)

In [11]:
def get_mask_sentence(unmasker,data:MDFeat):
    unmasked_texts = []
    sentence_template = data.sentence.split()
    ori_token = sentence_template[data.verb_idx]
    sentence_template[data.verb_idx] = '{}'
    sentence_template = " ".join(sentence_template)
    input_sentence = sentence_template + " " + data.sentence
    input_sentence = input_sentence.format("[MASK]")
    unmask_result = unmasker(input_sentence)
    for result in unmask_result:
        if result['token_str'].find():
            unmasked_texts.append(sentence_template.format("<VERB> "+result['token_str']+" </VERB>"))
    return unmasked_texts

In [17]:
data[0]

MDFeat(sentence="Ca n't fail to be entertaining .", verb_idx=2, verb='fail', label=0, addinfo1=["Ca n't <VERB> fails </VERB> to be entertaining .", "Ca n't <VERB> failure </VERB> to be entertaining .", "Ca n't <VERB> failed </VERB> to be entertaining .", "Ca n't <VERB> succeed </VERB> to be entertaining .", "Ca n't <VERB> failing </VERB> to be entertaining .", "Ca n't <VERB> try </VERB> to be entertaining .", "Ca n't <VERB> attempt </VERB> to be entertaining .", "Ca n't <VERB> have </VERB> to be entertaining .", "Ca n't <VERB> cease </VERB> to be entertaining ."], addinfo2=None)

In [116]:
def get_normal_mask(unmasker,data:MDFeat):
    normal_texts = []
    splitted_sentence = data.sentence.split()
    # 随机选择一个位置
    pausible_index = [i for i in range(len(splitted_sentence)) if i!=data.verb_idx]
    random.shuffle(pausible_index)
    for normal_index in pausible_index[:3]:
        # 随机替换该位置为mask
        count = 0
        splitted_sentence = data.sentence.split()
        ori_word = splitted_sentence[normal_index]
        splitted_sentence[normal_index] = ' {} '
        sentence_template = ' '.join(splitted_sentence)
        # print(sentence_template)
        masked_sentence = sentence_template.format("[MASK]")
        # 生成mask token
        unmask_result = unmasker(masked_sentence)
        for result in unmask_result:
            if count==2:break
            if result['token_str']!=ori_word.strip().lower():
                # print(result['token_str'])
                normal_texts.append(sentence_template.format(result['token_str']))
                count+=1
    final_sentence = []
    for t in normal_texts:
        sentence_split = t.split()
        sentence_split[data.verb_idx] = " <VERB> {} </VERB> ".format(sentence_split[data.verb_idx])
        t = " ".join(sentence_split)
        final_sentence.append(t)
    return final_sentence

In [19]:
unmasker = pipeline('fill-mask', model='/data/transformers/bert-base-uncased',device = 1,top_k =10)

Some weights of the model checkpoint at /data/transformers/bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
for data_i in tqdm(data):
    data_i.addinfo2 = get_normal_mask(unmasker,data_i)

100%|██████████| 15516/15516 [11:38<00:00, 22.22it/s]


In [124]:
index =[]
for id,data_i in enumerate(data):
    if len(data_i.addinfo2)==0:
        index.append(id)

In [117]:
for id in index:
    data[id].addinfo2 = get_normal_mask(unmasker,data[id])



In [125]:
index

[]

In [123]:
for id in index:
    data[id].addinfo2 = " <VERB> {} </VERB> ".format(data[id].sentence)

In [127]:
save(data,"/home/tywang/MD/data/VUA/vua_train_mlm.pkl")

save to(保存到)  /home/tywang/MD/data/VUA/vua_train_mlm.pkl


In [17]:
for i in tqdm(range(len(data))):
    data[i].addinfo1 = get_mask_sentence(unmasker,data[i])

100%|██████████| 15516/15516 [03:55<00:00, 65.78it/s]


In [126]:
save(data,"/home/tywang/MD/data/VUA/vua_train.pkl")

save to(保存到)  /home/tywang/MD/data/VUA/vua_train.pkl


In [20]:
data[0]

MDFeat(sentence="Ca n't fail to be entertaining .", verb_idx=2, verb='fail', label=0, addinfo1=["Ca n't <VERB> fails </VERB> to be entertaining .", "Ca n't <VERB> failure </VERB> to be entertaining .", "Ca n't <VERB> failed </VERB> to be entertaining .", "Ca n't <VERB> succeed </VERB> to be entertaining .", "Ca n't <VERB> failing </VERB> to be entertaining .", "Ca n't <VERB> try </VERB> to be entertaining .", "Ca n't <VERB> attempt </VERB> to be entertaining .", "Ca n't <VERB> have </VERB> to be entertaining .", "Ca n't <VERB> cease </VERB> to be entertaining ."], addinfo2=None)

In [13]:
def get_mask_sentence(unmasker,text_arr:list):
    """
        输入一个句子arr, 得到它的预测token以及预测的score
    """
    unmask_result = unmasker(text_arr)
    return_results = []
    for result in unmask_result:
        return_result = []
        for idx,token in enumerate(result):
            return_result.append((token['score'],token['token_str']))
        return_results.append(return_result)
    return return_results
    

In [14]:
get_mask_sentence(unmasker,["Hello I'm a [MASK] model.","Hello I'm a [MASK] model."])

[[(0.10721077024936676, 'fashion'),
  (0.08796188235282898, 'role'),
  (0.05333735793828964, 'new'),
  (0.046707648783922195, 'super'),
  (0.027107620611786842, 'fine'),
  (0.02393934689462185, 'good'),
  (0.021875906735658646, 'model'),
  (0.0213792622089386, 'great'),
  (0.01942354626953602, 'business'),
  (0.016654005274176598, 'fitness')],
 [(0.10721077024936676, 'fashion'),
  (0.08796188235282898, 'role'),
  (0.05333735793828964, 'new'),
  (0.046707648783922195, 'super'),
  (0.027107620611786842, 'fine'),
  (0.02393934689462185, 'good'),
  (0.021875906735658646, 'model'),
  (0.0213792622089386, 'great'),
  (0.01942354626953602, 'business'),
  (0.016654005274176598, 'fitness')]]

### read 10 fold

In [23]:
df = pd.read_csv("/home/tywang/MD/data/MOH-X/MOH-X_formatted_svo_cleaned.csv", encoding = "ISO-8859-1",low_memory=False)
df.head()

Unnamed: 0,arg1,arg2,verb,sentence,verb_idx,label
0,knowledge,,absorb,He absorbed the knowledge or beliefs of his t...,1,1
1,cost,,absorb,He absorbed the costs for the accident .,1,1
2,tax,,absorb,The sales tax is absorbed into the state inco...,4,1
3,immigrant,,absorb,The immigrants were quickly absorbed into soc...,4,1
4,interest,,absorb,Her interest in butterflies absorbs her compl...,4,1


In [24]:
sentence = df['sentence'].values
verb_idx = df['verb_idx'].values
verb = df['verb'].values
label = df['label'].values
senetnce = [item.strip() for item in sentence]

In [25]:
mohx = list()
for text, verb_id, v, gt in zip(senetnce,verb_idx,verb,label):
    feat = MDFeat(
        sentence=text,
        verb_idx = verb_id,
        verb = v,
        label = gt,
        addinfo1=None,
        addinfo2=None
    )
    mohx.append(feat)

In [26]:
print(len(mohx ))

647


In [27]:

n_ten = int(0.1*len(mohx))
for i in range(10):
    select_range = [i*n_ten,(1+i)*n_ten]
    mohx_1 = mohx[select_range[0]:select_range[1]]
    mohx_9 = mohx[:select_range[0]]+mohx[select_range[1]:]
    print(select_range)
    print(len(mohx_1))
    print(len(mohx_9))
    print(len(mohx_1)+len(mohx_9))
    save(mohx_1,"/home/tywang/MD/data/MOH-X/mohx{}_val.pkl".format(i))
    save(mohx_9,"/home/tywang/MD/data/MOH-X/mohx{}_train.pkl".format(i))

[0, 64]
64
583
647
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx0_val.pkl
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx0_train.pkl
[64, 128]
64
583
647
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx1_val.pkl
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx1_train.pkl
[128, 192]
64
583
647
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx2_val.pkl
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx2_train.pkl
[192, 256]
64
583
647
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx3_val.pkl
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx3_train.pkl
[256, 320]
64
583
647
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx4_val.pkl
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx4_train.pkl
[320, 384]
64
583
647
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx5_val.pkl
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx5_train.pkl
[384, 448]
64
583
647
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx6_val.pkl
save to(保存到)  /home/tywang/MD/data/MOH-X/mohx6_train.pkl
[448, 512]
64
583
647
save to(保存到)  /home/tywang/MD/data/MOH-X/moh

### read scv

In [26]:
mode = "dev"
df = pd.read_csv("/home/tywang/MD/data/VUA_ALL_POS/VUA_ALL_POS_{}.csv".format(mode), encoding = "ISO-8859-1",low_memory=False)

In [27]:
df.head()

Unnamed: 0,text_idx,sentence_idx,word,sentence,word_idx,label,pos,genre
0,acj-fragment01,148,Four,Four alternative approaches have been describe...,0,0,NUM,academic
1,acj-fragment01,148,alternative,Four alternative approaches have been describe...,1,0,ADJ,academic
2,acj-fragment01,148,approaches,Four alternative approaches have been describe...,2,1,NOUN,academic
3,acj-fragment01,148,have,Four alternative approaches have been describe...,3,0,VERB,academic
4,acj-fragment01,148,been,Four alternative approaches have been describe...,4,0,VERB,academic


In [28]:
sentence = df['sentence'].values
verb_idx = df['word_idx'].values
verb = df['word'].values
label = df['label'].values
senetnce = [item.strip() for item in sentence]

In [29]:
mohx = list()
for text, verb_id, v, gt in zip(senetnce,verb_idx,verb,label):
    feat = MDFeat(
        sentence=text,
        verb_idx = verb_id,
        verb = v,
        label = gt,
        addinfo1=None,
        addinfo2=None
    )
    mohx.append(feat)

In [30]:
mohx[-5:],len(mohx)

([MDFeat(sentence='The section describing the operation of the system should cover not only how the system operates but also how the user can operate it .', verb_idx=20, verb='user', label=0, addinfo1=None, addinfo2=None),
  MDFeat(sentence='The section describing the operation of the system should cover not only how the system operates but also how the user can operate it .', verb_idx=21, verb='can', label=0, addinfo1=None, addinfo2=None),
  MDFeat(sentence='The section describing the operation of the system should cover not only how the system operates but also how the user can operate it .', verb_idx=22, verb='operate', label=0, addinfo1=None, addinfo2=None),
  MDFeat(sentence='The section describing the operation of the system should cover not only how the system operates but also how the user can operate it .', verb_idx=23, verb='it', label=0, addinfo1=None, addinfo2=None),
  MDFeat(sentence='The section describing the operation of the system should cover not only how the system o

In [31]:
len(mohx)

38628

In [32]:
save(mohx,"/home/tywang/MD/data/VUA_ALL_POS/vua_pos_{}.pkl".format(mode))

save to(保存到)  /home/tywang/MD/data/VUA_ALL_POS/vua_pos_dev.pkl
