### 必要なライブラリだけ選択してimport

In [1]:
import torch
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

### これ以降にコードを書く

In [3]:
# Build a 3x3x3 integer tensor holding 0..26 to experiment with indexing.
a = torch.arange(27).view(3, 3, 3)
a

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]])

In [4]:
# List index on dim 1: keep rows 1 and 2 of every 3x3 slice.
a[:, [1, 2], :]

tensor([[[ 3,  4,  5],
         [ 6,  7,  8]],

        [[12, 13, 14],
         [15, 16, 17]],

        [[21, 22, 23],
         [24, 25, 26]]])

In [7]:
# Paired index lists: selects a[0, 1], a[1, 2] and a[2, 0], one row per pair.
a[[0, 1, 2], [1, 2, 0], :]

tensor([[ 3,  4,  5],
        [15, 16, 17],
        [18, 19, 20]])

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split


"""
Data Split
"""
def split2ttv(df):
    """Split ``df`` into train / test / validation frames, keeping sentences whole.

    Rows are grouped by ``sentenceID`` and the group keys (not the individual
    rows) are divided with ``train_test_split``: 20% of the keys are held out,
    and that held-out part is then halved into test and validation keys. Both
    splits use ``random_state=0``. All rows sharing a ``sentenceID`` therefore
    end up in the same partition.

    Args:
        df: DataFrame with a ``sentenceID`` column.

    Returns:
        ``(train_df, test_df, valid_df)``; each is the concatenation of the
        selected groups, in the order their keys were drawn.

    Side effects:
        Prints the number of rows in each partition.
    """
    by_sentence = df.groupby('sentenceID')
    sentence_ids = list(by_sentence.groups.keys())

    # Hold out 20% of the sentence ids, then cut the held-out part in half.
    train_ids, held_out_ids = train_test_split(sentence_ids, test_size=0.2, random_state=0)
    test_ids, valid_ids = train_test_split(held_out_ids, test_size=0.5, random_state=0)

    def collect(ids):
        # Rebuild a frame from the rows of every selected sentence group.
        return pd.concat([by_sentence.get_group(sid) for sid in ids])

    train_df = collect(train_ids)
    test_df = collect(test_ids)
    valid_df = collect(valid_ids)

    print(f"Data num. Train:{len(train_df)}, Test:{len(test_df)}, Valid:{len(valid_df)}\n")
    return train_df, test_df, valid_df


"""
Dataset
"""
class BertTokenizer():
    """Turn an already-tokenized sentence into BERT input ids and an attention mask.

    The sentence is expected to be pre-split into tokens separated by single
    spaces. This class only prepends ``[CLS]``, appends ``[PAD]`` tokens and
    maps everything to ids with the wrapped tokenizer.
    """

    def __init__(self, pretrained_model):
        """Load the tokenizer for ``pretrained_model`` with ``AutoTokenizer.from_pretrained``."""
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

    def tokenize(self, wakati, padding_num, max_token):
        """Convert a space-separated sentence into ids and an attention mask.

        Args:
            wakati: Sentence whose tokens are separated by a single space.
            padding_num: Number of ``[PAD]`` tokens appended after the sentence.
            max_token: Length limit. The sentence may contain at most
                ``max_token - 1`` tokens, so that together with the prepended
                ``[CLS]`` the unpadded input has at most ``max_token`` positions.

        Returns:
            ``(ids, attention_mask)``: two numpy arrays of length
            ``1 + token_num + padding_num``. The mask is 1 for ``[CLS]`` and the
            sentence tokens and 0 for the padding positions.

        Raises:
            AssertionError: If the sentence has more than ``max_token - 1`` tokens.
        """
        # Split once and reuse the tokens for both the length check and the id lookup.
        tokens = wakati.split(' ')
        token_num = len(tokens)

        # The message is a real string derived from max_token. The previous
        # `print(...)` message evaluated to None and hard-coded "255".
        assert token_num <= max_token - 1, (
            f'Token num should be at most {max_token - 1}, but got {token_num}.'
        )

        # Convert to ids: [CLS] + sentence tokens + padding.
        ids = np.array(
            self.tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[PAD]'] * padding_num)
        )

        # Build the mask: attend to [CLS] and the real tokens, ignore the padding.
        attention_mask = np.array([1] * (1 + token_num) + [0] * padding_num)

        return ids, attention_mask


def span2seq(spans, token_len, padding_num, lab2id):
    """Expand labelled spans into a per-token sequence of label ids.

    Every token starts as ``'N'``. Each ``(start, end, label)`` span then
    overwrites positions ``start..end`` (inclusive) with its label; later spans
    win over earlier ones. Finally ``padding_num`` ``'PAD'`` labels are appended
    and all labels are mapped through ``lab2id``.

    Args:
        spans: Iterable of ``(start, end, label)``; start/end are passed
            through ``int()``.
        token_len: Number of tokens in the sentence.
        padding_num: Number of ``'PAD'`` labels to append.
        lab2id: Mapping from label string to integer id. It must contain
            ``'N'`` whenever ``token_len > 0``, ``'PAD'`` whenever
            ``padding_num > 0``, and every span label.

    Returns:
        List of label ids.
    """
    labels = ['N' for _ in range(token_len)]

    for start, end, role in spans:
        first, last = int(start), int(end)
        width = last - first + 1
        labels[first:last + 1] = [role] * width

    labels.extend(['PAD'] * padding_num)
    return [lab2id[tag] for tag in labels]


def mk_patial_labels(token_num, pred, orders, lab2id, longest_token_num):
    """Build the partially-labelled sequence for every labelling iteration.

    The first sequence marks only the predicate span (label ``'V'``, taken from
    ``pred['word_start']`` / ``pred['word_end']``). Each following sequence
    additionally contains one more span from ``orders``, in the given order.
    Sequences are padded to ``longest_token_num`` by ``span2seq``.

    Args:
        token_num: Number of tokens in the sentence.
        pred: Mapping with ``'word_start'`` and ``'word_end'`` of the predicate.
        orders: Sequence of ``(start, end, label)`` argument spans.
        lab2id: Mapping from label string to integer id.
        longest_token_num: Length every sequence is padded to.

    Returns:
        List of ``len(orders) + 1`` label-id sequences.
    """
    pad = longest_token_num - token_num

    # Iteration 0: only the predicate is known.
    revealed = [(pred['word_start'], pred['word_end'], 'V')]
    snapshots = [span2seq(revealed, token_num, pad, lab2id)]

    # Each later iteration reveals one more argument span.
    for arg_span in orders:
        revealed = revealed + [arg_span]
        snapshots.append(span2seq(revealed, token_num, pad, lab2id))

    return snapshots


def mk_target_labels(orders, lab2id, max_token):
    """Build the gold start / end / role targets for every labelling iteration.

    One target is produced per span in ``orders`` plus a final "Null" target.

    Args:
        orders: Iterable of ``(start, end, label)`` argument spans.
        lab2id: Mapping from label string to integer id.
        max_token: Maximum input length; ``max_token - 1`` is used as the start
            target of the final Null step.

    Returns:
        ``(target_start_label, target_end_label, target_srl_label)``: three
        lists of length ``len(orders) + 1``.
    """
    triples = list(orders)
    start_targets = [start for start, _end, _role in triples]
    end_targets = [end for _start, end, _role in triples]
    role_targets = [lab2id[role] for _start, _end, role in triples]

    # The Null position is defined as the last position of the output.
    start_targets.append(max_token - 1)
    # Dummies for the Null step; -1 was chosen in the hope that it raises an
    # error if it is ever used by mistake (the original author was unsure).
    end_targets.append(-1)
    role_targets.append(-1)

    return start_targets, end_targets, role_targets


def mk_arg_indication(orders, lab2id):
    """Return a 0/1 vector flagging which argument roles occur in ``orders``.

    The vector has ``len(lab2id) - 3`` entries and position ``lab2id[role]`` is
    set to 1 for every role appearing in ``orders``.

    Args:
        orders: Iterable of ``(start, end, role)`` spans; only ``role`` is used.
        lab2id: Mapping from label string to integer id.

    Returns:
        List of ints (0 or 1).
    """
    # NOTE(review): the "- 3" presumably excludes the 'N', 'V' and 'PAD' labels and
    # relies on every argument role having an id below len(lab2id) - 3 — confirm
    # against how lab2id is built.
    role_flags = [0] * (len(lab2id) - 3)

    for _start, _end, role in orders:
        role_flags[lab2id[role]] = 1

    return role_flags


def mapping(set, bert_tokenizer, max_token, lab2id):
    """Convert one mini-batch of rows into model input and target tensors.

    Every sentence is padded to the longest sentence of the batch. All rows of
    the batch must carry the same number of argument spans in
    ``label_order_to_give``; otherwise the per-iteration lists are ragged and
    tensor construction raises ``ValueError``.

    Args:
        set: DataFrame holding one mini-batch with the columns ``sentence``,
            ``predicate``, ``num_of_tokens`` and ``label_order_to_give``.
            (The name shadows the builtin but is kept so existing callers
            continue to work.)
        bert_tokenizer: Object whose ``tokenize(sentence, padding_num, max_token)``
            returns ``(ids, attention_mask)``.
        max_token: Maximum input length, forwarded to the tokenizer and to
            ``mk_target_labels``.
        lab2id: Mapping from label string to integer id.

    Returns:
        ``[sentences, attention_masks, arg_indications, partial_srl_labels,
        target_start_labels, target_end_labels, target_srl_labels]``.
        ``partial_srl_labels`` and the three target tensors are permuted so the
        labelling iteration is the leading dimension.
    """
    batch = set  # local alias so the body does not read like a call to the builtin
    longest_token_num = max(batch['num_of_tokens'])

    # Model inputs.
    sentences, attention_masks = [], []
    partial_srl_labels, arg_indications = [], []
    # Gold labels.
    target_start_labels, target_end_labels, target_srl_labels = [], [], []

    rows = zip(batch['sentence'], batch['predicate'],
               batch['num_of_tokens'], batch['label_order_to_give'])
    for sent, pred, token_num, orders in rows:
        ids, attention_mask = bert_tokenizer.tokenize(sent, longest_token_num - token_num, max_token)
        sentences.append(ids)
        attention_masks.append(attention_mask)
        partial_srl_labels.append(mk_patial_labels(token_num, pred, orders, lab2id, longest_token_num))
        arg_indications.append(mk_arg_indication(orders, lab2id))

        target_SL, target_EL, target_SRLL = mk_target_labels(orders, lab2id, max_token)
        target_start_labels.append(target_SL)
        target_end_labels.append(target_EL)
        target_srl_labels.append(target_SRLL)

    # Stack the per-sentence numpy arrays first: building a tensor directly from
    # a Python list of ndarrays is very slow and makes PyTorch emit a UserWarning.
    sentences = torch.tensor(np.stack(sentences), dtype=torch.long)
    attention_masks = torch.tensor(np.stack(attention_masks), dtype=torch.long)

    # Rearrange so that the iteration dimension comes first.
    partial_srl_labels = torch.tensor(partial_srl_labels, dtype=torch.long).permute(1, 0, 2)
    target_start_labels = torch.tensor(target_start_labels, dtype=torch.long).permute(1, 0)
    target_end_labels = torch.tensor(target_end_labels, dtype=torch.long).permute(1, 0)
    target_srl_labels = torch.tensor(target_srl_labels, dtype=torch.long).permute(1, 0)
    arg_indications = torch.tensor(arg_indications)

    return [sentences, attention_masks, arg_indications, partial_srl_labels,
            target_start_labels, target_end_labels, target_srl_labels]


def mk_dataset(df, batch_size, pretrained_model, max_token, lab2id):
    """Turn a DataFrame of predicate instances into a list of tensor mini-batches.

    Rows are grouped by their number of arguments so that every mini-batch
    contains rows with the same number of labelling iterations; each group is
    then cut into chunks of ``batch_size`` rows and converted with ``mapping``.

    The input frame is not modified: the helper columns ``num_of_tokens``,
    ``labels_per_a_pred`` and ``label_order_to_give`` are added to a derived
    frame, which is available through the returned ``groups`` object. This also
    avoids pandas' SettingWithCopyWarning when ``df`` is a slice of another frame.

    Args:
        df: DataFrame with a ``sentence`` column (space-separated tokens), an
            ``args`` column (list of dicts with ``word_start``, ``word_end``,
            ``argrole``) and whatever columns ``mapping`` reads (``predicate``).
        batch_size: Maximum number of rows per mini-batch; must be >= 1.
        pretrained_model: Model name handed to ``BertTokenizer``.
        max_token: Maximum input length, forwarded to ``mapping``.
        lab2id: Mapping from label string to integer id.

    Returns:
        ``(batch_set, groups)``: the list of mini-batches produced by
        ``mapping`` and the groupby object over ``labels_per_a_pred``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero step would make range() fail later and a negative one would
        # silently produce an empty dataset, so reject both up front.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    bert_tokenizer = BertTokenizer(pretrained_model)

    def random_label_order(args):
        # Shuffle the argument spans by sorting on an independent random key per span.
        spans = [(e['word_start'], e['word_end'], e['argrole']) for e in args]
        return sorted(spans, key=lambda _span: np.random.random())

    # Group by the number of semantic roles; work on a derived frame so the
    # caller's DataFrame is left untouched.
    prepared = df.assign(
        num_of_tokens=df['sentence'].map(lambda sentence: len(sentence.split(' '))),
        labels_per_a_pred=df['args'].map(len),
        label_order_to_give=df['args'].map(random_label_order),
    ).sort_values(by='labels_per_a_pred')
    groups = prepared.groupby('labels_per_a_pred')

    # Cut every group into mini-batches and collect them in group order.
    batch_frames = []
    for _label_count, group_df in groups:
        batch_frames += [
            group_df.iloc[start_idx:start_idx + batch_size]
            for start_idx in range(0, len(group_df), batch_size)
        ]

    batch_set = [mapping(frame, bert_tokenizer, max_token, lab2id) for frame in batch_frames]
    return batch_set, groups

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [9]:
DATAPATH = 'Data/common_data_v2_bert.json'
PRETRAINED_MODEL = "cl-tohoku/bert-base-japanese-v2"
BATCH_SIZE = 1    # mini-batch size handed to mk_dataset
MAX_TOKEN = 256   # max_token handed to mk_dataset

with open(DATAPATH, 'r', encoding="utf-8_sig") as json_file:
    df = pd.read_json(json_file)

# Build the label dictionary: every argument role found in the data plus the
# three extra labels 'N', 'V' and 'PAD', numbered in sorted order.
arg_list = [arg['argrole'] for args in df['args'] for arg in args]
labels = sorted(set(arg_list) | {'N', 'V', 'PAD'})
lab2id = {label: idx for idx, label in enumerate(labels)}
id2lab = {idx: label for label, idx in lab2id.items()}

# Create the train / test / validation frames.
# split2ttv already prints the size of each split, so it is not printed again here.
train_df, test_df, valid_df = split2ttv(df)

# Data reduction for quick tests goes here (commented out for the real experiment).

# Dataset
valid_dataset, _ = mk_dataset(valid_df, BATCH_SIZE, PRETRAINED_MODEL, MAX_TOKEN, lab2id)

Data num. Train:41938, Test:5331, Valid:5259

Data num. Train:41938, Test:5331, Valid:5259



In [11]:
# Peek at the first three rows of the test split.
test_df.head(3)

Unnamed: 0,sentenceID,abs_id,sentence,pos_sentece,predicate,args
23408,64_essay_manabi50_Aoki-2009winter,588745,将来 、 私 たち の 研究 が 着 ##床 を 改善 する 薬 、 毛 ##根 の 働き ...,,"{'frameID': 356, 'verb': '', 'char_start': 25,...","[{'argrole': 'Arg1', 'semrole': '対象', 'char_st..."
23409,64_essay_manabi50_Aoki-2009winter,588746,将来 、 私 たち の 研究 が 着 ##床 を 改善 する 薬 、 毛 ##根 の 働き ...,,"{'frameID': 483, 'verb': '', 'char_start': 48,...","[{'argrole': 'Arg1', 'semrole': '対象（動作）', 'cha..."
23410,64_essay_manabi50_Aoki-2009winter,588747,将来 、 私 たち の 研究 が 着 ##床 を 改善 する 薬 、 毛 ##根 の 働き ...,,"{'frameID': 613, 'verb': '', 'char_start': 76,...","[{'argrole': 'Arg2', 'semrole': '原因', 'char_st..."


In [21]:
# Map every sentenceID in the test split to the abs_id values of its rows.
# One groupby is enough: the earlier expressions were evaluated and thrown away,
# and the hard-coded get_group key would raise KeyError if that sentence were
# not part of the test split.
test_df.groupby('sentenceID')['abs_id'].apply(list).to_dict()

{'100_aozora_Kajii-1925': [2050058],
 '100_diet_kaigiroku-16': [815193, 815194],
 '100_news_KAHOKU_65': [473946],
 '101_news_KAHOKU_35': [1852094, 1852095],
 '101_ted_talk_5': [842038],
 '102_aozora_Doyle-1905': [297182, 297183, 297184, 297185, 297186],
 '102_aozora_Hayashida-2015': [858939],
 '102_dict_pth_s': [546468],
 '102_fiction_kotobawokawasenaikimini_arigatoo': [1447758],
 '102_news_KAHOKU_34': [469023, 469024, 469025],
 '102_ted_talk_5': [842063, 842064],
 '103_aozora_Hayashida-2015': [858991],
 '103_dict_pth_d': [422345],
 '103_diet_kaigiroku-16': [815247, 815248],
 '103_news_KAHOKU_46': [899676],
 '103_news_KAHOKU_55': [905260, 905261],
 '103_ted_talk_5': [842117, 842120],
 '103_ted_talk_8': [1491618, 1491619],
 '103_whitepaper_H26_kodomowakamono1': [871627],
 '104_aozora_Edogawa-1929': [1031337, 1031338, 1031339],
 '104_dict_pth_q': [1915889],
 '104_news_KAHOKU_36': [1566791, 1566792, 1566793, 1566794],
 '104_news_KAHOKU_69': [810961, 810962],
 '104_wikipedia_KYOTO_9': [632

In [34]:
# 3x4 float tensor holding 0..11.
a = torch.arange(12, dtype=torch.float32).view(3, 4)
a

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])

In [36]:
# Ragged nested lists cannot be turned into a tensor. The error is the point of
# this cell, so catch and show it instead of aborting "Restart & Run All".
try:
    torch.tensor([[1, 2, 3], [1, 2]])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: expected sequence of length 3 at dim 1 (got 2)

In [37]:
# Draw 100 random integers from [0, 100) (upper bound exclusive).
np.random.randint(low=0, high=100, size=100)

array([45,  6, 59, 94, 92,  4, 18, 36, 48, 65, 87, 16, 93, 19, 66, 64, 13,
       72, 92, 11, 47, 24, 29, 99, 36, 18, 87, 31,  6, 72, 55, 93, 45, 41,
       63, 68, 57, 98, 70, 75, 27, 47, 92, 62, 16, 22, 70,  1,  7, 48, 60,
       87, 90, 30, 85,  0, 78, 87, 30, 34, 60, 67, 74, 79, 69, 67, 91, 95,
       86, 47, 75, 75, 32, 31, 65, 36, 93, 62, 46, 92, 33, 98, 84, 34,  6,
       54,  5, 82, 94, 72,  0, 55, 96, 99, 52, 11, 17, 16, 71, 31])

In [8]:
# Unpacking a 3-element row into two names fails. The error is the point of
# this cell, so catch and show it instead of aborting "Restart & Run All".
try:
    for a, b in [[1, 2, 3]]:
        print(a, b)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: too many values to unpack (expected 2)