In [2]:
import ast

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the sentence tables, key them by sentence id, and start every row with
# has_cantonese=False; the flag is switched on later for annotated sentences.
train_df = (
    pd.read_json('data/training-sentences.json', orient='records')
    .set_index('id')
    .assign(has_cantonese=False)
)
test_df = (
    pd.read_json('data/testing-data.json', orient='records')
    .set_index('id')
    .assign(has_cantonese=False)
)

In [4]:
# Annotation tables for the training and testing sentences, read with the same
# record-oriented JSON layout as the sentence files.
train_labels, test_labels = (
    pd.read_json(path, orient='records')
    for path in ('data/training-corrections.json', 'data/testing-gold.json')
)

In [5]:
# Load the pretrained 'bert-base-chinese' BertTokenizer. It is used by the
# tagging cells below to split each sentence into the tokens that get labelled.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [6]:
def _span_labels(n_tokens, spans):
    """Build the per-token tag list for one sentence.

    Parameters
    ----------
    n_tokens : int
        Number of tokens in the sentence.
    spans : iterable of (start, length)
        0-based start offset and length of each span to mark.

    Returns
    -------
    list of str
        Exactly ``n_tokens`` entries, '1' inside a span and '0' elsewhere.

    Spans are clipped to ``[0, n_tokens)``. The previous slice assignment
    (``labels[a:b] = ['1'] * n``) silently lengthened the list when a span ran
    past the end, and inserted an element when the start was negative, leaving
    labels misaligned with the tokens.
    """
    labels = ['0'] * n_tokens
    for start, length in spans:
        for i in range(max(start, 0), min(start + length, n_tokens)):
            labels[i] = '1'
    return labels


def _tag_sentences(sentences_df, gold_df):
    """Write 'tokens', 'labels' and 'has_cantonese' into ``sentences_df`` in place.

    For every row of ``gold_df`` the matching sentence (looked up by 'id') is
    tokenized with the notebook-level ``tokenizer``. Each entry of the row's
    'cantonese' list supplies a 'position' (the code subtracts 1, i.e. it is
    treated as 1-based) and an optional 'length' (defaults to 1 when absent).
    Tokens and labels are stored as ``str(list)`` because the dataset-building
    cell parses them back from strings.

    NOTE(review): positions are applied as token offsets. This presumably
    relies on the tokenizer emitting one token per character; any sentence
    where it does not would be mis-tagged -- confirm against the data.
    """
    for _, row in gold_df.iterrows():
        sentence_id = row['id']
        corrections = row['cantonese']
        tokens = tokenizer.tokenize(sentences_df.at[sentence_id, 'sentence'])
        sentences_df.at[sentence_id, 'tokens'] = str(tokens)
        if corrections != []:
            sentences_df.at[sentence_id, 'has_cantonese'] = True
        spans = [(c['position'] - 1, c.get('length', 1)) for c in corrections]
        sentences_df.at[sentence_id, 'labels'] = str(_span_labels(len(tokens), spans))


# Same tagging rule for both splits (the two loops used to be copy-pasted and
# disagreed on how a missing 'length' was handled).
_tag_sentences(train_df, train_labels)
_tag_sentences(test_df, test_labels)

In [7]:
# Display the training frame to inspect the tokens / labels columns written by the tagging cell.
train_df

Unnamed: 0_level_0,sentence,has_cantonese,tokens,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ASTRI0000,仍記得小學下課的時候，我總愛跟表弟到草推裏捉蠶蟲，每當打開葉子時看到蟲子，心中總有無限的喜悅。,False,"['仍', '記', '得', '小', '學', '下', '課', '的', '時', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ASTRI0001,我獨自站在火山的腳下，望著天，靜候著山上的紅川湧流將我掩沒的一刻。,False,"['我', '獨', '自', '站', '在', '火', '山', '的', '腳', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ASTRI0002,來到百生的學校，我並不害怕，反而覺得這學校比之前的學校大，感覺很好。,False,"['來', '到', '百', '生', '的', '學', '校', '，', '我', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ASTRI0003,今天是平安夜，街道上火樹銀花、燈火通明；汽車熙來攘往、如即如梭。,False,"['今', '天', '是', '平', '安', '夜', '，', '街', '道', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ASTRI0004,「借問酒家何處有，牧童遙指杏花村。」然而在現在，這樣的事情畫意又從何而尋呢。,False,"['「', '借', '問', '酒', '家', '何', '處', '有', '，', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
...,...,...,...,...
ASTRI0995,社會上也經常發生性別認知差異，而需要「打官司」的事件。,True,"['社', '會', '上', '也', '經', '常', '發', '生', '性', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ASTRI0996,我的心跳得很快，血管快抵受不了脈搏的衝激，手控制不了地顫抖，喉嚨乾啞了。,False,"['我', '的', '心', '跳', '得', '很', '快', '，', '血', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ASTRI0997,還記得五年班時，天真無知的我初次在學校的音樂課學習吹牧童笛，覺得自己頗有天份，便和幾個同學參...,True,"['還', '記', '得', '五', '年', '班', '時', '，', '天', ...","['0', '0', '0', '0', '0', '1', '0', '0', '0', ..."
ASTRI0998,來到課室，許多同學早已在等候老師，我一來，大家都望著我，我倒不害羞，不荒不亂地慢慢走進去。,False,"['來', '到', '課', '室', '，', '許', '多', '同', '學', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."


In [8]:
import numpy as np


def _mark_span(labels, start, length):
    """Set ``labels[start:start + length]`` to '1' in place, clipped to the list.

    A plain slice assignment would lengthen ``labels`` whenever the span runs
    past the end, so the label list could stop matching the token list; writing
    element by element inside the valid range keeps the length fixed.
    """
    for i in range(max(start, 0), min(start + length, len(labels))):
        labels[i] = '1'


# Tag the additional sentences from the CSV. Each row carries one mandatory
# span (start_1 / length_1) and an optional second span (start_2 / length_2).
# The start values are used as 0-based offsets (no adjustment is applied).
additional_df = pd.read_csv('data/nlptea-additional.csv', index_col=0)
for index, row in additional_df.iterrows():
    tokens = tokenizer.tokenize(row['sentence'])
    additional_df.at[index, 'tokens'] = str(tokens)
    labels = ['0'] * len(tokens)
    # Cast explicitly, as is done for the second span, so the values are plain
    # ints even if pandas hands them back as floats.
    position_1 = int(row['start_1'])
    length_1 = int(row['length_1'])
    _mark_span(labels, position_1, length_1)
    print(f'Span: {tokens[position_1:position_1 + length_1]}')
    position_2 = row['start_2']
    length_2 = row['length_2']
    # The second span is optional: skip it when either field is empty (NaN).
    if not pd.isna(position_2) and not pd.isna(length_2):
        position_2 = int(position_2)
        length_2 = int(length_2)
        _mark_span(labels, position_2, length_2)
        print(f'Span: {tokens[position_2:position_2 + length_2]}')
    additional_df.at[index, 'labels'] = str(labels)
# The span columns are only needed to build the labels; drop them so the frame
# no longer carries them into the concatenation with the other splits.
additional_df.drop(['start_1', 'length_1', 'start_2', 'length_2'], axis=1, inplace=True)

Span: ['點', '解']
Span: ['呢']
Span: ['同']
Span: ['同']
Span: ['得', '滯']
Span: ['係']
Span: ['哋']
Span: ['啲', '乜']
Span: ['喺', '度']
Span: ['嚟']
Span: ['咁']
Span: ['著', '衫']
Span: ['睇']
Span: ['似']
Span: ['[UNK]']
Span: ['令']
Span: ['行']
Span: ['畀']
Span: ['鬧']
Span: ['咁', '串', '嘴']
Span: ['到']
Span: ['成', '日']
Span: ['忍', '唔', '住']
Span: ['呢', '個']
Span: ['住']
Span: ['講']
Span: ['畀', '你']
Span: ['冇']
Span: ['咗']
Span: ['搞']
Span: ['約', '埋']
Span: ['睇']
Span: ['喺']
Span: ['[UNK]']
Span: ['就', '嚟']
Span: ['地', '皮']
Span: ['定', '係']
Span: ['比']
Span: ['住', '啲', '咪', '嘅']
Span: ['識', '得']
Span: ['咁']
Span: ['咗']
Span: ['嗰']
Span: ['呢']
Span: ['勁']
Span: ['話']
Span: ['唔', '使']
Span: ['個']
Span: ['係', '咪']
Span: ['冇']
Span: ['好']
Span: ['靚', '到', '暈']
Span: ['醒', '目']
Span: ['兜', '圈']
Span: ['喺']
Span: ['搵']
Span: ['乘', '搭']
Span: ['冇', '乜', '嘢', '咁']
Span: ['嘅']
Span: ['鬼', '佬']
Span: ['但', '係']
Span: ['好', '彩']
Span: ['而', '家']
Span: ['落', '緊']
Span: ['心', '淡']
Span: ['先', '至']
Span: ['呢']
Spa

In [9]:
# Display the additional sentences after tagging (span columns already dropped).
additional_df

Unnamed: 0_level_0,sentence,has_cantonese,tokens,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ADD0,人類對舒適生活的追求確實推動了社會的進步，但大家曾否有著疑惑：點解我們在努力改善客觀生存環境...,True,"['人', '類', '對', '舒', '適', '生', '活', '的', '追', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ADD1,呢個週末，我們全家去渡假村泡溫泉。,True,"['呢', '個', '週', '末', '，', '我', '們', '全', '家', ...","['1', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ADD2,《春在綠蕪中》共包括了十三篇短篇散文，記述了作者同親戚或朋友的生活，為作者的人生帶來不同的影...,True,"['《', '春', '在', '綠', '蕪', '中', '》', '共', '包', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ADD3,美玲偷懶得滯，結果考試不合格，真係得不嘗失啊！,True,"['美', '玲', '偷', '懶', '得', '滯', '，', '結', '果', ...","['0', '0', '0', '0', '1', '1', '0', '0', '0', ..."
ADD4,通過互聯網，我哋能暸解到世界上發生啲乜大小事情。,True,"['通', '過', '互', '聯', '網', '，', '我', '哋', '能', ...","['0', '0', '0', '0', '0', '0', '0', '1', '0', ..."
...,...,...,...,...
ADD57,死亡是生存的對頭人，人們為了抵抗死亡，頑強地死撐下去；但人們始終敵不過命運的擺弄，乖乖而又無...,True,"['死', '亡', '是', '生', '存', '的', '對', '頭', '人', ...","['0', '0', '0', '0', '0', '0', '1', '1', '1', ..."
ADD58,雨下過就停，雲散了就不會再埋堆，相同地，時間過了就不會再返來。,True,"['雨', '下', '過', '就', '停', '，', '雲', '散', '了', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ADD59,從小到大我都生長於一個愉快、食飽無憂米的環境，屋企人都相處融洽。,True,"['從', '小', '到', '大', '我', '都', '生', '長', '於', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
ADD60,小時候我已經經常聽聞話男生發育比較晚，所以小學階段的女生成績比較好。,True,"['小', '時', '候', '我', '已', '經', '經', '常', '聽', ...","['0', '0', '0', '0', '0', '0', '0', '0', '1', ..."


In [10]:
# Stack the three tagged frames into one table.
df = pd.concat([train_df, test_df, additional_df])
# tokens / labels were stored as str(list) by the tagging cells. Parse them
# back with ast.literal_eval, which only accepts Python literals, instead of
# eval, which would execute any expression found in the cell text.
# Non-string cells are passed through unchanged.
df['tokens'] = df['tokens'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
# labels: string of a list of '0'/'1' strings -> list of ints
df['labels'] = df['labels'].apply(
    lambda x: [int(i) for i in ast.literal_eval(x)] if isinstance(x, str) else x
)
# rename labels to cantonese_tags
df = df.rename(columns={'labels': 'cantonese_tags'})
# Everything (train + test + additional rows) goes into the single 'train' split.
train_dataset = Dataset.from_pandas(df)
dataset = DatasetDict({
    'train': train_dataset
})

In [11]:
# Display the combined frame; tokens and cantonese_tags are now real lists.
df

Unnamed: 0_level_0,sentence,has_cantonese,tokens,cantonese_tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ASTRI0000,仍記得小學下課的時候，我總愛跟表弟到草推裏捉蠶蟲，每當打開葉子時看到蟲子，心中總有無限的喜悅。,False,"[仍, 記, 得, 小, 學, 下, 課, 的, 時, 候, ，, 我, 總, 愛, 跟, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
ASTRI0001,我獨自站在火山的腳下，望著天，靜候著山上的紅川湧流將我掩沒的一刻。,False,"[我, 獨, 自, 站, 在, 火, 山, 的, 腳, 下, ，, 望, 著, 天, ，, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
ASTRI0002,來到百生的學校，我並不害怕，反而覺得這學校比之前的學校大，感覺很好。,False,"[來, 到, 百, 生, 的, 學, 校, ，, 我, 並, 不, 害, 怕, ，, 反, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
ASTRI0003,今天是平安夜，街道上火樹銀花、燈火通明；汽車熙來攘往、如即如梭。,False,"[今, 天, 是, 平, 安, 夜, ，, 街, 道, 上, 火, 樹, 銀, 花, 、, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
ASTRI0004,「借問酒家何處有，牧童遙指杏花村。」然而在現在，這樣的事情畫意又從何而尋呢。,False,"[「, 借, 問, 酒, 家, 何, 處, 有, ，, 牧, 童, 遙, 指, 杏, 花, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
ADD57,死亡是生存的對頭人，人們為了抵抗死亡，頑強地死撐下去；但人們始終敵不過命運的擺弄，乖乖而又無...,True,"[死, 亡, 是, 生, 存, 的, 對, 頭, 人, ，, 人, 們, 為, 了, 抵, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
ADD58,雨下過就停，雲散了就不會再埋堆，相同地，時間過了就不會再返來。,True,"[雨, 下, 過, 就, 停, ，, 雲, 散, 了, 就, 不, 會, 再, 埋, 堆, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."
ADD59,從小到大我都生長於一個愉快、食飽無憂米的環境，屋企人都相處融洽。,True,"[從, 小, 到, 大, 我, 都, 生, 長, 於, 一, 個, 愉, 快, 、, 食, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
ADD60,小時候我已經經常聽聞話男生發育比較晚，所以小學階段的女生成績比較好。,True,"[小, 時, 候, 我, 已, 經, 經, 常, 聽, 聞, 話, 男, 生, 發, 育, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, ..."


In [64]:
# Persist the DatasetDict (single 'train' split) under data/nlptea_dataset.
dataset.save_to_disk('data/nlptea_dataset')

Saving the dataset (1/1 shards): 100%|██████████| 2062/2062 [00:00<00:00, 181340.13 examples/s]


In [13]:
# Also export the combined frame as CSV.
# NOTE(review): tokens / cantonese_tags hold Python lists at this point, so they
# will presumably be written as their text form and need parsing when read back.
df.to_csv('data/nlptea.csv')