In [19]:
import pandas as pd
from datasets import load_dataset

In [20]:
# Load the IWSLT 2017 Chinese-English translation corpus via Hugging Face
# `datasets`; the splits used below are 'train', 'validation' and 'test'.
dataset = load_dataset(path='iwslt2017', name='iwslt2017-zh-en')

Found cached dataset iwslt2017 (/Users/kangwangkai/.cache/huggingface/datasets/iwslt2017/iwslt2017-zh-en/1.0.0/03ce9110373117c6f6687719f49f269486a8cd49dcad2527993a316cd4b6ad49)


  0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
# Turn each split's 'translation' column into its own DataFrame
# (the frames expose the sentence pairs as `en` / `zh` columns).
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split]['translation'])
    for split in ('train', 'validation', 'test')
)

In [22]:
import spacy
import jieba
import nltk

In [23]:
# Load the small English and Chinese spaCy pipelines; their part-of-speech
# tags (token.pos_) drive the translation-quality heuristic below.
en_nlp = spacy.load(name="en_core_web_sm")
zh_nlp = spacy.load(name="zh_core_web_sm")

In [24]:
def is_well_translated(en_text, zh_text, min_ratio=0.5):
    """Heuristically judge whether an English/Chinese pair is a plausible translation.

    Both texts are analysed with the module-level spaCy pipelines ``en_nlp``
    and ``zh_nlp``. Two symmetric ratios (smaller count / larger count) are
    computed from the part-of-speech tags:

    * content tokens -- tokens whose ``pos_`` is neither ``PUNCT`` nor ``SPACE``;
    * nouns -- tokens whose ``pos_`` is ``NOUN`` or ``PROPN``.

    The pair is accepted when either ratio reaches ``min_ratio``. When neither
    side contains a noun, the pair is accepted as long as both sides have at
    least one content token.

    Args:
        en_text: English sentence (string passed to ``en_nlp``).
        zh_text: Chinese sentence (string passed to ``zh_nlp``).
        min_ratio: Minimum min/max count ratio for either criterion to accept
            the pair. Defaults to 0.5, the value previously hard-coded.

    Returns:
        bool: True if the pair passes the heuristic, False otherwise.
    """
    ignored_pos = ('PUNCT', 'SPACE')
    noun_pos = ('PROPN', 'NOUN')

    en_doc = en_nlp(en_text)
    zh_doc = zh_nlp(zh_text)

    en_total_count = sum(1 for token in en_doc if token.pos_ not in ignored_pos)
    zh_total_count = sum(1 for token in zh_doc if token.pos_ not in ignored_pos)

    # A side without any content token (empty, whitespace-only or
    # punctuation-only) cannot be a translation of the other side. Checking
    # the minimum -- not only the maximum -- stops such pairs from slipping
    # through the "no nouns on either side" shortcut below.
    if min(en_total_count, zh_total_count) == 0:
        return False

    en_noun_count = sum(1 for token in en_doc if token.pos_ in noun_pos)
    zh_noun_count = sum(1 for token in zh_doc if token.pos_ in noun_pos)
    max_noun_count = max(en_noun_count, zh_noun_count)

    # No nouns on either side: the noun ratio is undefined (0/0), so the
    # pair is accepted without consulting the ratios.
    if max_noun_count == 0:
        return True

    length_ratio = min(en_total_count, zh_total_count) / max(en_total_count, zh_total_count)
    noun_ratio = min(en_noun_count, zh_noun_count) / max_noun_count
    return length_ratio >= min_ratio or noun_ratio >= min_ratio

In [25]:
# Flag every training pair with the translation-quality heuristic.
df_train['well_translated'] = df_train.apply(
    lambda pair: is_well_translated(pair['en'], pair['zh']),
    axis=1,
)

In [29]:
# Inspect the training pairs the heuristic rejected
# ('well_translated' is a boolean column, so `~` selects the False rows).
df_train[~df_train['well_translated']]

Unnamed: 0,en,zh,well_translated,tokenized
135,So you could imagine they were very excited ab...,他们当时非常激动,False,"[So, you, could, imagine, they, were, very, ex..."
168,What we did was we found what were the locally...,我们还是就地取材,False,"[What, we, did, was, we, found, what, were, th..."
172,And you can see that it's a lot cleaner burnin...,很明显炭砖清洁得多,False,"[And, you, can, see, that, it, 's, a, lot, cle..."
197,which is also remarkable about this technology...,这项技术妙在 容易转让,False,"[which, is, also, remarkable, about, this, tec..."
263,We live in one world.,而是一个。,False,"[We, live, in, one, world, .]"
...,...,...,...,...
230477,There's a lot of information here.,这里包罗万象,False,"[There, 's, a, lot, of, information, here, .]"
230717,"Not explicitly, but implicitly.",虽然没有明确要求，但是 他们就是实际的执行者。,False,"[Not, explicitly, ,, but, implicitly, .]"
231028,We speak differently.,外向者和内向者的说话方式也不同。,False,"[We, speak, differently, .]"
231220,We survey the native tree species of the place.,我们调查当地树种,False,"[We, survey, the, native, tree, species, of, t..."


In [30]:
# Display the training frame; pandas truncates the rendering to the first
# and last rows, as the output below shows.
df_train

Unnamed: 0,en,zh,well_translated,tokenized
0,"Thank you so much, Chris. And it's truly a gre...",非常谢谢，克里斯。的确非常荣幸 能有第二次站在这个台上的机会，我真是非常感激。,True,"[Thank, you, so, much, ,, Chris, ., And, it, '..."
1,"I have been blown away by this conference, and...",这个会议真是让我感到惊叹不已，我还要谢谢你们留下的 关于我上次演讲的精彩评论,True,"[I, have, been, blown, away, by, this, confere..."
2,"And I say that sincerely, partly because I ne...",我是非常真诚的，部分原因是因为----我的确非常需要！ 你设身处地为我想想！,True,"[And, I, say, that, sincerely, ,, partly, beca..."
3,I flew on Air Force Two for eight years.,我坐了8年的空军二号。,True,"[I, flew, on, Air, Force, Two, for, eight, yea..."
4,Now I have to take off my shoes or boots to ge...,不过现在上飞机前我则要脱掉我的鞋子,True,"[Now, I, have, to, take, off, my, shoes, or, b..."
...,...,...,...,...
231261,It's a tiny jungle party.,就像这个小型的“丛林聚会”,True,"[It, 's, a, tiny, jungle, party, .]"
231262,This forest grows as a collective.,这片森林聚集了各种树种,True,"[This, forest, grows, as, a, collective, .]"
231263,If the same trees -- same species -- would hav...,如果将同一种树 同一个物种 分开种植的话 它不会长得这么快,True,"[If, the, same, trees, --, same, species, --, ..."
231264,And this is how we create a 100-year-old fores...,这就是在十年之内 种出一片百岁森林的办法,True,"[And, this, is, how, we, create, a, 100-year-o..."


In [149]:
# Flag every validation pair with the same translation-quality heuristic.
df_val['well_translated'] = df_val.apply(
    lambda pair: is_well_translated(pair['en'], pair['zh']),
    axis=1,
)

In [27]:
# Tokenise the English side with NLTK. Only the 'en' column is needed, so
# apply over that Series instead of building a row object for every record
# with DataFrame.apply(axis=1); the result is the same Series of token lists.
df_train['tokens'] = df_train['en'].apply(nltk.word_tokenize)

In [154]:
# Persist the annotated training frame, omitting the index column.
# NOTE(review): list-valued columns such as 'tokens' are presumably written
# as their string repr in CSV -- confirm downstream readers parse them back.
df_train.to_csv(path_or_buf="tokenized_train_data.csv", index=False)