In [None]:
#在这个内核中，我想说明在构建深度学习NLP模型时如何进行有意义的预处理。
#我开始的两条黄金法则:
#1.使用标准的预处理步骤不喜欢阻止或stopword切除时pre-trained嵌入
#一些您可能使用标准的预处理步骤时基于字数等特征提取(例如TFIDF)删除stopwords,引发等。原因很简单:你宽松的有价值的信息,这将有助于你神经网络图的东西。
#2.让你的词汇量尽可能接近嵌入
#我将集中在这个笔记本，如何实现这一点。以GoogleNews预培训的嵌入式为例，这种选择没有更深层次的原因。

In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
# 加载数据
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
print('Train shape:', train.shape)
print('Test shape:', test.shape)

Train shape: (1306122, 3)
Test shape: (56370, 2)


In [3]:
#我将使用下面的函数来跟踪我们的训练词汇，它将遍历我们的所有文本并计算包含的单词的出现次数。
def build_vocab(sentences, verbose = True):
    # 参数  sentences list of list of words，就是二维的
    # 返回值 对应  词和词的次数 的字典
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [4]:
#因此，让我们填充词汇表并显示前5个元素及其计数。注意，现在我们可以使用progess_apply查看进度条
sentences = train['question_text'].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|███| 1306122/1306122 [00:04<00:00, 266825.76it/s]
100%|███| 1306122/1306122 [00:05<00:00, 254818.90it/s]


{'How': 261930, 'did': 33489, 'Quebec': 97, 'nationalists': 91, 'see': 9003}


In [6]:
#接下来，我们导入我们稍后要在模型中使用的Embedding。为了说明这一点，我在这里使用GoogleNews
from gensim.models import KeyedVectors

news_path = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embedding_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [15]:
#接下来，我定义一个函数来检查词汇表和嵌入之间的交集。它将输出一个out of vocabulary (oov)单词列表，我们可以使用它来改进我们的预处理
import operator
def check_coverage(vocab, embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:
            oov[word] = vocab[word]
            i += vocab[word]
            pass
    
    print('Found embeddings for {:.2%} of vocab'.format(len(a) /  len(vocab)))
    print('Found embeddings for {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]#取axis=1维度进行排序，并换为逆序
    
    return sorted_x

In [16]:
oov = check_coverage(vocab, embedding_index)

100%|█████| 508823/508823 [00:01<00:00, 374703.65it/s]


Found embeddings for 24.31% of vocab
Found embeddings for 78.75% of all text


In [17]:
#哎哟，只有24%的词汇表会有嵌入，这使得21%的数据或多或少是无用的。所以让我们来看看并开始改进。为此，我们可以很容易地看一看顶部的oov单词
oov[:10]

[('to', 403183),
 ('a', 402682),
 ('of', 330825),
 ('and', 251973),
 ('India?', 16384),
 ('it?', 12900),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202)]

In [18]:
#：首先是“to”。为什么?仅仅是因为“to”在训练GoogleNews嵌入时被删除了。稍后我们将对此进行修复，因为现在我们要注意标点符号的分割，因为这似乎也是一个问题。但是，我们该如何处理标点符号呢?我们是想删除标点符号，还是将其视为一种标记?我想说:这要看情况。如果标记有嵌入，保留它，如果没有，我们就不再需要它了。我们检查:
print('?' in embedding_index)
print('&' in embedding_index)

False
True


In [19]:
#有趣。虽然“&”出现在谷歌新闻的嵌入中，“?”却不是。因此，我们基本上定义了一个函数，它分割“&”并删除其他标点符号。

def clean_text(x):
    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')#f''解释：你不再需要直接调用一个字符串的.format()方法，但是要简单地用前缀f来标记格式以及内联最终字符串中你想要包括的表达式，不然它们就会被期望着去提供如同你从.format()函数得到的相同功能。这些格式化字符串也在文档中被称为“f字符串”。
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

In [21]:
train['question_text'] =  train['question_text'].progress_apply(lambda x: clean_text(x))
sentences = train['question_text'].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|████| 1306122/1306122 [00:16<00:00, 79190.25it/s]
100%|███| 1306122/1306122 [00:05<00:00, 218846.61it/s]


In [23]:
oov = check_coverage(vocab, embedding_index)

100%|█████| 253623/253623 [00:00<00:00, 329145.72it/s]


Found embeddings for 57.38% of vocab
Found embeddings for 89.99% of all text


In [25]:
#好了!我们能够增加我们的嵌入比从24%到57%仅仅通过处理穿刺。好的，让我们检查一下这些单词。
oov[:10]

[('to', 406298),
 ('a', 403852),
 ('of', 332964),
 ('and', 254081),
 ('2017', 8781),
 ('2018', 7373),
 ('10', 6642),
 ('12', 3694),
 ('20', 2942),
 ('100', 2883)]

In [30]:
#嗯，似乎数字也是个问题。让我们检查一下前10个嵌入来获得线索。
for i in range(10):
    print(embedding_index.index2entity[i])

AttributeError: 'KeyedVectors' object has no attribute 'index2entity'

In [31]:
#为什么里面有"##" ?原因很简单，因为作为一个再处理，所有大于9的数字都被hashs替换了。即成为# #,123变成# # #或15.80€变成# #,# #€。因此，让我们模拟这个预处理步骤来进一步改进我们的嵌入式覆盖率
import re

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [33]:
train['question_text'] = train['question_text'].progress_apply(lambda x: clean_numbers(x))
sentences = train['question_text'].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|████| 1306122/1306122 [00:16<00:00, 80227.57it/s]
100%|███| 1306122/1306122 [00:05<00:00, 249280.32it/s]
100%|███| 1306122/1306122 [00:05<00:00, 259681.59it/s]


In [37]:
oov = check_coverage(vocab,embedding_index)

100%|█████| 242997/242997 [00:00<00:00, 319105.53it/s]


Found embeddings for 60.41% of vocab
Found embeddings for 90.75% of all text


In [24]:
#好了!另一个3%的增长。现在就像处理撞击一样，但是每一点都有帮助。让我们检查oov单词
oov[:20]

[('to', 406298),
 ('a', 403852),
 ('of', 332964),
 ('and', 254081),
 ('2017', 8781),
 ('2018', 7373),
 ('10', 6642),
 ('12', 3694),
 ('20', 2942),
 ('100', 2883),
 ('15', 2762),
 ('12th', 2551),
 ('11', 2356),
 ('30', 2163),
 ('18', 2066),
 ('50', 1993),
 ('16', 1589),
 ('14', 1533),
 ('17', 1505),
 ('13', 1390)]

In [38]:
#好了，现在我们来处理一下在使用美式/英式vocab时常见的拼写错误，并将一些“现代”单词替换为“social media”。此外，我们将简单地删除“a”、“to”、“and”和“of”等词，因为在培训GoogleNews嵌入式时，这些词显然已被下采样。
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))#编写一个正则式
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

In [39]:
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

In [47]:
train["question_text"] = train["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
sentences = train["question_text"].progress_apply(lambda x: x.split())
to_remove = ['a','to','of','and']
sentences = [[word for word in sentence if not word in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)


  0%|                     | 0/1306122 [00:00<?, ?it/s]
  1%|     | 11727/1306122 [00:00<00:11, 116416.34it/s]
  2%|     | 30571/1306122 [00:00<00:09, 131288.09it/s]
  4%|▏    | 49145/1306122 [00:00<00:08, 143715.22it/s]
  5%|▎    | 68121/1306122 [00:00<00:08, 154721.00it/s]
  7%|▎    | 87093/1306122 [00:00<00:07, 163476.85it/s]
  8%|▎   | 105626/1306122 [00:00<00:07, 169118.02it/s]
 10%|▍   | 124101/1306122 [00:00<00:06, 173166.11it/s]
 11%|▍   | 143467/1306122 [00:00<00:06, 178484.38it/s]
 12%|▍   | 163252/1306122 [00:00<00:06, 183508.04it/s]
 14%|▌   | 181392/1306122 [00:01<00:06, 181354.71it/s]
 15%|▌   | 199386/1306122 [00:01<00:06, 174507.78it/s]
 17%|▋   | 216793/1306122 [00:01<00:06, 163104.15it/s]
 18%|▋   | 234639/1306122 [00:01<00:06, 167386.47it/s]
 19%|▊   | 254011/1306122 [00:01<00:06, 174388.73it/s]
 21%|▊   | 273563/1306122 [00:01<00:05, 179869.00it/s]
 22%|▉   | 291702/1306122 [00:01<00:05, 177298.68it/s]
 24%|▉   | 311585/1306122 [00:01<00:05, 182867.13it/s]
 25%|█   

In [48]:
oov = check_coverage(vocab, embedding_index)


  0%|                      | 0/242935 [00:00<?, ?it/s]
  0%|          | 160/242935 [00:00<02:32, 1588.38it/s]
  0%|          | 548/242935 [00:00<02:05, 1928.33it/s]
  1%|         | 1552/242935 [00:00<01:34, 2543.84it/s]
  2%|▏        | 4969/242935 [00:00<01:07, 3520.90it/s]
  4%|▎        | 9448/242935 [00:00<00:47, 4864.76it/s]
  6%|▍       | 15121/242935 [00:00<00:33, 6701.57it/s]
  9%|▊       | 22964/242935 [00:00<00:23, 9233.08it/s]
 14%|▉      | 34243/242935 [00:00<00:16, 12739.90it/s]
 20%|█▍     | 48984/242935 [00:00<00:11, 17545.09it/s]
 27%|█▊     | 64970/242935 [00:01<00:07, 23930.84it/s]
 34%|██▎    | 82019/242935 [00:01<00:04, 32242.48it/s]
 42%|██▍   | 100951/242935 [00:01<00:03, 42906.98it/s]
 50%|██▉   | 120275/242935 [00:01<00:02, 55932.64it/s]
 59%|███▌  | 143094/242935 [00:01<00:01, 72259.87it/s]
 66%|███▉  | 161090/242935 [00:01<00:01, 80879.94it/s]
 76%|███▊ | 184088/242935 [00:01<00:00, 100312.85it/s]
 83%|████▏| 202109/242935 [00:01<00:00, 104330.03it/s]
 93%|████

Found embeddings for 60.43% of vocab
Found embeddings for 98.96% of all text


In [49]:
#我们发现，尽管我们改进了所有文本的嵌入量，从89%提高到99%。让我们再检查一遍oov单词
oov[:20]

[('bitcoin', 987),
 ('Quorans', 858),
 ('cryptocurrency', 822),
 ('Snapchat', 807),
 ('btech', 632),
 ('Brexit', 493),
 ('cryptocurrencies', 481),
 ('blockchain', 474),
 ('behaviour', 468),
 ('upvotes', 432),
 ('programme', 402),
 ('Redmi', 379),
 ('realise', 371),
 ('defence', 364),
 ('KVPY', 349),
 ('Paytm', 334),
 ('grey', 299),
 ('mtech', 281),
 ('Btech', 262),
 ('bitcoins', 254)]

In [50]:
#看起来不错。没有明显的oov词，我们可以快速修复。谢谢你的阅读和快乐的kaggling

In [51]:
train.to_csv('process_train.csv')

In [54]:
import pickle

with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)
    
# def load_obj(name ):
#     with open('obj/' + name + '.pkl', 'rb') as f:
#         return pickle.load(f)

In [56]:
f = open('vocab.txt','w', encoding='utf-8')
f.write(str(vocab))
f.close()