In [58]:
import gc
import operator 
import re
import keras
import pandas as pd
import numpy as np
import seaborn as snss
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
from tqdm import tqdm
tqdm.pandas()

## Loading data

In [59]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
df = pd.concat([train ,test], sort=False)
print("Number of texts: ", df.shape[0])

Number of texts:  1362492


## Loading embeddings

In [60]:
def load_emb(filename):
    def get_coefs(word, *arr): 
        return word, np.asarray(arr, dtype='float32')
    if "wiki-news-300d-1M.vec" in filename:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(filename) if len(o)>100)
    else:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(filename, encoding='latin'))
    return embeddings_index

glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
print("Extracting GloVe embedding")
emb_glove = load_emb(glove)

Extracting GloVe embedding


## Vocabulary and Coverage functions

In [61]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
    print('Found embeddings for {:.3%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.3%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    return unknown_words

## Starting point

In [62]:
vocab = build_vocab(df['question_text'])
print("Glove : ")
oov_glove = check_coverage(vocab, emb_glove)

Glove : 
Found embeddings for 32.773% of vocab
Found embeddings for  88.149% of all text


Apply lowerization (necessary if using paragram)
> Fasttext and GloVe understand capitalization, but you still win a bit.

In [63]:
df['lowered_question'] = df['question_text'].apply(lambda x: x.lower())

If you apply lowerization, you lost a bit of informations on other embeddings:
> Therer are words known that are known with upper letters and unknown without. Let us fix that:
* word.lower() takes the embedding of word if word.lower() doesn't have an embedding

In [64]:
def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")

In [65]:
print("Glove: ")
oov_glove = check_coverage(vocab, emb_glove)
add_lower(emb_glove, vocab)
oov_glove = check_coverage(vocab, emb_glove)

Glove: 
Found embeddings for 32.773% of vocab
Found embeddings for  88.149% of all text
Added 15199 words to embedding
Found embeddings for 33.029% of vocab
Found embeddings for  88.164% of all text


## What's wrong?

In [66]:
oov_glove[:10]

[('India?', 17082),
 ('it?', 13436),
 ("What's", 12985),
 ('do?', 9112),
 ('life?', 8074),
 ('you?', 6553),
 ('me?', 6485),
 ('them?', 6421),
 ('time?', 5994),
 ('world?', 5632)]

First faults appearing are:

* Contractions
* Words with punctuation in them

> Let us correct that.

## Contractions

In [67]:
contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                "could've": "could have", "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
                "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is",
                "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have",
                "i'm": "i am", "i've": "i have", "i'd": "i would", "i'd've": "i would have",
                "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have",
                "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
                "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not",
                "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
                "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have",
                "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is",
                "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
                "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
                "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is",
                "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have",
                "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have",
                "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will",
                "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [68]:
def known_contractions(emb):
    known = []
    for contract in contractions:
        if contract in emb:
            known.append(contract)
    return known

In [69]:
print("- Known Contractions -")
print("  Glove:")
print(known_contractions(emb_glove))

- Known Contractions -
  Glove:
["can't", "'cause", "didn't", "doesn't", "don't", "i'd", "i'll", "i'm", "i've", "it's", "ma'am", "o'clock", "that's", "you'll", "you're"]


Not a lot of contractions are known. (FastText knows none)
> We use the map to replace them

In [70]:
def clean_contractions(text, mapping):
    specials = ['”', '´', '‘', '“', '’', '¨', '‛', "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

In [71]:
df['treated_question'] = df['lowered_question'].progress_apply(lambda x: clean_contractions(x, contractions))

100%|██████████| 1362492/1362492 [00:04<00:00, 279577.70it/s]


In [72]:
df['treated_question'].head()

0    how did quebec nationalists see their province...
1    do you have an adopted dog, how would you enco...
2    why does velocity affect time? does velocity a...
3    how did otto von guericke used the magdeburg h...
4    can i convert montra helicon d to a mountain b...
Name: treated_question, dtype: object

In [73]:
vocab = build_vocab(df['treated_question'])
print("Glove: ")
oov_glove = check_coverage(vocab, emb_glove)

Glove: 
Found embeddings for 30.630% of vocab
Found embeddings for  88.563% of all text


## Now, let us deal with special characters

In [74]:
# Get set of all no_alnum in dataset
tmp = []
for x in df.question_text:
    for c in x:
        if not c.isalnum():
            tmp.append(c)
for x in test.question_text:
    for c in x:
        if not c.isalnum():
            tmp.append(c)
puncs = set(tmp) - set(' ')
print(puncs)

{'%', 'ំ', '♨', '⎛', '?', '√', '∝', '„', '=', '̈́', '@', '◌', 'ौ', '−', '¨', 'া', '㏑', '£', '͗', '“', '–', 'ு', '\x10', '≈', '∩', '̢', 'ా', '▒', '̳', '\xad', '\x8d', '＝', '∆', '।', '̾', '̼', '✅', '̕', '\u202c', '்', 'ះ', '͔', '《', '↓', '̓', '¯', '્', '्', 'ุ', '̖', '\uf0d8', '‛', '̶', '\u2061', '₦', '∴', '̔', '̤', '{', '®', '̴', 'ั', 'ూ', '∂', 'ि', '*', 'ॄ', '͋', '∞', 'ा', 'ి', '͆', '̃', '・', '̒', '℃', '∖', '₊', '̦', '\x06', '~', '↑', '\x1b', "'", '^', '‘', '¡', '˜', '／', '്', '͑', '[', '⦁', '»', '❓', '☁', '∗', '∼', '̮', 'ோ', '̷', '͒', '‰', 'ા', '㏒', '∫', '̭', '\u200c', 'ை', 'ٌ', '€', 'ি', '∑', '่', 'ো', '̉', 'ู', '∛', '\x01', '…', '͡', '§', 'ା', '⚧', '̜', 'ਾ', 'ॉ', '′', '͎', '⟨', '̈', '<', '？', '}', 'ు', '✔', 'ਿ', 'ี', '、', 'ं', '#', '－', '̽', '\u200e', '̌', '∈', 'ா', 'े', 'ी', '\x13', 'ੁ', '÷', '♡', '，', '̀', '〗', '_', '＄', '†', '¬', '̱', '͘', '⎞', '¸', '\u202a', ')', 'ీ', '\\', '´', '̘', 'ֿ', '⬇', '×', '▾', '⁻', '⊆', '→', 'ិ', 'ৃ', '̑', '̵', 'ّ', '̹', '/', '้', '\u200f', 'ः', '͈', '∡

In [75]:
# Get set of all no_alpha in dataset
tmp = []
for x in df.question_text:
    for c in x:
        if not c.isalpha():
            tmp.append(c)
for x in test.question_text:
    for c in x:
        if not c.isalpha():
            tmp.append(c)
no_alpha = set(tmp) - set(' ')
print(no_alpha)

{'%', 'ំ', '♨', '⎛', '?', '√', '∝', '„', '=', '̈́', '@', '◌', 'ौ', '−', '¨', 'া', '㏑', '£', '2', '͗', '“', '–', 'ு', '\x10', '≈', '∩', '̢', '⁸', 'ా', '▒', '̳', '\xad', '\x8d', '＝', '२', '∆', '।', '̾', '̼', '✅', '̕', '6', '₃', '\u202c', '்', 'ះ', '͔', '《', '↓', '̓', '¯', '્', '्', 'ุ', '̖', '\uf0d8', '‛', '4', '̶', '\u2061', '₦', '∴', '̔', '̤', '{', '®', '̴', 'ั', 'ూ', '∂', 'ि', '*', 'ॄ', '¼', '͋', '∞', 'ा', 'ి', '͆', '9', '̃', '・', '̒', '℃', '∖', '₊', '̦', '\x06', '~', '↑', '\x1b', "'", '^', '‘', '¡', '˜', '／', '്', '͑', '[', '⦁', '»', '❓', '☁', '∗', '∼', '̮', 'ோ', '̷', '͒', '‰', 'ા', '㏒', '∫', '̭', '\u200c', 'ை', 'ٌ', '€', 'ি', '∑', '่', 'ো', '̉', 'ู', '∛', '\x01', '…', '͡', '§', 'ା', '⚧', '̜', 'ਾ', 'ॉ', '′', '͎', '⟨', '̈', '<', '？', '}', 'ు', '✔', 'ਿ', 'ี', '³', '、', 'ं', '#', '－', '̽', '\u200e', '̌', '∈', 'ா', 'े', 'ी', '\x13', 'ੁ', '÷', '♡', '，', '̀', '〗', '₆', '_', '＄', '†', '¬', '̱', '͘', '⎞', '¸', '\u202a', '₁', ')', 'ీ', '\\', '´', '̘', 'ֿ', '⬇', '×', '▾', '⁻', '⁷', '⊆', '→', 'ិ

In [76]:
def unknown_punc(emb, puncs):
    unknown = ''
    for p in puncs:
        if p not in emb:
            unknown += f'"{p}": "", '
    return '{' + unknown + '}'

In [77]:
print("Glove: ")
print(unknown_punc(emb_glove, puncs))
print(unknown_punc(emb_glove, no_alpha))

Glove: 
{"ំ": "", "♨": "", "⎛": "", "√": "", "∝": "", "„": "", "̈́": "", "◌": "", "ौ": "", "−": "", "¨": "", "া": "", "㏑": "", "£": "", "͗": "", "“": "", "–": "", "ு": "", "": "", "≈": "", "∩": "", "̢": "", "ా": "", "▒": "", "̳": "", "­": "", "": "", "＝": "", "∆": "", "।": "", "̾": "", "̼": "", "✅": "", "̕": "", "‬": "", "்": "", "ះ": "", "͔": "", "《": "", "↓": "", "̓": "", "¯": "", "્": "", "्": "", "ุ": "", "̖": "", "": "", "‛": "", "̶": "", "⁡": "", "₦": "", "∴": "", "̔": "", "̤": "", "®": "", "̴": "", "ั": "", "ూ": "", "∂": "", "ि": "", "ॄ": "", "͋": "", "∞": "", "ा": "", "ి": "", "͆": "", "̃": "", "・": "", "̒": "", "℃": "", "∖": "", "₊": "", "̦": "", "": "", "↑": "", "": "", "‘": "", "¡": "", "˜": "", "／": "", "്": "", "͑": "", "⦁": "", "»": "", "❓": "", "☁": "", "∗": "", "∼": "", "̮": "", "ோ": "", "̷": "", "͒": "", "‰": "", "ા": "", "㏒": "", "∫": "", "̭": "", "‌": "", "ை": "", "ٌ": "", "€": "", "ি": "", "∑": "", "่": "", "ো": "", "̉": "", "ู": "", "∛": "", "": "", "…": "", "

Some characters are unknown.
> We use a map to replace unknown characters with known ones.

> We make sure there are spaces between words and punctuation

In [78]:
punc_mapping = {"ँ": "", "◦": "", "̆": "", "✏": "", "": "", "ี": "", "♡": "o", "△": "", "⇒": "",
                 "": "", "＄": " dollar ", "∛": " sqrt ", "→": "", "͚": "", "️": "", "⟩": "", "¡": "i", "∴": " so ",
                 "್": "", "‬": "", "̘": "", "ា": "", "¿": "?", "⧼": "", "": "", "®": " r ", "∫": " calculus ",
                 "ौ": "", "∼": "", "َ": "", "ూ": "", "”": "'", "̙": "", "⋅": "", "̷": "", "̓": "",
                 "、": "", "⬇": "", "̔": "", "∗": "*", "͕": "", "͡": "", "̿": "", "‌": "", "͜": "", "̦": "",
                 "": "", "♨": "", "̮": "", "✓": " sqrt ", "ௌ": "", "»": " ", "➡": "", "̼": "", "̌": "",
                 "̢": "", "？": "?", "": "", "ৃ": "", "ం": "", "⊥": "", "̧": "", "ਾ": "", "》": " ",
                 "ਂ": "", "ិ": "", "∨": "", "ী": "", "े": "", "⧽": "", "⁡": "", "∀": " any ", "ु": "",
                 "ٌ": "", "₦": " naira ", "∡": " angle ", "̸": "", "़": "", "̃": "", "": "", "͎": "", "∧": "", "，": "",
                 "÷": "/", "،": "", "↓": "", "✔": "", "∩": " intersection ", "⁠": "", "¶": "", "ೋ": "", "͖": "",
                 "ে": "", "☝": "", "«": " ", "": "", "ं": "", "《": " ", "ॉ": "", "）": "", "͉": "",
                 "⟨": "", "": "", "ْ": "", "‏": "", "₱": " peso ", "°": "", "͋": "", "✌": "", "্": "", "᠌": "",
                 "♣": "", "×": "x", "ো": "", "؟": "?", "˜": "", "̩": "", "̱": "", "̺": "", "͔": "",
                 "▾": "", "⎛": "", "ొ": "", "்": "", "̊": "", "̥": "", "ੁ": "", "่": "", "﻿": "", "˚": "",
                 "㏒": " log ", "ా": "", "ા": "", "™": " tm ", "ِ": "", "∈": "", "⃗": "", "≅": "=", "̵": "", "♭": "",
                 "ಾ": "", "；": ".", "̒": "", "ி": "", "´": "'", "＞": ">", "̣": "", "ุ": "", "ّ": "", "▒": "",
                 "।": "", "–": "-", "∖": "", "̰": "", "ॄ": "", "‘": "'", "̶": "-", "ो": "", "！": "!", "☺": "",
                 "̎": "", "″": "", "＝": "=", "˂": "", "਼": "", "ः": "", "ֿ": "", "♏": "", "¦": "", "̝": "",
                 "∪": " union ", "̈": "", "́": "", "‐": "-", "“": "'", "ാ": "", "≤": "<=", "ੀ": "", "": "", "\n": "",
                 "◌": "", "§": "", "ृ": "", "ு": "", "ा": "", "√": " sqrt ", "¥": " yen ", "‑": "-", "￼": "", "": "",
                 "्": "", "̭": "", "": "", "¬": "", "͌": "", "̍": "", "„": "", "ី": "", "•": "", "↑": "",
                 "͘": "", "": "", "͇": "", "̫": "", "ா": "", "͛": "", "︠": "", "⁻": "-", "᾽": "", "ি": "",
                 "̟": "", "│": "|", "̕": "", "͊": "", "∠": " angle ", "̑": "", "‎": "", "㏑": " ln ", "☁": "", "ಿ": "",
                 "ी": "", "̀": "", "়": "", "̐": "", "☉": "", "": "", "⚧": "", "£": " pound ", "・": "", "⋯": "...",
                 "−": "-", "∅": " ", "¸": ",", "̋": "", "̲": "", "⎝": "", "͆": "", "〗": "]", "／": "", "ั": "",
                 "：": "", "ோ": "", "̽": "", "∑": " sum ", "©": " c ", "": "", "്": "", "ು": "", "ు": "", "్": "",
                 "∞": " infinity ", "ि": "", "⊨": "", "̈́": "", "̚": "", "̖": "", "̡": "", "⊆": "", "·": ".", "✅": "",
                 "ͅ": "", "ੰ": "", "̾": "", "…": "", "＾": "^", "≈": "=", "—": "-", "♀": "", "❤": "", "્": "",
                 "ା": "", "¢": "", "⎞": "", "ె": "", "​": "", "̻": "", "（": "", "‪": "", "≠": "!=", "ॢ": "",
                 "ં": "", "〖": "[", "­": "", "∂": "", "̬": "", "͐": "", "": "", "₊": "+", "℅": "%", "̛": "",
                 "‰": "", "ਿ": "", "͈": "", "́": "", "͂": "", "̞": "", "ి": "", "้": "", "̗": "", "ു": "",
                 "": "", "’": "'", "া": "", "ើ": "", "": "", "ះ": "", "」": "]", "︡": "", "ू": "",
                 "̳": "", "ை": "", "⊂": "", "∇": "", "≥": ">=", "̄": "", "₹": " e ", "̜": "", "̴": "",
                 "℃": "", "±": "+", "⌚": " time ", "≡": "", "̹": "", "̯": "", "′": "", "ీ": "", "ូ": "", "－": " ",
                 "「": "[", "̀": "", "¨": "'", "ॣ": "", "⦁": "", "€": " euro ", "❓": "?", "ู": "", "͗": "", "̅": "",
                 "̂": "", "͠": "", "̤": "", "្": "", "̉": "", "₩": "", "": "", "̪": "", "ै": "", "∘": "",
                 "ៃ": "", "͑": "", "ំ": "", "͒": "", "☹": "", "͝": "", "‛": "'", "⎠": "", "¯": "", "。": ".",
                 "∆": "", "ി": "", "̓": "", "∝": "", "†": "", "≱": "", "²": "2", 'θ': 'theta', "`": "'",
                 'α': 'alpha', 'à': 'a', 'β': 'beta', '³': '3', 'π': 'pi', "₁": "1", "₃": "3", "₆": "6", "¼": "1/4",
                 "⁷": "7", "¾": "3/4", "⁵": "5", "₅": "5", "½": "1/2", "₄": "4", "⅔": "2/3", "₂": "2", "¹": "1"}

In [79]:
single_quote = [k for k in punc_mapping if punc_mapping[k] == "'"]
print(single_quote)

['”', '´', '‘', '“', '’', '¨', '‛', '`']


In [80]:
punc_re = re.compile('(%s)' % '|'.join(punc_mapping.keys()))
def replace_punc(text):
    def replace(match):
        return punc_mapping[match.group(0)]
    return punc_re.sub(replace, text)

def sep_punc(x):
    for p in puncs:
        x = x.replace(p, f' {p} ')
    return x

def clean_special_chars(text, punc, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    for p in punc:
        text = text.replace(p, f' {p} ')
    return text

In [81]:
df['treated_question'] = df['treated_question'].progress_apply(lambda x: clean_special_chars(x, puncs, punc_mapping))
# 以下两步和 clean_special_chars 功能一样，但是效果不如 clean_special_chars，有待验证
# df['treated_question'] = df['treated_question'].progress_apply(lambda x: replace_punc(x))
# df['treated_question'] = df['treated_question'].progress_apply(lambda x: sep_punc(x))

100%|██████████| 1362492/1362492 [02:25<00:00, 9351.20it/s]


In [43]:
vocab = build_vocab(df['treated_question'])
print("Glove: ")
oov_glove = check_coverage(vocab, emb_glove)

Glove: 
Found embeddings for 69.213% of vocab
Found embeddings for  99.579% of all text


In [44]:
print(len(oov_glove))
oov_glove[:1000]

61581


[('quorans', 885),
 ('brexit', 542),
 ('cryptocurrencies', 525),
 ('redmi', 398),
 ('coinbase', 150),
 ('oneplus', 144),
 ('uceed', 126),
 ('demonetisation', 118),
 ('bhakts', 118),
 ('upwork', 117),
 ('pokémon', 117),
 ('machedo', 112),
 ('gdpr', 110),
 ('adityanath', 108),
 ('bnbr', 105),
 ('boruto', 105),
 ('alshamsi', 100),
 ('dceu', 94),
 ('iiest', 91),
 ('litecoin', 90),
 ('unacademy', 89),
 ('sjws', 89),
 ('zerodha', 85),
 ('qoura', 85),
 ('tensorflow', 82),
 ('fiancé', 76),
 ('lnmiit', 73),
 ('kavalireddi', 71),
 ('doklam', 70),
 ('muoet', 68),
 ('nicmar', 66),
 ('vajiram', 62),
 ('srmjee', 61),
 ('adhaar', 60),
 ('altcoin', 59),
 ('zebpay', 58),
 ('elitmus', 58),
 ('jiren', 56),
 ('altcoins', 56),
 ('awdhesh', 55),
 ('hackerrank', 54),
 ('ryzen', 51),
 ('koinex', 50),
 ('baahubali', 48),
 ('mhcet', 47),
 ('byju', 47),
 ('binance', 46),
 ('srmjeee', 44),
 ('beerus', 43),
 ('ftre', 42),
 ('skripal', 42),
 ('sgsits', 42),
 ('mhtcet', 40),
 ('hotstar', 40),
 ('gurugram', 40),
 ('b

What's still missing ?
> * Unknown words
> * Acronyms
> * Spelling mistakes

### We can correct manually most frequent mispells
For example, here are some mistakes and their frequency

    qoura : 85 times
    mastrubation : 38 times
    demonitisation : 30 times
    …

In [45]:
mispell = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling',
           'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
           'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu': 'youtube ',
           'qoura': 'quora', 'quorans': 'quora users', 'quoran': 'quora user', 'sallary': 'salary', 'whta': 'what',
           'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much',
           'howmany': 'how many', 'whydo': 'why do', 'doi': 'do i', 'thebest': 'the best', 'howdoes': 'how does',
           'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating',
           'pennis': 'penis', 'etherium': 'ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data',
           '2k15': '2015', '2k16': '2016', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend',
           'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
           'demonitization': 'demonetization', 'demonetisation': 'demonetization', 'pokémon': 'pokemon',
           'nanodegree': 'nano degree', 'brexit': 'british exit', 'cryptocurrencies': 'crypto currencies',
           'coinbase': 'coin base', 'oneplus': 'one plus', 'redmi': 'red mi', 'GDPR': 'general data protection regulation',
           'DCEU': 'dc extended universe', 'litecoin': 'lite coin', 'unacademy': 'non academy', 'altcoin': 'bitcoin alternative',
           'altcoins': 'bitcoin alternative', 'sjw': 'social justice warriors', 'sjws': 'social justice warriors',
           'fiancé': 'fiance', 'microservices': 'micro services', 'bitconnect': 'bit connect', 'codeforces': 'code forces',
           'wannacry': 'wanna cry', 'onedrive': 'one drive', 'airpods': 'air pods', 'twinflame': 'twin flame',
           'undergraduation': 'under graduation', 'cos2x': 'cos 2 x', 'yourquote': 'your quote', 'xiomi': 'xiaomi',
           'undertale': 'under tale', 'genderfluid': 'gender fluid', 'são': 'sao', 'chapterwise': 'chapter wise',
           'deepmind': 'deep mind', '': '', 'arrowverse': 'arrow verse', 'overbrace': ' ', 'tensorflow': 'tensor flow',
           'hackerrank': 'hacker rank', 'microservice': 'micro service', 'reactjs': 'react js', 'hackerearth': 'hacker earth',
           'fiancée': 'fiance', 'blockchains': 'block chains', 'beyoncé': 'beyonce', 'neuralink': 'neura link',
           'openai': 'open ai', 'zoomcar': 'zoom car', 'hyperconjugation': 'hyper conjugation', 'autoencoder': 'auto encoder',
           'webassembly': 'web assembly', 'quoras': 'quora', 'digilocker': 'digi locker', 'oversmart': 'over smart',
           'cryptocoins': 'crypto coins', 'crytocurrencies': 'cryto currencies', 'cyrptocurrency': 'cyrpto currency',
           'café': 'cafe', 'whatapp': 'whatsapp', 'gaslighter': 'gas lighter', 'darkweb': 'dark web', 'webnovel': 'web novel'}

In [46]:
def correct_spelling(x, mapping):
    for word in mapping.keys():
        x = x.replace(word, mapping[word])
    return x

In [47]:
df['treated_question'] = df['treated_question'].progress_apply(lambda x: correct_spelling(x, mispell))

100%|██████████| 1362492/1362492 [00:24<00:00, 56168.74it/s]


In [48]:
vocab = build_vocab(df['treated_question'])
print("Glove : ")
oov_glove = check_coverage(vocab, emb_glove)

Glove : 
Found embeddings for 69.237% of vocab
Found embeddings for  99.602% of all text


In [50]:
print(len(oov_glove))
oov_glove

61472


[('uceed', 126),
 ('bhakts', 118),
 ('upwork', 117),
 ('machedo', 112),
 ('gdpr', 110),
 ('adityanath', 108),
 ('bnbr', 105),
 ('boruto', 105),
 ('alshamsi', 100),
 ('dceu', 94),
 ('iiest', 91),
 ('warriorss', 89),
 ('zerodha', 85),
 ('lnmiit', 73),
 ('kavalireddi', 71),
 ('doklam', 70),
 ('muoet', 68),
 ('nicmar', 66),
 ('vajiram', 62),
 ('srmjee', 61),
 ('adhaar', 60),
 ('zebpay', 58),
 ('elitmus', 58),
 ('jiren', 56),
 ('awdhesh', 55),
 ('ryzen', 51),
 ('koinex', 50),
 ('baahubali', 48),
 ('mhcet', 47),
 ('byju', 47),
 ('binance', 46),
 ('srmjeee', 44),
 ('beerus', 43),
 ('ftre', 42),
 ('skripal', 42),
 ('sgsits', 42),
 ('mhtcet', 40),
 ('hotstar', 40),
 ('gurugram', 40),
 ('bipc', 39),
 ('lbsnaa', 38),
 ('bmsce', 38),
 ('jiofi', 38),
 ('sarahah', 36),
 ('ravula', 36),
 ('swachh', 36),
 ('obor', 35),
 ('usict', 34),
 ('patreon', 34),
 ('clickbait', 34),
 ('zenfone', 33),
 ('iisers', 33),
 ('bittrex', 32),
 ('chromecast', 32),
 ('pessat', 31),
 ('jungkook', 30),
 ('xxxtentacion', 30)

In [52]:
# # 外部拼写纠错工具（平台没有，需要申请安装）
# import enchant

# mispell_mapping = {}
# d = enchant.Dict("en_US")
# min_count = 5

# for item in oov_glove:
#     w = item[0]
#     if item[1] >= min_count:
#         r = d.suggest(w)
#         if r:
#             mispell_mapping[w] = r[0]
#         else:
#             mispell_mapping[w] = ''

# print(mispell_mapping)

In [53]:
# df['treated_question'] = df['treated_question'].progress_apply(lambda x: correct_spelling(x, mispell_mapping))

In [54]:
# vocab = build_vocab(df['treated_question'])
# print("Glove : ")
# oov_glove = check_coverage(vocab, emb_glove)

In [55]:
def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [56]:
df['treated_question'] = df['treated_question'].progress_apply(lambda x: clean_numbers(x))

100%|██████████| 1362492/1362492 [00:10<00:00, 124320.14it/s]


In [57]:
vocab = build_vocab(df['treated_question'])
print("Glove : ")
oov_glove = check_coverage(vocab, emb_glove)

Glove : 
Found embeddings for 68.648% of vocab
Found embeddings for  99.512% of all text
