In [21]:
import torch
import pandas as pd
from tqdm import tqdm
import os
tqdm.pandas()
import sys
import re

Based on https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings/notebook

In [2]:
base = os.getcwd()

In [3]:
embedding = os.path.join(base,'Base','glove.840B.300d','glove.840B.300d.txt')

In [4]:
trainfile = os.path.join(base,'Base','train.csv')
train = pd.read_csv(trainfile)

In [5]:
train.shape

(1306122, 3)

In [6]:
def build_vocab(sentences, verbose=True):
    """
    Count how often each word occurs across all tokenised sentences.

    :param sentences: iterable of sentences, each an iterable of words
    :param verbose: when False the tqdm progress bar is disabled
    :return: dictionary mapping each word to its number of occurrences
    """
    counts = {}
    for tokens in tqdm(sentences, disable=(not verbose)):
        for token in tokens:
            # First sighting starts at 0, so every word ends up with its total.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [31]:
sentences = train["question_text"].progress_apply(lambda x: x.split()).values

100%|██████████| 1306122/1306122 [00:02<00:00, 470579.53it/s]


In [32]:
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:02<00:00, 461248.14it/s]


In [8]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec



In [9]:
# Convert the GloVe text file at `embedding` to word2vec text format, written to the
# relative path "glove.txt". The conversion is repeated every time this cell runs.
# NOTE(review): newer gensim releases appear to deprecate this script in favour of
# KeyedVectors.load_word2vec_format(..., no_header=True) - confirm against the installed version.
glove2word2vec(embedding,"glove.txt")

(2196017, 300)

In [10]:
glove_embed = KeyedVectors.load_word2vec_format("glove.txt",binary=False)

In [11]:
import operator 

def check_coverage(vocab,embeddings_index):
    """
    Report how much of a vocabulary is covered by an embedding index.

    Prints the share of distinct words and the share of all word occurrences
    for which ``embeddings_index`` has an entry.

    :param vocab: dictionary mapping each word to its count in the corpus
    :param embeddings_index: object supporting ``embeddings_index[word]`` that
        raises ``KeyError`` for unknown words (e.g. a dict or gensim KeyedVectors)
    :return: list of ``(word, count)`` pairs for the out-of-vocabulary words,
        sorted by count, most frequent first
    """
    oov = {}
    known_words = 0  # distinct vocab words that have an embedding
    known_count = 0  # word occurrences covered by those embeddings
    oov_count = 0    # word occurrences without an embedding
    for word in tqdm(vocab):
        try:
            embeddings_index[word]  # lookup only; raises KeyError when missing
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # (including KeyboardInterrupt) now propagates instead of being
            # silently counted as a miss.
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            known_words += 1
            known_count += vocab[word]  # vocab is a dictionary of word counts

    # Guard the ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    total_count = known_count + oov_count
    vocab_share = known_words / len(vocab) if vocab else 0.0
    text_share = known_count / total_count if total_count else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [65]:
oov = check_coverage(vocab,glove_embed)

100%|██████████| 508823/508823 [00:00<00:00, 567489.65it/s]


Found embeddings for 33.16% of vocab
Found embeddings for  88.16% of all text


In [13]:
def split_qmark(x):
    """
    Insert a space before every '?' except one at the very start of the text.

    A word glued to a question mark ('it?') has no embedding, so the mark is
    split off into its own whitespace-separated token ('it ?').

    A '?' in first position is left alone, so '?hello' stays '?hello'; that
    case is not expected for this task. A '?' that already follows a space
    still gets another space inserted, which is harmless for ``str.split()``.

    :param x: text to process; converted with ``str(x)`` first
    :return: the text with ' ' inserted before each non-leading '?'
    """
    text = str(x)
    # A single pass over the tail replaces the former per-character recursion,
    # which raised RecursionError on long inputs and was quadratic in length.
    return text[:1] + text[1:].replace('?', ' ?')
        

In [25]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Replacement table for replace_typical_misspell: each key is searched for as a
# case-sensitive substring and replaced by its value.
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                # NOTE(review): the next two are not spelling fixes; presumably they stand in
                # for brand names missing from the embedding vocabulary - confirm.
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                # NOTE(review): looks like a leftover test entry - it rewrites the name 'Otto'
                # (and any word containing it) to 'bingo'. Remove before cleaning real data.
                'Otto': 'bingo'

                }
# Compile the pattern once; `mispellings` is the same dictionary object as mispell_dict.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """
    Return ``text`` with every known misspelling swapped for its correction.

    :param text: string to clean
    :return: copy of ``text`` in which each match of ``mispellings_re`` is
        replaced by the value stored for it in ``mispellings``
    """
    # Every regex hit is one of the dictionary keys, so it can be looked up directly.
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

In [23]:
train["question_text"] [3]

'How did Otto von Guericke used the Magdeburg hemispheres ?'

In [26]:
replace_typical_misspell(train["question_text"] [3])

'How did bingo von Guericke used the Magdeburg hemispheres ?'

In [29]:
def replace(match):
    """Return the entry of ``mispellings`` stored for the text matched by ``match``."""
    matched_text = match.group(0)
    return mispellings[matched_text]

In [47]:
mispellings.group(0)

AttributeError: 'dict' object has no attribute 'group'

In [40]:
mispellings_re

re.compile(r'(colour|centre|didnt|doesnt|isnt|shouldnt|favourite|travelling|counselling|theatre|cancelled|labour|organisation|wwii|citicise|instagram|whatsapp|Otto)',
re.UNICODE)

In [34]:
mispellings[match.group(0)]

NameError: name 'match' is not defined

In [14]:
# Split question marks off the preceding word in every question.
# NOTE: not idempotent - split_qmark inserts a space before each non-leading '?'
# even when one is already there, so re-running this cell pads the text further.
train["question_text"] = train["question_text"].progress_apply(split_qmark)

100%|██████████| 1306122/1306122 [00:03<00:00, 403437.81it/s]


In [15]:
sentences = train["question_text"].apply(lambda x: x.split())

In [16]:
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:03<00:00, 430209.30it/s]


In [17]:
oov = check_coverage(vocab,glove_embed)

100%|██████████| 435932/435932 [00:00<00:00, 551587.12it/s]


Found embeddings for 43.05% of vocab
Found embeddings for  96.33% of all text


In [18]:
oov[:100]

[("What's", 12427),
 ('so,', 2562),
 ('I’m', 2506),
 ("you've", 2418),
 ("isn't", 2335),
 ('"The', 2239),
 ('don’t', 2127),
 ("aren't", 1798),
 ('What’s', 1688),
 ("won't", 1648),
 ("Trump's", 1566),
 ('me,', 1310),
 ("Isn't", 1232),
 ("they're", 1164),
 ('(or', 1129),
 ("haven't", 1089),
 ('yes,', 1050),
 ('(in', 1040),
 ('can’t', 1015),
 ('(I', 983),
 ('India,', 948),
 ("what's", 943),
 ('win,', 918),
 ("he's", 902),
 ('(not', 891),
 ('?"', 886),
 ('life,', 877),
 ('time,', 865),
 ('it’s', 864),
 ('better,', 862),
 ('people,', 851),
 ('Quorans', 800),
 ('not,', 792),
 ("today's", 787),
 ("someone's", 783),
 ('doesn’t', 761),
 ('(like', 752),
 ('etc.)', 709),
 ("one's", 691),
 ('(and', 683),
 ('them,', 673),
 ("India's", 666),
 ('"I', 665),
 ("hasn't", 664),
 ("shouldn't", 659),
 ("people's", 643),
 ('100%', 636),
 ('years,', 618),
 ("wasn't", 613),
 ("couldn't", 610),
 ("there's", 604),
 ("wouldn't", 601),
 ('Quora,', 596),
 ('I’ve', 581),
 ('world,', 564),
 ('now,', 563),
 ('(for', 

In [178]:
split_qmark('it?')

'it ?'

In [181]:
'Why?' in glove_embed

False

In [19]:
',' in glove_embed

True

In [184]:
train["question_text"] [2]

'Why does velocity affect time ? Does velocity affect space geometry?'

In [103]:
def split_qmark2(x):
    """
    Insert a space before every '?' in the text, including a leading one.

    Variant of ``split_qmark`` that also handles a '?' in first position, so
    '?hey?' becomes ' ?hey ?'. A '?' that already follows a space still gets
    another space inserted, which is harmless for ``str.split()``.

    :param x: text to process; converted with ``str(x)`` first
    :return: the text with ' ' inserted before each '?'
    """
    # One replace call stands in for the earlier recursion through split_qmark:
    # no dependency on the other function and no RecursionError on long input.
    return str(x).replace('?', ' ?')

In [82]:
text = "what? is? this?"

In [83]:
split_qmark(text)

'what ? is ? this ?'

In [99]:
text2= "?????????????" 

In [100]:
split_qmark(text2)

'? ? ? ? ? ? ? ? ? ? ? ? ?'

In [105]:
split_qmark2(text)

'what ? is ? this ?'

In [115]:
text3 = "?hey? what? hello???"

In [116]:
split_qmark(text3)

'?hey ? what ? hello ? ? ?'

In [117]:
split_qmark2(text3)

' ?hey ? what ? hello ? ? ?'