In [1]:
import torch
import pandas as pd
from tqdm import tqdm
import os
tqdm.pandas()
import sys

Based on the Kaggle notebook "How to: Preprocessing when using embeddings" by christofhenkel: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings/notebook

In [2]:
# Working directory of the running kernel; the data paths below are built relative to it.
base = os.getcwd()

In [3]:
# Path of the GloVe text vectors: <base>/Base/glove.840B.300d/glove.840B.300d.txt
embedding = os.path.join(base,'Base','glove.840B.300d','glove.840B.300d.txt')

In [4]:
# Load the training CSV from <base>/Base/train.csv into a DataFrame.
trainfile = os.path.join(base,'Base','train.csv')
train = pd.read_csv(trainfile)

In [6]:
train.shape

(1306122, 3)

In [5]:
def build_vocab(sentences, verbose=True):
    """
    Tally how many times each word occurs across all sentences.

    :param sentences: iterable of tokenised sentences (each an iterable of words)
    :param verbose: when False, the tqdm progress bar is disabled
    :return: dictionary mapping each word to its occurrence count
    """
    counts = {}
    progress = tqdm(sentences, disable=(not verbose))
    for tokens in progress:
        for token in tokens:
            # First sighting starts from 0, so every word ends up with its total.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [31]:
# Tokenise every question on whitespace (with a tqdm progress bar); .values
# turns the resulting Series of token lists into a NumPy array.
sentences = train["question_text"].progress_apply(lambda x: x.split()).values

100%|██████████| 1306122/1306122 [00:02<00:00, 470579.53it/s]


In [32]:
# Word -> occurrence count over the whitespace-tokenised questions.
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:02<00:00, 461248.14it/s]


In [42]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec



In [43]:
# Convert the GloVe text file to word2vec text format; the result is written
# to "glove.txt" (a relative path, so it lands in the working directory).
glove2word2vec(embedding,"glove.txt")

(2196017, 300)

In [44]:
# Load the converted vectors as gensim KeyedVectors (text file, hence binary=False).
glove_embed = KeyedVectors.load_word2vec_format("glove.txt",binary=False)

In [38]:
import operator 

def check_coverage(vocab,embeddings_index):
    """
    Report how much of a vocabulary is covered by an embedding lookup.

    Prints the share of distinct words and the share of all word occurrences
    for which ``embeddings_index`` has an entry.

    :param vocab: dictionary mapping each word to its occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]`` and
        raising KeyError for unknown words (e.g. a dict or gensim KeyedVectors)
    :return: list of (word, count) pairs for the words without an embedding,
        sorted by count in descending order
    """
    oov = {}
    covered_words = 0   # distinct vocab entries that have an embedding
    covered_count = 0   # word occurrences that have an embedding
    missing_count = 0   # word occurrences without an embedding
    for word in tqdm(vocab):
        try:
            embeddings_index[word]  # lookup only; KeyError means "no vector"
        except KeyError:
            # Only a missing key counts as out-of-vocabulary; any other error
            # (e.g. a broken embedding object) is allowed to surface.
            oov[word] = vocab[word]
            missing_count += vocab[word]
        else:
            covered_words += 1
            covered_count += vocab[word]

    # Guard both ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    total_count = covered_count + missing_count
    vocab_ratio = covered_words / len(vocab) if len(vocab) else 0.0
    text_ratio = covered_count / total_count if total_count else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))

    # Ascending stable sort, then reversed: highest counts first, and words
    # with equal counts appear in reverse insertion order.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [65]:
# Measure how much of the vocabulary GloVe covers; `oov` receives the
# (word, count) pairs that have no embedding, highest counts first.
oov = check_coverage(vocab,glove_embed)

100%|██████████| 508823/508823 [00:00<00:00, 567489.65it/s]


Found embeddings for 33.16% of vocab
Found embeddings for  88.16% of all text


In [53]:
def split_qmark(x):
    """
    Insert a space in front of every '?' that is not the first character.

    A word glued to a question mark (e.g. 'Why?') is not found in the
    embedding vocabulary, so the mark is separated to make it a token of its
    own once the text is split on whitespace.

    The work is done with a single ``str.replace`` rather than recursion, so
    long inputs cannot hit Python's recursion limit.

    :param x: text to process; non-string values are converted with ``str``
    :return: the text with ' ?' in place of each '?' after the first character
        (a '?' that already follows a space still receives one more space)
    """
    x = str(x)
    if not x:
        return x
    # The first character is kept as is: there is nothing in front of it to
    # separate a leading '?' from.
    return x[0] + x[1:].replace('?', ' ?')
        

In [52]:
# Put a space in front of the question marks in every question (with a tqdm
# progress bar). The column is overwritten in place, so running this cell
# again inserts additional spaces.
train["question_text"] = train["question_text"].progress_apply(split_qmark)



  0%|          | 0/1306122 [00:00<?, ?it/s]

  2%|▏         | 31359/1306122 [00:00<00:04, 310818.04it/s]

  6%|▌         | 72048/1306122 [00:00<00:03, 333932.90it/s]

  9%|▊         | 111355/1306122 [00:00<00:03, 347624.91it/s]

 12%|█▏        | 152635/1306122 [00:00<00:03, 364327.96it/s]

 15%|█▍        | 194965/1306122 [00:00<00:02, 379437.34it/s]

 18%|█▊        | 236873/1306122 [00:00<00:02, 389722.56it/s]

 21%|██▏       | 278794/1306122 [00:00<00:02, 397569.50it/s]

 25%|██▍       | 320473/1306122 [00:00<00:02, 402290.90it/s]

 28%|██▊       | 363404/1306122 [00:00<00:02, 409515.22it/s]

 31%|███       | 403317/1306122 [00:01<00:02, 399893.77it/s]

 34%|███▍      | 445075/1306122 [00:01<00:02, 404980.01it/s]

 37%|███▋      | 488460/1306122 [00:01<00:01, 413084.82it/s]

 41%|████      | 531727/1306122 [00:01<00:01, 418312.75it/s]

 44%|████▍     | 574240/1306122 [00:01<00:01, 419128.91it/s]

 47%|████▋     | 616027/1306122 [00:01<00:01, 407824.19it/s]

 50%|█████     | 658422/1

In [37]:
# Tokenise the question text on whitespace again (by cell order, this follows
# the question-mark split applied to the column).
sentences = train["question_text"].apply(lambda x: x.split())

In [39]:
# Recount word occurrences from the current `sentences`.
vocab = build_vocab(sentences)



  0%|          | 0/1306122 [00:00<?, ?it/s]

  3%|▎         | 38456/1306122 [00:00<00:03, 381027.44it/s]

  7%|▋         | 87159/1306122 [00:00<00:02, 406898.68it/s]

 10%|█         | 134821/1306122 [00:00<00:02, 424743.39it/s]

 14%|█▍        | 183094/1306122 [00:00<00:02, 439740.11it/s]

 18%|█▊        | 231298/1306122 [00:00<00:02, 451237.42it/s]

 21%|██▏       | 279416/1306122 [00:00<00:02, 458820.56it/s]

 25%|██▍       | 325428/1306122 [00:00<00:02, 458207.89it/s]

 29%|██▊       | 373033/1306122 [00:00<00:02, 462431.43it/s]

 32%|███▏      | 420702/1306122 [00:00<00:01, 465573.05it/s]

 36%|███▌      | 468954/1306122 [00:01<00:01, 470102.06it/s]

 40%|███▉      | 516693/1306122 [00:01<00:01, 471850.44it/s]

 43%|████▎     | 564085/1306122 [00:01<00:01, 472057.20it/s]

 47%|████▋     | 611456/1306122 [00:01<00:01, 471519.45it/s]

 50%|█████     | 658644/1306122 [00:01<00:01, 470597.58it/s]

 54%|█████▍    | 706699/1306122 [00:01<00:01, 472511.14it/s]

 58%|█████▊    | 754530/1

In [45]:
# Measure GloVe coverage for the current `vocab`; `oov` receives the
# (word, count) pairs that have no embedding, highest counts first.
oov = check_coverage(vocab,glove_embed)



  0%|          | 0/435932 [00:00<?, ?it/s]

 12%|█▏        | 52470/435932 [00:00<00:00, 520342.29it/s]

 24%|██▍       | 104993/435932 [00:00<00:00, 520620.16it/s]

 36%|███▌      | 157151/435932 [00:00<00:00, 520280.54it/s]

 48%|████▊     | 210455/435932 [00:00<00:00, 523092.71it/s]

 61%|██████    | 265072/435932 [00:00<00:00, 529156.97it/s]

 73%|███████▎  | 319648/435932 [00:00<00:00, 533779.56it/s]

 86%|████████▌ | 372839/435932 [00:00<00:00, 532201.66it/s]

 98%|█████████▊| 427300/435932 [00:00<00:00, 535223.06it/s]

100%|██████████| 435932/435932 [00:00<00:00, 530852.23it/s]

Found embeddings for 43.05% of vocab
Found embeddings for  96.33% of all text


In [49]:
oov[:10]

[("What's", 12427),
 ('so,', 2562),
 ('I’m', 2506),
 ("you've", 2418),
 ("isn't", 2335),
 ('"The', 2239),
 ('don’t', 2127),
 ("aren't", 1798),
 ('What’s', 1688),
 ("won't", 1648)]

In [178]:
split_qmark('it?')

'it ?'

In [181]:
'Why?' in glove_embed

False

In [184]:
train["question_text"] [2]

'Why does velocity affect time ? Does velocity affect space geometry?'

In [2]:
text = 'what? is? this?'
len(text)

15

In [188]:
listx=[]
for idx,x in enumerate(text):
    if x== '?':
        print(idx)
    

4
8
14


In [32]:
def split_qmark(x):
    """
    Insert a space in front of every '?' that is not the first character.

    This makes each question mark a token of its own once the text is split
    on whitespace. The input is converted with ``str`` before any string
    method is used, so non-string values (numbers, NaN) do not raise
    AttributeError. A single ``str.replace`` is used rather than recursion, so
    long inputs cannot hit Python's recursion limit.

    :param x: text to process; non-string values are converted with ``str``
    :return: the text with ' ?' in place of each '?' after the first character
        (a '?' that already follows a space still receives one more space)
    """
    # NOTE(review): split_qmark is also defined in an earlier cell; whichever
    # definition is executed last is the one in effect.
    x = str(x)
    if not x:
        return x
    # The first character is kept as is: there is nothing in front of it to
    # separate a leading '?' from.
    return x[0] + x[1:].replace('?', ' ?')

In [48]:
text2 = '???????!?'
split_qmark(text2)


'? ? ? ? ? ? ?! ?'

In [16]:
split_qmark(train["question_text"][2])

'Why does velocity affect time ? Does velocity affect space geometry ?'

In [17]:
split_qmark(text)

'what ? is ? this ?'

In [194]:
text2 = text[:4] + " " + text[4:]

In [13]:
da = 'asd???asd??????????'

In [14]:
da.find('?')

3

In [203]:
text[5:]

' is? this?'

In [204]:
text[5]

' '

In [255]:
split_qmark(text)

what?
  is?
  this?


'what  is  this '

In [207]:
text[1:].find('?')

3

In [221]:
text.find('asdada')

-1

In [223]:
'?' in text

True

ValueError: empty separator