# Investigate Paragram embeddings
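This notebook measures how much of the training questions' vocabulary is covered by the Paragram (paragram_300_sl999) word embeddings, and how simple preprocessing steps (lower-casing, spacing out punctuation) change that coverage.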

In [1]:
import operator 
from collections import defaultdict


import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
from IPython.display import display
# Never truncate long text columns when displaying frames. None is the
# supported "no limit" value; the old -1 sentinel was deprecated in pandas 1.0.
pd.set_option('display.max_colwidth', None)
from tqdm import tqdm
tqdm.pandas()

In [23]:
def threshold_search(y_true, y_proba, thresholds=None):
    """Find the decision threshold that gives the highest binary F1 score.

    F1 is computed directly with numpy as 2*TP / (2*TP + FP + FN), so the
    function does not depend on a ``metrics`` module being imported by some
    other cell of the notebook.

    :param y_true: binary ground-truth labels (array-like); any non-zero
        value is treated as the positive class
    :param y_proba: predicted scores for the positive class (array-like).
        Both inputs are flattened, so an (n, 1) model output lines up with
        (n,) labels instead of broadcasting against them.
    :param thresholds: optional iterable of cut-offs to try; defaults to
        0.00, 0.01, ..., 0.50
    :return: dict with the best ``'threshold'`` and its ``'f1'``. A sample is
        predicted positive when its score is strictly greater than the
        threshold. If no threshold scores above 0, both values are 0.
    """
    actual = np.asarray(y_true).ravel().astype(bool)
    proba = np.asarray(y_proba).ravel()
    if thresholds is None:
        thresholds = [i * 0.01 for i in range(51)]

    best_threshold = 0
    best_score = 0
    for threshold in thresholds:
        predicted = proba > threshold
        true_pos = np.count_nonzero(predicted & actual)
        false_pos = np.count_nonzero(predicted & ~actual)
        false_neg = np.count_nonzero(~predicted & actual)
        denominator = 2 * true_pos + false_pos + false_neg
        # With no positives in either labels or predictions F1 is undefined;
        # score it as 0 so such a threshold can never be selected.
        score = 2 * true_pos / denominator if denominator else 0.0
        # Strict comparison: on ties the earliest threshold is kept.
        if score > best_score:
            best_threshold = threshold
            best_score = score
    return {'threshold': best_threshold, 'f1': best_score}

## Import data

In [2]:
# Read the training split and report its (rows, columns) shape.
# Loading of the test split is left commented out.
train = pd.read_csv('../../data/train.csv')
#test = pd.read_csv('../../data/test.csv')
print(train.shape)
#print(test.shape)
# Replace missing values with a placeholder token so later string
# operations on the text column never see NaN.
train = train.fillna('_na_')

(1306122, 3)


In [5]:
# Path to the Paragram (paragram_300_sl999) embedding text file.
EMBEDDING_FILE = '../../data/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
def get_coefs(word,*arr):
    """Pair a token with its embedding vector.

    :param word: the token (first field of an embedding-file line)
    :param arr: the remaining fields, converted to a float32 numpy array
    :return: tuple ``(word, vector)``
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into a {token: float32 vector} dict. The context
# manager closes the file handle as soon as the dict is built, instead of
# leaving an open handle for the garbage collector.
# NOTE(review): lines of 100 characters or fewer are skipped -- presumably to
# drop a header or truncated rows; confirm against the file.
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.split(" "))
        for line in embedding_file
        if len(line) > 100
    )


In [6]:
# Embedding dimensionality, read off the vector stored for the token 'word'
# (raises KeyError if that token is not in the index).
EMBEDDING_DIM = len(embeddings_index['word'])

# Preprocessing

In [7]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often each word occurs across all sentences.

    :param sentences: list of list of words
    :param verbose: show a tqdm progress bar while counting when True
    :return: dictionary of words and their count
    """
    counts = {}
    for tokens in tqdm(sentences, disable = (not verbose)):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [8]:
def check_coverage(vocab,embeddings_index):
    """
    Report how much of a vocabulary has an embedding and list what does not.

    Prints two percentages: the share of distinct words found in
    ``embeddings_index`` and the share of all word occurrences they account
    for. Both are reported as 0% when the vocabulary is empty.

    :param vocab: dictionary of words and their count
    :param embeddings_index: mapping from word to embedding vector; a missing
        word must raise KeyError on lookup
    :return: list of ``(word, count)`` pairs for out-of-vocabulary words,
        sorted by count, highest first
    """
    covered_vocab_size = 0
    covered_word_count = 0
    oov = {}
    oov_word_count = 0
    for word in tqdm(vocab):
        try:
            _ = embeddings_index[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # is a real problem and is allowed to propagate.
            oov[word] = vocab[word]
            oov_word_count += vocab[word]
        else:
            covered_vocab_size += 1
            covered_word_count += vocab[word]

    # Guard both ratios so an empty vocabulary does not divide by zero.
    total_word_count = covered_word_count + oov_word_count
    vocab_ratio = covered_vocab_size / len(vocab) if len(vocab) else 0.0
    text_ratio = covered_word_count / total_word_count if total_word_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))
    # Ascending stable sort followed by a reversal, which keeps the ordering
    # of equal counts the same as in earlier runs of this notebook.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [9]:
# Whitespace-tokenise the questions as they are and measure embedding coverage.
sentences = [question.split() for question in train["question_text"]]
vocab = build_vocab(sentences)

oov = check_coverage(vocab, embeddings_index)

100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:04<00:00, 324464.57it/s]
100%|██████████████████████████████████████████████████████████████████████| 508823/508823 [00:00<00:00, 939993.88it/s]


Found embeddings for 19.59% of vocab
Found embeddings for  72.21% of all text


In [10]:
# Show the 20 most frequent tokens without an embedding
# (check_coverage returns them sorted by count, highest first).
oov[:20]

[('What', 417802),
 ('I', 306261),
 ('How', 261930),
 ('Why', 142291),
 ('Is', 108887),
 ('Can', 52698),
 ('Which', 47303),
 ('Do', 40041),
 ('If', 34557),
 ('Are', 29201),
 ('Does', 23180),
 ('Who', 21981),
 ('Where', 19146),
 ('Should', 16591),
 ('India?', 16384),
 ('Will', 14669),
 ('When', 14483),
 ('India', 13685),
 ('it?', 12900),
 ('Indian', 12895)]

In [11]:
# Lower-case every question in place, then re-tokenise and re-measure coverage.
train["question_text"] = train["question_text"].progress_apply(str.lower)

sentences = [question.split() for question in train["question_text"]]
vocab = build_vocab(sentences)

oov = check_coverage(vocab, embeddings_index)

100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:01<00:00, 956526.27it/s]
100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:03<00:00, 341519.76it/s]
100%|██████████████████████████████████████████████████████████████████████| 450473/450473 [00:00<00:00, 971352.11it/s]


Found embeddings for 31.41% of vocab
Found embeddings for  88.22% of all text


In [12]:
# Show the 20 most frequent tokens still without an embedding after lower-casing.
oov[:20]

[('india?', 16394),
 ("what's", 13369),
 ('it?', 13158),
 ('do?', 8766),
 ('life?', 7791),
 ('why?', 7369),
 ('you?', 6314),
 ('me?', 6241),
 ('them?', 6141),
 ('time?', 5742),
 ('world?', 5525),
 ('people?', 5008),
 ('quora?', 4657),
 ('like?', 4490),
 ('for?', 4450),
 ('work?', 4219),
 ('2017?', 4050),
 ('mean?', 3980),
 ('2018?', 3594),
 ("isn't", 3509)]

In [14]:
# for symbol in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~/-&“”’':
#    print(symbol, symbol in embeddings_index)

In [15]:
# Characters that space_punct surrounds with spaces. Each character is listed
# once: the previous literal repeated '/' and '-', which padded them twice.
_PUNCT_TO_SPACE = '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~&“”’'
# Translation table built once so each call is a single pass over the text.
_PUNCT_TABLE = str.maketrans({ch: f' {ch} ' for ch in _PUNCT_TO_SPACE})


def space_punct(x):
    """
    Surround every punctuation character with single spaces.

    This lets a later whitespace ``split()`` yield punctuation marks as
    separate tokens. The tokens produced by ``split()`` are the same as with
    the earlier replace-in-a-loop version; only the doubled padding around
    '/' and '-' is gone.

    :param x: value to process; converted with ``str()`` first
    :return: the text with each listed punctuation character padded by spaces
    """
    return str(x).translate(_PUNCT_TABLE)

In [16]:
# Pad punctuation with spaces in place, then re-tokenise and re-measure coverage.
train["question_text"] = train["question_text"].apply(space_punct)
sentences = train["question_text"].apply(str.split)
vocab = build_vocab(sentences)

oov = check_coverage(vocab, embeddings_index)

100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:03<00:00, 341541.16it/s]
100%|██████████████████████████████████████████████████████████████████████| 199047/199047 [00:00<00:00, 936839.55it/s]


Found embeddings for 73.15% of vocab
Found embeddings for  99.62% of all text


In [19]:
# Show the 20 most frequent tokens still without an embedding after punctuation is spaced out.
oov[:20]

[('quorans', 858),
 ('brexit', 524),
 ('cryptocurrencies', 499),
 ('redmi', 383),
 ('coinbase', 149),
 ('oneplus', 139),
 ('uceed', 123),
 ('demonetisation', 115),
 ('bhakts', 115),
 ('upwork', 111),
 ('machedo', 108),
 ('gdpr', 107),
 ('adityanath', 106),
 ('boruto', 102),
 ('bnbr', 100),
 ('alshamsi', 92),
 ('dceu', 90),
 ('litecoin', 87),
 ('iiest', 86),
 ('unacademy', 86)]