In [6]:
import pandas as pd
import numpy as np
import operator
import re
from tqdm import tqdm

tqdm.pandas()

# Pre-Processing Notebook: Using Word Embeddings

## Loading Data

In [7]:
train = pd.read_csv('train.csv').drop('target', axis=1) # Drop the 'Target' Feature/Column, so dimensions match when concatenating
test = pd.read_csv('test.csv')

df = pd.concat([train ,test])

print('Training set size: ', train.shape[0])
print('Testing set size: ', test.shape[0])
print('The total number of texts is: ', df.shape[0])

print('Proportion of Training set to Total Set: ', f"{(train.shape[0])/(train.shape[0]+test.shape[0])*100} %")

Training set size:  1306122
Testing set size:  56370
The total number of texts is:  1362492
Proportion of Training set to Total Set:  95.86272800133872 %


In [8]:
train.head()

Unnamed: 0,qid,question_text
0,00002165364db923c7e6,How did Quebec nationalists see their province...
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco..."
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...


## Creating Vocabulary
i.e. Python Dictionary containing the occurrences of words in our training set

In [9]:
def create_vocabulary(sentences):
    vocab = {}
    for sentence in tqdm(sentences):
        for word in sentence:
            try:
                vocab[word] += 1
            except:
                vocab[word] = 1
    return vocab

In [10]:
questions = train['question_text'].progress_apply(lambda x: x.split()).values
vocab = create_vocabulary(questions)

100%|██████████| 1306122/1306122 [00:06<00:00, 215820.98it/s]
100%|██████████| 1306122/1306122 [00:04<00:00, 310036.62it/s]


## Let's verify that it worked by checking the count of the first 5 elements.

In [11]:
print({k: vocab[k] for k in list(vocab)[:5]})

{'How': 261930, 'did': 33489, 'Quebec': 97, 'nationalists': 91, 'see': 9003}


## Function for importing embeddings

In [12]:
def load_embed(file):
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    
    if file == '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec':
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file) if len(o)>100)
    else:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
        
    return embeddings_index

In [13]:
from gensim.models import KeyedVectors

In [14]:
glove_path = 'embeddings/glove.840B.300d/glove.840B.300d.txt'
google_news_path ='embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin' 
wiki_news_path = 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
paragram_path = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'

In [15]:
embed_google = KeyedVectors.load_word2vec_format(google_news_path, binary=True)

In [16]:
print('Embed glove loaded.')
embed_glove = load_embed(glove_path)
#embed_google = load_embed(google_news_path)
print('Embed wiki loaded.')
embed_wiki = load_embed(wiki_news_path)
print('Embed paragram loaded.')
embed_paragram = load_embed(paragram_path)

Embed glove loaded.
Embed wiki loaded.
Embed paragram loaded.


In [17]:
def check_coverage(embedding, vocab):
    known_words = {}
    unknown_words = {}
    number_known_words = 0
    number_unknown_words = 0
    if embedding == embed_google:
        for word in tqdm(vocab):
            try:
                known_words[word] = embedding[word]
                number_known_words += vocab[word]
            except:
                unknown_words[word] = vocab[word]
                number_unknown_words += vocab[word]
        print(f"Percentage of embeddings for Vocab is: {(len(known_words)/len(vocab))*100}%")
        print(f"Percentage of embeddings for Text is: {(number_known_words/(number_known_words + number_unknown_words))*100}%")
    else:
        for word in tqdm(vocab.keys()):
            try:
                known_words[word] = embedding[word]
                number_known_words += vocab[word]
            except:
                unknown_words[word] = vocab[word]
                number_unknown_words += vocab[word]
        print(f"Percentage of embeddings for Vocab is: {(len(known_words)/len(vocab))*100}%")
        print(f"Percentage of embeddings for Text is: {(number_known_words/(number_known_words + number_unknown_words))*100}%")

    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    
    return unknown_words

In [18]:
print('Checking coverage for Google.')
unknown_words_google = check_coverage(embed_google, vocab)
print('Checking coverage for Glove.')
unknown_words_glove = check_coverage(embed_glove, vocab)
print('Checking coverage for Wiki.')
unknown_words_wiki = check_coverage(embed_wiki, vocab)
print('Checking coverage for Paragram.')
unknown_words_para = check_coverage(embed_paragram, vocab)

  0%|          | 114/508823 [00:00<07:30, 1130.12it/s]

Checking coverage for Google.


100%|██████████| 508823/508823 [00:29<00:00, 17234.34it/s]
  0%|          | 162/508823 [00:00<05:16, 1607.88it/s]

Percentage of embeddings for Vocab is: 24.308453037696804%
Percentage of embeddings for Text is: 78.74644592896665%
Checking coverage for Glove.


100%|██████████| 508823/508823 [00:21<00:00, 23290.55it/s]
  0%|          | 0/508823 [00:00<?, ?it/s]

Percentage of embeddings for Vocab is: 33.0242540136747%
Percentage of embeddings for Text is: 88.14782041294316%
Checking coverage for Wiki.


100%|██████████| 508823/508823 [00:08<00:00, 58627.95it/s] 
  0%|          | 261/508823 [00:00<03:16, 2594.69it/s]

Percentage of embeddings for Vocab is: 29.863233383711034%
Percentage of embeddings for Text is: 87.64399563812303%
Checking coverage for Paragram.


100%|██████████| 508823/508823 [00:12<00:00, 40295.63it/s]


Percentage of embeddings for Vocab is: 19.54097986922761%
Percentage of embeddings for Text is: 72.20571143729778%


### The best results are obtained using the Glove embedding, however notice that approximately only 30% of our Vocab has an embedding. In other words, a bit over 10% of our data (100% - 88.147%) is useless. Let's see if we can fix that by investigating the 'unknown_words'.

In [19]:
unknown_words_glove[:20]

[('India?', 16384),
 ('it?', 12900),
 ("What's", 12425),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202),
 ('them?', 6140),
 ('time?', 5716),
 ('world?', 5386),
 ('people?', 4971),
 ('why?', 4943),
 ('Quora?', 4655),
 ('like?', 4487),
 ('for?', 4450),
 ('work?', 4206),
 ('2017?', 4050),
 ('mean?', 3971),
 ('2018?', 3594),
 ('country?', 3422)]

### It seems like the punctuation marks are posing a problem for our embedding model. Let's take care of that.
### Notice that we can include punctuation marks, since they are handled by Glove embedding:

In [20]:
'?' in embed_glove

True

In [21]:
'!' in embed_glove

True

In [22]:
'#' in embed_glove

True

In [23]:
'@' in embed_glove

True

In [24]:
'$' in embed_glove

True

In [25]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

def check_punctuation(embedding, punct):
    unknown_punct = ''
    known_punct = ''
    for p in punct:
        if p in embedding:
            known_punct += f'{p} '
        else:
            unknown_punct += f'{p} '
    return known_punct, unknown_punct

In [26]:
known_punct, unknown_punct = check_punctuation(embed_glove, punct)

In [27]:
print(f'Known symbols: {known_punct}')
print(f'Unknown symbols: {unknown_punct}')

Known symbols: / - ' ? ! . , # $ % ' ( ) * + - / : ; < = > @ [ \ ] ^ _ ` { | } ~ " " \ & 
Unknown symbols: “ ” ’ ∞ θ ÷ α • à − β ∅ ³ π ‘ ₹ ´ ° £ € × ™ √ ² — – 


### Notice that several punctuation marks have 'semantic' synonyms that we could use in order to avoid losing valuable information in our representation. For example, € is typically used to denote a monetary amount, so we could replace occurences of this symbol with \\$

In [28]:
symbol_mapping = {"‘": "'", "₹": "$", "´": "'", "°": "degrees", "€": "$", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "$", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': 'empty', '³': '3', 'π': 'pi', }

def handle_punctuation(text, symbol_mapping, punct):
    
    for symbol in symbol_mapping:
        text = text.replace(symbol, symbol_mapping[symbol])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    return text

In [29]:
train["question_text"] = train["question_text"].progress_apply(lambda x: handle_punctuation(x, symbol_mapping, punct))

100%|██████████| 1306122/1306122 [00:28<00:00, 46123.93it/s]


In [30]:
train.head()

Unnamed: 0,qid,question_text
0,00002165364db923c7e6,How did Quebec nationalists see their province...
1,000032939017120e6e44,"Do you have an adopted dog , how would you en..."
2,0000412ca6e4628ce2cf,Why does velocity affect time ? Does velocity...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...


In [31]:
questions = train["question_text"].progress_apply(lambda x: x.split()).values
vocab = create_vocabulary(questions)

100%|██████████| 1306122/1306122 [00:21<00:00, 60590.83it/s] 
100%|██████████| 1306122/1306122 [00:04<00:00, 303912.29it/s]


In [32]:
unknown_words_glove = check_coverage(embed_glove, vocab)

100%|██████████| 240550/240550 [00:04<00:00, 52967.59it/s]


Percentage of embeddings for Vocab is: 74.15547703180212%
Percentage of embeddings for Text is: 99.56469519430303%


### Major improvements! From 33% of embeddings for our Vocab to 74%, and from 88% of embeddings for all of our Text to 99.5%.

In [33]:
len(unknown_words_glove)

62169

In [34]:
unknown_words_glove[:100]

[('Quorans', 856),
 ('Brexit', 493),
 ('cryptocurrencies', 481),
 ('Redmi', 379),
 ('…', 267),
 ('OnePlus', 125),
 ('UCEED', 123),
 ('Blockchain', 112),
 ('GDPR', 107),
 ('demonetisation', 106),
 ('Pokémon', 106),
 ('Coinbase', 105),
 ('BNBR', 99),
 ('Adityanath', 99),
 ('Machedo', 99),
 ('Boruto', 93),
 ('ethereum', 89),
 ('DCEU', 89),
 ('IIEST', 85),
 ('Qoura', 79),
 ('SJWs', 79),
 ('fiancé', 70),
 ('Upwork', 70),
 ('LNMIIT', 67),
 ('Zerodha', 65),
 ('Kavalireddi', 65),
 ('etc…', 63),
 ('bhakts', 63),
 ('Doklam', 62),
 ('NICMAR', 59),
 ('Vajiram', 59),
 ('Unacademy', 58),
 ('MUOET', 56),
 ('chsl', 55),
 ('HackerRank', 52),
 ('AlShamsi', 52),
 ('Bhakts', 51),
 ('Litecoin', 48),
 ('Awdhesh', 48),
 ('Cryptocurrency', 47),
 ('Jiren', 47),
 ('eLitmus', 47),
 ('altcoin', 45),
 ('altcoins', 45),
 ('Ryzen', 45),
 ('coinbase', 44),
 ('Baahubali', 44),
 ('SRMJEE', 43),
 ('Beerus', 41),
 ('Skripal', 40),
 ('SGSITS', 40),
 ('bahubali', 38),
 ('Binance', 37),
 ('Zebpay', 37),
 ('BMSCE', 37),
 ('W

In [35]:
'Pokemon' in embed_glove, 'Hackerrank' in embed_glove

(True, False)

## Handle accentuations (Pokémon), acronyms (GDPR), mispellings, capital letters

In [36]:
def add_upper_lower(embed, vocab):
    count_lower = 0
    count_upper = 0
    for word in vocab:
        if word in embed and word.lower() not in embed:
            embed[word.lower()] = embed[word]
            count_lower += 1
        elif word.lower() in embed and word not in embed:
            embed[word] = embed[word.lower()]
            count_upper += 1
    print(f'Added {count_lower} lowercase words to the embedding.')
    print(f'Added {count_upper} uppercase words to the embedding.')

In [37]:
add_upper_lower(embed_glove, vocab)

Added 17625 lowercase words to the embedding.
Added 2334 uppercase words to the embedding.


In [38]:
unknown_words_glove = check_coverage(embed_glove, vocab)

100%|██████████| 240550/240550 [00:00<00:00, 673183.05it/s]

Percentage of embeddings for Vocab is: 75.83205154853461%
Percentage of embeddings for Text is: 99.59876270684337%





In [44]:
unknown_words_glove[:10]

[('Quorans', 856),
 ('Brexit', 493),
 ('cryptocurrencies', 481),
 ('Redmi', 379),
 ('…', 267),
 ('OnePlus', 125),
 ('UCEED', 123),
 ('GDPR', 107),
 ('demonetisation', 106),
 ('Pokémon', 106)]

In [40]:
synonym_mapping = {'cryptocurrencies': 'cryptocurrency', 'qoura': 'quora', 'Pokémon': 'pokemon',  }

In [41]:
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

In [42]:
'demonetization' in embed_glove

True

In [None]:
def prec():
    ## fill up the missing values
    train_X = train_df["question_text"].fillna("_##_").values
    test_X = test_df["question_text"].fillna("_##_").values

    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values
    
    #shuffling the data
    np.random.seed(2018)
    trn_idx = np.random.permutation(len(train_X))

    train_X = train_X[trn_idx]
    train_y = train_y[trn_idx]
    
    return train_X, test_X, train_y, tokenizer.word_index

In [3]:
# Scipy Stacks
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split # To split our training set into training/validation sets.
from sklearn import metrics

# We are performing a sequential neural network.
from keras.models import Sequential

# Used to process text data.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# What layers will be involved with our neural network.
from keras.layers import Dense, LSTM, Embedding, Dropout, Activation, GRU, LSTM, Conv1D, MaxPooling1D, Bidirectional, GlobalMaxPool1D

Using TensorFlow backend.
