# Text classification for insincere Quora questions
(inspired by: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings)

Step 1 - pre-processing; the point here is not to apply the standard pre-processing steps, but instead to make sure that there is as much overlap as possible between the word embeddings and your vocabulary.

In [0]:
###imports
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [33]:
###mount drive
# Makes Google Drive available under /content/gdrive (requires the google.colab package).
from google.colab import drive
import os
drive.mount('/content/gdrive')

###change directory
# The data-loading cell reads "train.csv" / "test.csv" by relative path, so the
# working directory is switched to the project folder here.
# NOTE(review): the path is hard-coded, and the embeddings cell further down calls
# os.chdir() again with a different folder -- re-run this cell before re-reading the CSVs.
os.chdir('/content/gdrive/My Drive/Colab Notebooks/quora')




Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [34]:
###Data set explore
# Both files are read by relative path, so this cell relies on the working
# directory set by the os.chdir() call in the drive-mount cell.
train = pd.read_csv("train.csv")
# NOTE(review): `test` is loaded here but not used in the cells visible in this section.
test = pd.read_csv("test.csv")
# Display the first ten training rows.
train.iloc[0:10]

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


The function below builds the training vocabulary dictionary: it goes through all the sentences and counts the occurrences of each word they contain.

In [0]:
###build vocab dictionary function
def build_vocab(sentences, verbose=True):
    """
    Count how often each word appears across all tokenised sentences.

    :param sentences: iterable of sentences, each one an iterable of words
    :param verbose: when True a tqdm progress bar is shown; False disables it
    :return: dictionary mapping each word to its number of occurrences
    """
    counts = {}
    progress = tqdm(sentences, disable=not verbose)
    for tokens in progress:
        for token in tokens:
            # First sighting starts from 0, every later one increments.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [37]:
###build vocab
# Tokenise every question on whitespace (with a progress bar), then hand the
# underlying array of token lists to build_vocab.
tokenised_questions = train["question_text"].progress_apply(lambda text: text.split())
sentences = tokenised_questions.values
vocab = build_vocab(sentences)
# Peek at the first 5 (word, count) entries of the dictionary.
print(dict(list(vocab.items())[:5]))

100%|██████████| 1306122/1306122 [00:04<00:00, 296643.56it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 226706.76it/s]


{'How': 261930, 'did': 33489, 'Quebec': 97, 'nationalists': 91, 'see': 9003}


In [27]:
###import google news embeddings
from gensim.models import KeyedVectors
#change directory
# NOTE(review): this moves the working directory away from the quora folder set in
# the drive-mount cell, so re-running the read_csv("train.csv") cell afterwards will
# look in album_reviews instead. Passing a full path to load_word2vec_format would
# avoid the hidden state -- confirm no later cell depends on the new working directory.
os.chdir('/content/gdrive/My Drive/Colab Notebooks/album_reviews')
news_path = 'GoogleNews-vectors-negative300.bin.gz'
# Load the vectors in binary word2vec format; the result is indexed by word
# (embeddings_index[word]) in check_coverage.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
###a function to check the intersection between vocab and embeddings
import operator 

def check_coverage(vocab,embeddings_index):
    """
    Report how much of the text is covered by the embeddings.

    Prints the share of all word occurrences (not distinct words) for which an
    embedding exists, and returns the words that have none.

    :param vocab: dictionary mapping word -> occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]`` that
        raises KeyError for unknown words (e.g. a dict or gensim KeyedVectors)
    :return: list of (word, count) tuples for out-of-vocabulary words, sorted
        by count from most to least frequent
    """
    oov = {}
    covered_count = 0
    missing_count = 0
    for word in tqdm(vocab):
        count = vocab[word]
        try:
            # Only the lookup matters; the vector itself is not kept, which
            # avoids holding a reference to every found embedding.
            _ = embeddings_index[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # (or a keyboard interrupt) should surface instead of being
            # silently counted as a miss.
            oov[word] = count
            missing_count += count
        else:
            covered_count += count

    total = covered_count + missing_count
    # Guard against an empty vocabulary, which would otherwise divide by zero.
    coverage = covered_count / total if total else 0.0
    print('Found embeddings for  {:.2%} of all text'.format(coverage))

    # Ascending stable sort followed by a reversal, so ties keep the same
    # relative order as before.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [39]:
###run coverage check
# Prints the overall coverage and returns the out-of-vocabulary words,
# most frequent first.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 508823/508823 [00:01<00:00, 320485.92it/s]


Found embeddings for  78.75% of all text


In [40]:
# Ten most frequent words without an embedding (raw, uncleaned text).
oov[:10]

[('to', 403183),
 ('a', 402682),
 ('of', 330825),
 ('and', 251973),
 ('India?', 16384),
 ('it?', 12900),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202)]

## Pre-processing steps
1.   Remove punctuation not in embeddings
2.   Change numbers of 2 or more digits to hashes



In [0]:
# Single-pass translation table. Later entries override earlier ones, so the
# characters "/-'" end up mapped to a space (not deleted) and '&' is padded.
_CLEAN_TEXT_TABLE = str.maketrans({
    **{punct: None for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'},
    **{punct: ' ' for punct in "/-'"},
    **{punct: f' {punct} ' for punct in '&'},
})


def clean_text(x):
    """
    Normalise punctuation in a piece of text.

    The input is converted with ``str()``; slashes, hyphens and straight
    apostrophes become spaces, ``&`` is surrounded by spaces, and the
    remaining listed punctuation (including curly quotes) is removed.

    :param x: value to clean (converted to ``str`` first)
    :return: the cleaned string
    """
    return str(x).translate(_CLEAN_TEXT_TABLE)

In [0]:
# Clean the punctuation of every question (the column is overwritten in place),
# then rebuild the vocabulary from the cleaned text.
cleaned_questions = train["question_text"].progress_apply(clean_text)
train["question_text"] = cleaned_questions
sentences = cleaned_questions.apply(str.split)
vocab = build_vocab(sentences)

In [44]:
# Re-check coverage now that punctuation has been cleaned from the questions.
oov = check_coverage(vocab,embeddings_index)


  0%|          | 0/253623 [00:00<?, ?it/s][A
  8%|▊         | 21354/253623 [00:00<00:01, 213537.66it/s][A
 20%|█▉        | 49849/253623 [00:00<00:00, 230897.39it/s][A
 31%|███       | 77590/253623 [00:00<00:00, 243126.47it/s][A
 42%|████▏     | 106482/253623 [00:00<00:00, 255262.98it/s][A
 53%|█████▎    | 134374/253623 [00:00<00:00, 261926.73it/s][A
 65%|██████▍   | 164699/253623 [00:00<00:00, 273089.81it/s][A
 77%|███████▋  | 194760/253623 [00:00<00:00, 280800.42it/s][A
 89%|████████▊ | 224488/253623 [00:00<00:00, 285546.34it/s][A
100%|██████████| 253623/253623 [00:00<00:00, 282949.00it/s][A

Found embeddings for  89.99% of all text


In [45]:
# Ten most frequent words still without an embedding after punctuation cleaning.
oov[:10]

[('to', 406298),
 ('a', 403852),
 ('of', 332964),
 ('and', 254081),
 ('2017', 8781),
 ('2018', 7373),
 ('10', 6642),
 ('12', 3694),
 ('20', 2942),
 ('100', 2883)]

In [0]:
import re

def clean_numbers(x):
    """
    Mask runs of digits with hash characters.

    Runs of five or more digits become ``#####``; runs of four, three and two
    digits become the same number of hashes. Single digits are left as they are.

    :param x: text to process
    :return: text with multi-digit numbers replaced by hashes
    """
    # Order matters: the longest runs must be masked first so that shorter
    # patterns never split a longer number.
    substitutions = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in substitutions:
        x = re.sub(pattern, mask, x)
    return x

In [48]:
# Mask multi-digit numbers in every question (the column is overwritten in place),
# then rebuild the vocabulary from the masked text.
masked_questions = train["question_text"].progress_apply(clean_numbers)
train["question_text"] = masked_questions
sentences = masked_questions.progress_apply(str.split)
vocab = build_vocab(sentences)


  0%|          | 0/1306122 [00:00<?, ?it/s][A
  0%|          | 4588/1306122 [00:00<00:28, 45877.75it/s][A
  1%|          | 12946/1306122 [00:00<00:24, 53057.16it/s][A
  2%|▏         | 21802/1306122 [00:00<00:21, 60310.49it/s][A
  2%|▏         | 30590/1306122 [00:00<00:19, 66575.45it/s][A
  3%|▎         | 39406/1306122 [00:00<00:17, 71852.54it/s][A
  4%|▎         | 48537/1306122 [00:00<00:16, 76756.67it/s][A
  4%|▍         | 56900/1306122 [00:00<00:15, 78694.68it/s][A
  5%|▌         | 65445/1306122 [00:00<00:15, 80605.72it/s][A
  6%|▌         | 74650/1306122 [00:00<00:14, 83728.00it/s][A
  6%|▋         | 83407/1306122 [00:01<00:14, 84843.52it/s][A
  7%|▋         | 92629/1306122 [00:01<00:13, 86928.30it/s][A
  8%|▊         | 101873/1306122 [00:01<00:13, 88510.96it/s][A
  9%|▊         | 111187/1306122 [00:01<00:13, 89848.73it/s][A
  9%|▉         | 120437/1306122 [00:01<00:13, 90627.47it/s][A
 10%|▉         | 129519/1306122 [00:01<00:13, 90281.47it/s][A
 11%|█         | 13

In [49]:
# Re-check coverage after multi-digit numbers have been replaced by hashes.
oov = check_coverage(vocab,embeddings_index)



  0%|          | 0/242997 [00:00<?, ?it/s][A
 11%|█         | 26921/242997 [00:00<00:00, 269203.20it/s][A
 22%|██▏       | 53882/242997 [00:00<00:00, 269324.42it/s][A
 33%|███▎      | 80995/242997 [00:00<00:00, 269862.29it/s][A
 45%|████▌     | 110363/242997 [00:00<00:00, 276590.24it/s][A
 57%|█████▋    | 138248/242997 [00:00<00:00, 277262.42it/s][A
 69%|██████▉   | 167945/242997 [00:00<00:00, 282890.69it/s][A
 81%|████████  | 196441/242997 [00:00<00:00, 283506.99it/s][A
 93%|█████████▎| 225869/242997 [00:00<00:00, 286652.84it/s][A
100%|██████████| 242997/242997 [00:00<00:00, 280729.18it/s][A

Found embeddings for  90.75% of all text
