# Setup Workspace

In [52]:
import re
import nltk
import string

import pandas as pd

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm.auto import tqdm

In [53]:
# Register `progress_apply` on pandas objects so long-running steps show a progress bar.
tqdm.pandas()

# Fetch every NLTK resource this notebook relies on, not only WordNet:
#   "wordnet"             -> WordNetLemmatizer
#   "stopwords"           -> stopwords.words('english')
#   "punkt" / "punkt_tab" -> word_tokenize (which one is required depends on
#                            the installed NLTK release, so both are requested)
# Without them a fresh environment fails later with LookupError.
for resource in ("wordnet", "stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# Stored as a set: the stop-word filter tests `word not in STOP_WORDS` once per
# token, and a set answers that in constant time instead of scanning a list.
STOP_WORDS = set(stopwords.words('english'))
# Single lemmatizer instance shared by every lemmatization call in the notebook.
LEMMATIZER = WordNetLemmatizer()

In [54]:
# Number of tokens per n-gram (3 = trigrams); passed to get_ngrams below.
NGRAMS_LENGTH = 3

In [55]:
df = None
%store -r df

# Data Engineering

### Punctuation

In [56]:
# Delete every character listed in string.punctuation from the raw questions.
# The deletion table is built once and shared by both columns.
punctuation_remover = str.maketrans('', '', string.punctuation)

for raw_column, clean_column in (
    ('question1', 'q1_preprocessed'),
    ('question2', 'q2_preprocessed'),
):
    df[clean_column] = df[raw_column].str.translate(punctuation_remover)

### Numbers

In [57]:
# Pattern matching one or more consecutive ASCII digits, compiled once.
digit_runs = re.compile(r'[0-9]+')


def strip_digits(sentence):
    """Return `sentence` with every run of digits removed."""
    return digit_runs.sub('', sentence)


for column in ('q1_preprocessed', 'q2_preprocessed'):
    df[column] = df[column].progress_apply(strip_digits)

  0%|          | 0/404012 [00:00<?, ?it/s]

  0%|          | 0/404012 [00:00<?, ?it/s]

### Tokenization

In [58]:
# Turn each question string into a list of tokens.
# `word_tokenize` is handed to `progress_apply` directly; it is called with the
# sentence as its only argument, exactly as the wrapper lambda would do.
for column in ('q1_preprocessed', 'q2_preprocessed'):
    df[column] = df[column].progress_apply(word_tokenize)

  0%|          | 0/404012 [00:00<?, ?it/s]

  0%|          | 0/404012 [00:00<?, ?it/s]

### Lowercase

In [59]:
def lowercase_tokens(tokens):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered


q1_tokens = df['q1_preprocessed']
df['q1_preprocessed'] = q1_tokens.progress_apply(lowercase_tokens)

q2_tokens = df['q2_preprocessed']
df['q2_preprocessed'] = q2_tokens.progress_apply(lowercase_tokens)

  0%|          | 0/404012 [00:00<?, ?it/s]

  0%|          | 0/404012 [00:00<?, ?it/s]

### Stop words

In [60]:
def drop_stop_words(tokens):
    """Return the tokens that are not in STOP_WORDS, keeping their order."""
    kept = []
    for token in tokens:
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept


# Same filter for both question columns, q1 first so the progress bars keep their order.
for column in ('q1_preprocessed', 'q2_preprocessed'):
    df[column] = df[column].progress_apply(drop_stop_words)

  0%|          | 0/404012 [00:00<?, ?it/s]

  0%|          | 0/404012 [00:00<?, ?it/s]

In [61]:
# Preview the first rows to check the preprocessed token columns so far.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_length,q2_length,q1_special_chars,q2_special_chars,q1_stopwords,q2_stopwords,common_words,common_words_count,questions_combined,q1_preprocessed,q2_preprocessed
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66,57,14,12,7,6,"{step, the, invest, guide, in, by, share, to, ...",10,What is the step by step guide to invest in sh...,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]"
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51,88,12,17,4,4,"{what, the, (koh-i-noor), kohinoor}",4,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[story, kohinoor, kohinoor, diamond]","[would, happen, indian, government, stole, koh..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,73,59,14,10,8,5,"{speed, internet, can, how}",4,How can I increase the speed of my internet co...,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]"
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,50,65,12,18,7,4,{},0,Why am I mentally very lonely? How can I solve...,"[mentally, lonely, solve]","[find, remainder, mathmath, divided]"
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,76,39,15,7,3,2,"{in, which}",2,"Which one dissolve in water quikly sugar, salt...","[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]"


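Note: the lemmatization step below takes a while to run on the full dataset. It was previously commented out for that reason, but it is currently enabled.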

### Lemmatization

In [62]:
def lemmatize_tokens(tokens):
    """Return a new list with each token passed through LEMMATIZER.lemmatize."""
    return list(map(LEMMATIZER.lemmatize, tokens))


for column in ('q1_preprocessed', 'q2_preprocessed'):
    df[column] = df[column].progress_apply(lemmatize_tokens)

  0%|          | 0/404012 [00:00<?, ?it/s]

  0%|          | 0/404012 [00:00<?, ?it/s]

### N-grams

In [63]:
def get_ngrams(tokens, length):
    """Return all contiguous n-grams of `tokens` as a list.

    Args:
        tokens: Sized sequence of tokens (``len`` is called on it).
        length: Number of tokens in each n-gram.

    Returns:
        A list with the items produced by ``nltk.ngrams(tokens, length)``,
        or an empty list when `tokens` has fewer than `length` items.
    """
    # Only call into NLTK when there are enough tokens for at least one n-gram.
    if len(tokens) >= length:
        return list(nltk.ngrams(tokens, length))
    return []

In [64]:
def question_ngrams(tokens):
    """Build the n-grams of size NGRAMS_LENGTH for one token list."""
    return get_ngrams(tokens, NGRAMS_LENGTH)


for token_column, ngram_column in (
    ('q1_preprocessed', 'q1_ngrams'),
    ('q2_preprocessed', 'q2_ngrams'),
):
    df[ngram_column] = df[token_column].progress_apply(question_ngrams)

  0%|          | 0/404012 [00:00<?, ?it/s]

  0%|          | 0/404012 [00:00<?, ?it/s]

### Save Results

In [65]:
# Preview the first rows, now including the q1_ngrams / q2_ngrams columns.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_length,q2_length,q1_special_chars,q2_special_chars,q1_stopwords,q2_stopwords,common_words,common_words_count,questions_combined,q1_preprocessed,q2_preprocessed,q1_ngrams,q2_ngrams
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66,57,14,12,7,6,"{step, the, invest, guide, in, by, share, to, ...",10,What is the step by step guide to invest in sh...,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]","[(step, step, guide), (step, guide, invest), (...","[(step, step, guide), (step, guide, invest), (..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51,88,12,17,4,4,"{what, the, (koh-i-noor), kohinoor}",4,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[story, kohinoor, kohinoor, diamond]","[would, happen, indian, government, stole, koh...","[(story, kohinoor, kohinoor), (kohinoor, kohin...","[(would, happen, indian), (happen, indian, gov..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,73,59,14,10,8,5,"{speed, internet, can, how}",4,How can I increase the speed of my internet co...,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]","[(increase, speed, internet), (speed, internet...","[(internet, speed, increased), (speed, increas..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,50,65,12,18,7,4,{},0,Why am I mentally very lonely? How can I solve...,"[mentally, lonely, solve]","[find, remainder, mathmath, divided]","[(mentally, lonely, solve)]","[(find, remainder, mathmath), (remainder, math..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,76,39,15,7,3,2,"{in, which}",2,"Which one dissolve in water quikly sugar, salt...","[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]","[(one, dissolve, water), (dissolve, water, qui...","[(fish, would, survive), (would, survive, salt..."


In [66]:
# Persist df with IPython's %store magic so it can be restored later with `%store -r df`.
%store df

Stored 'df' (DataFrame)
