# Setup Workspace

In [None]:
import re
import string
import torch
import fasttext
import argparse
import fasttext.util
import pandas as pd

from torch.utils.data import DataLoader
from torchtext.vocab import GloVe
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm.auto import tqdm
from tqdm.notebook import tqdm

In [None]:
# English stop-word list from the NLTK corpus.
STOP_WORDS = stopwords.words('english')
# NLTK WordNet lemmatizer instance.
LEMMATIZER = WordNetLemmatizer()
# Pretrained GloVe vectors: the "6B" set with 50 dimensions per token.
GLOVE = GloVe(name="6B", dim=50)
# Cosine similarity along dim 0, i.e. between two 1-D (flattened) tensors.
COS = torch.nn.CosineSimilarity(dim=0)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Fraction of the full dataset held out as the test split.
TEST_SIZE = 0.2
# Fraction of the remaining (non-test) rows held out as the validation split.
VAL_SIZE = 0.2
# Seed shared by torch.manual_seed and both train_test_split calls.
RANDOM_SEED = 420

# Number of single-epoch Doc2Vec training passes.
MAX_EPOCHS = 15
# Doc2Vec vector_size.
VEC_SIZE = 300
# Initial Doc2Vec learning rate.
ALPHA = 0.025
# Rows per DataLoader batch.
BATCH_SIZE = 512
# NOTE(review): the collate/embedding helpers read a lowercase global
# `max_words`, not this constant; nothing visible in this notebook uses
# MAX_WORDS — confirm whether the two were meant to be the same value.
MAX_WORDS = 50

In [None]:
torch.manual_seed(RANDOM_SEED)
df = None
try:
    %store -r df
except KeyError:
    df = pd.read_csv('formated_dataframe.csv')

# Model Selection [Word Embeddings]

### Splitting

In [242]:
# Drop identifier columns and the combined-question column; the DataLoader
# cells below iterate over the columns that remain.
data = df.drop(
    labels=['id', 'qid1', 'qid2', 'questions_combined'],
    axis="columns",
)

In [54]:
# First carve off the test split from the full dataset ...
train_val, test = train_test_split(
    data,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)
# ... then split what is left into train and validation. VAL_SIZE is a
# fraction of `train_val`, not of the full dataset.
train, val = train_test_split(
    train_val,
    test_size=VAL_SIZE,
    random_state=RANDOM_SEED,
)

### Creating the Dataloaders

In [313]:
def create_collate_batch_fn(emb_func):
    """Build a ``collate_fn`` that embeds question pairs and gathers their labels.

    Args:
        emb_func: Callable that takes a list of exactly ``max_words`` tokens
            and returns a tensor assignable into a ``(max_words, embed_len)``
            slot.

    Returns:
        A function ``f(batch)`` suitable for ``DataLoader(collate_fn=...)``.
        Every row of ``batch`` must unpack into the 15 columns named in ``f``.
        ``f`` returns ``(embeddings, labels)``, both moved to ``DEVICE``:
        ``embeddings`` has shape ``(len(batch), 2, max_words * embed_len)`` and
        ``labels`` is an int64 tensor with one entry per row.

    Note:
        ``max_words``, ``embed_len`` and ``DEVICE`` are module globals that are
        looked up when ``f`` runs, not when this factory is called.
    """
    def fit_length(tokens):
        # Build a NEW list. Padding with ``+=`` would extend the list object
        # stored in the dataset itself, so every pass over the data would leave
        # the source rows permanently padded with "".
        return list(tokens[:max_words]) + [""] * (max_words - len(tokens))

    def f(batch):
        labels = []
        embeddings_tensor = torch.zeros(len(batch), 2, max_words, embed_len)
        for i, row in enumerate(batch):
            # Full unpack documents (and enforces) the expected column layout.
            q1, q2, label, q1_length, q2_length, q1_special_chars, q2_special_chars, q1_stopwords, \
                q2_stopwords, common_words, common_words_count, q1_preprocessed, q2_preprocessed, q1_ngrams, q2_ngrams = row

            labels.append(int(label))
            embeddings_tensor[i, 0] = emb_func(fit_length(q1_preprocessed))
            embeddings_tensor[i, 1] = emb_func(fit_length(q2_preprocessed))
        labels = torch.tensor(labels, dtype=torch.int64)
        # Explicit last dimension (instead of -1) keeps the reshape valid for
        # an empty batch as well.
        flat = embeddings_tensor.reshape(len(batch), 2, max_words * embed_len)
        return flat.to(DEVICE), labels.to(DEVICE)

    return f

In [314]:
def create_collate_batch_fn_compare(emb_func):
    """Build a ``collate_fn`` that embeds every question of a batch for lookup.

    Args:
        emb_func: Callable that takes a list of exactly ``max_words`` tokens
            and returns a tensor assignable into a ``(max_words, embed_len)``
            slot.

    Returns:
        A function ``f(batch)`` suitable for ``DataLoader(collate_fn=...)``.
        Every row of ``batch`` must unpack into the 15 columns named in ``f``.
        ``f`` returns a tensor of shape ``(2 * len(batch), max_words,
        embed_len)`` on ``DEVICE`` in which row ``i`` of the batch occupies
        positions ``2 * i`` (question 1) and ``2 * i + 1`` (question 2).

    Note:
        ``max_words``, ``embed_len`` and ``DEVICE`` are module globals that are
        looked up when ``f`` runs, not when this factory is called.
    """
    def fit_length(tokens):
        # Build a NEW list. Padding with ``+=`` would extend the list object
        # stored in the dataset itself, so every pass over the data would leave
        # the source rows permanently padded with "".
        return list(tokens[:max_words]) + [""] * (max_words - len(tokens))

    def f(batch):
        embeddings_tensor = torch.zeros(len(batch) * 2, max_words, embed_len)
        for i, row in enumerate(batch):
            # Full unpack documents (and enforces) the expected column layout.
            q1, q2, label, q1_length, q2_length, q1_special_chars, q2_special_chars, q1_stopwords, \
                q2_stopwords, common_words, common_words_count, q1_preprocessed, q2_preprocessed, q1_ngrams, q2_ngrams = row

            # Interleave q1/q2 so a flat position maps back to (row, column)
            # as position // 2 and position % 2.
            embeddings_tensor[2 * i] = emb_func(fit_length(q1_preprocessed))
            embeddings_tensor[2 * i + 1] = emb_func(fit_length(q2_preprocessed))
        return embeddings_tensor.to(DEVICE)

    return f

In [320]:
def create_single_emb(emb_func):
    """Wrap ``emb_func`` so it embeds one token list of arbitrary length.

    Args:
        emb_func: Callable that takes a list of exactly ``max_words`` tokens
            and returns a tensor.

    Returns:
        A function ``f(prep)`` that pads ``prep`` with ``""`` or truncates it
        to ``max_words`` tokens, applies ``emb_func`` and moves the result to
        ``DEVICE``. ``max_words`` and ``DEVICE`` are module globals looked up
        when ``f`` runs.
    """
    def f(prep):
        # Build a new list rather than padding with ``+=`` so the caller's
        # token list is left untouched.
        tokens = list(prep[:max_words]) + [""] * (max_words - len(prep))
        return emb_func(tokens).to(DEVICE)

    return f

## Find most similar question

In [312]:
def cosine_similarity(q1, q2):
    """Return the cosine similarity of two embeddings as a Python float.

    Both tensors are flattened to 1-D first, so they only need to hold the
    same number of elements; the module-level ``COS`` does the comparison.
    """
    flat_q1 = torch.reshape(q1, (-1,))
    flat_q2 = torch.reshape(q2, (-1,))
    score = COS(flat_q1, flat_q2)
    return score.item()

In [212]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Invariants hoisted out of preprocess_single_str so they are built once:
# a set gives O(1) stop-word membership tests (the list form is a linear scan
# per token), and the translation table / regex no longer get rebuilt per call.
_STOP_SET = frozenset(stop)
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'[0-9]+')


def preprocess_single_str(s):
    """Turn a raw question string into a list of normalized tokens.

    Steps, in order: remove ASCII punctuation, remove digit runs, tokenize
    with NLTK's ``word_tokenize``, lower-case, drop English stop words, then
    lemmatize each remaining token.

    Args:
        s: The raw text.

    Returns:
        A list of lemmatized, lower-cased tokens (possibly empty).
    """
    s = s.translate(_PUNCT_TABLE)
    s = _DIGITS_RE.sub('', s)
    tokens = [word.lower() for word in word_tokenize(s)]
    # Stop words are filtered on the lower-cased form, before lemmatization.
    return [lemmatizer.lemmatize(word) for word in tokens if word not in _STOP_SET]

In [213]:
def get_q_from_batch_index(index):
    """Map a flat embedding position back to the question text in ``data``.

    Even positions refer to ``question1`` and odd positions to ``question2``
    of row ``index // 2``.
    """
    row, col = divmod(index, 2)
    column = "question1" if col == 0 else "question2"
    return data.iloc[row][column]

In [332]:
def find_most_similar_question(user_request):
    """Return the stored question closest to ``user_request`` and its score.

    The request is preprocessed and embedded with the module-level
    ``single_emb``; every embedding yielded by the module-level ``dataloader``
    is then scored against it with ``cosine_similarity``. On ties the earliest
    embedding wins.

    Returns:
        A ``(question_text, score)`` tuple.
    """
    query_emb = single_emb(preprocess_single_str(user_request))

    best_score, best_position = None, None
    position = 0  # running index across all batches

    for batch in tqdm(dataloader):
        for candidate in batch:
            score = cosine_similarity(candidate, query_emb)
            if best_score is None or score > best_score:
                best_score, best_position = score, position
            position += 1

    return get_q_from_batch_index(best_position), best_score

## Testing GloVe

In [368]:
def get_glove_embedding(q):
    """Look up the GloVe vectors for the tokens in ``q``.

    Delegates to ``GLOVE.get_vecs_by_tokens`` and returns its result unchanged.
    """
    vectors = GLOVE.get_vecs_by_tokens(q)
    return vectors

In [370]:
# Wire the lookup pipeline to GloVe: one collate function for the stored
# questions and one embedder for the user's request.
# NOTE(review): both helpers read the globals `max_words` and `embed_len` at
# call time; `embed_len` presumably has to be 50 here to match GLOVE — confirm
# they are set before the DataLoader is iterated.
collate_batch = create_collate_batch_fn_compare(get_glove_embedding)
single_emb = create_single_emb(get_glove_embedding)

In [371]:
# shuffle=False keeps the rows in dataframe order, which
# get_q_from_batch_index relies on to map a flat position back to a row.
dataloader = DataLoader(
    data.to_numpy(), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [372]:
# Example query against the GloVe-backed `dataloader` / `single_emb`.
find_most_similar_question("Do you want to go to a bar?")

  0%|          | 0/790 [00:00<?, ?it/s]

('Does he want to go out with me?', 0.8569372296333313)

In [70]:
# NOTE(review): "do", "you" and "me" are presumably all removed as stop words,
# leaving only "love"; a score of ~1.0 against another question that reduces
# to the same single token would then be expected — confirm.
find_most_similar_question("Do you love me?")

('Why do you love her?', 0.9999998807907104)

In [None]:
# Third example query (no output was recorded for this cell).
find_most_similar_question("How much money do you have?")

## Testing FastText

The code below downloads the pretrained fastText model.

In [72]:
def command_download(lang_id, if_exists):
    """Fetch fastText's pre-trained common-crawl vectors for one language.

    Thin wrapper around ``fasttext.util.download_model``; both arguments are
    forwarded unchanged and nothing is returned. The available models are
    listed at https://fasttext.cc/docs/en/crawl-vectors.html
    """
    download = fasttext.util.download_model
    download(lang_id, if_exists)


# NOTE: from here on the name `io` refers to IPython's utility module, not the
# standard-library `io`.
from IPython.utils import io

# Run the download inside IPython's output capture; whatever it prints is
# collected in `captured` instead of being shown in the notebook.
with io.capture_output() as captured:
    # if_exists='ignore' is forwarded to fasttext.util.download_model.
    command_download("en", if_exists='ignore')

In [348]:
# Load the fastText binary from the working directory — presumably the file
# fetched by command_download("en") above; confirm the path.
fasttext_model = fasttext.load_model("cc.en.300.bin")



In [381]:
# NOTE(review): in the fastText cells below `max_words` is used both as the
# token cap for padding/truncation and as the width of the embedding tensor
# in collate_batch, so it presumably has to equal the model's vector size
# (300 for cc.en.300.bin) — confirm.
max_words = 300

In [382]:
def get_fasttext_embedding(q):
    """Embed a token list as one fastText sentence vector.

    The tokens are joined with single spaces, passed to
    ``fasttext_model.get_sentence_vector`` and the result is wrapped in a
    ``torch.Tensor``.
    """
    sentence = " ".join(q)
    vector = fasttext_model.get_sentence_vector(sentence)
    return torch.Tensor(vector)

In [383]:
def collate_batch(batch):
    """Collate rows into fastText sentence embeddings for similarity lookup.

    Args:
        batch: Sequence of rows, each unpacking into the 15 columns named
            below.

    Returns:
        A tensor of shape ``(2 * len(batch), max_words)`` on ``DEVICE`` in
        which row ``i`` of the batch occupies positions ``2 * i`` (question 1)
        and ``2 * i + 1`` (question 2).

    Note:
        ``max_words`` is used both as the token cap and as the width of the
        output tensor, so it must equal the length of the vector returned by
        ``get_fasttext_embedding``.
    """
    def fit_length(tokens):
        # Build a NEW list. Padding with ``+=`` would extend the list object
        # stored in the dataset itself, so every pass over the data would leave
        # the source rows permanently padded with "".
        return list(tokens[:max_words]) + [""] * (max_words - len(tokens))

    embeddings_tensor = torch.zeros(len(batch) * 2, max_words)
    for i, row in enumerate(batch):
        # Full unpack documents (and enforces) the expected column layout.
        q1, q2, label, q1_length, q2_length, q1_special_chars, q2_special_chars, q1_stopwords, \
            q2_stopwords, common_words, common_words_count, q1_preprocessed, q2_preprocessed, q1_ngrams, q2_ngrams = row

        # Interleave q1/q2 so a flat position maps back to (row, column) as
        # position // 2 and position % 2.
        embeddings_tensor[2 * i] = get_fasttext_embedding(fit_length(q1_preprocessed))
        embeddings_tensor[2 * i + 1] = get_fasttext_embedding(fit_length(q2_preprocessed))
    return embeddings_tensor.to(DEVICE)

In [384]:
# Re-point the request embedder at fastText (replaces the GloVe version).
single_emb = create_single_emb(get_fasttext_embedding)

In [385]:
# Rebuild the loader so it picks up the fastText `collate_batch` defined above.
# shuffle=False keeps the rows in dataframe order, which
# get_q_from_batch_index relies on to map a flat position back to a row.
dataloader = DataLoader(
    data.to_numpy(), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [386]:
# Example query against the fastText-backed `dataloader` / `single_emb`.
find_most_similar_question("Do you want to go to a bar?")

  0%|          | 0/790 [00:00<?, ?it/s]

('Does he want to go out with me?', 0.8843696117401123)

In [387]:
# Second fastText query; as with GloVe, the request presumably reduces to the
# single token "love" after stop-word removal — confirm.
find_most_similar_question("Do you love me?")

  0%|          | 0/790 [00:00<?, ?it/s]

('Why do you love her?', 0.9999998807907104)

## Doc2Vec

In [251]:
def collate_batch(batch):
    """Collect the preprocessed token lists of a batch, minus empty tokens.

    Args:
        batch: Sequence of rows, each unpacking into the 15 columns named
            below.

    Returns:
        A list with two entries per row (question 1 first, then question 2);
        each entry is that question's tokens with every ``''`` removed.
    """
    token_lists = []
    for row in batch:
        q1, q2, label, q1_length, q2_length, q1_special_chars, q2_special_chars, q1_stopwords, \
            q2_stopwords, common_words, common_words_count, q1_preprocessed, q2_preprocessed, q1_ngrams, q2_ngrams = row
        for tokens in (q1_preprocessed, q2_preprocessed):
            # '' entries are padding, not words.
            token_lists.append([tok for tok in tokens if tok != ''])
    return token_lists

In [256]:
# Training corpus for Doc2Vec; filled by the loop in the next cell.
# NOTE: that loop only appends, so re-run this cell first to avoid duplicates.
tagged_data = []

In [259]:
# Loader that yields plain token lists (two per row) via the Doc2Vec
# `collate_batch` above; shuffle=False keeps dataframe order.
dataloader = DataLoader(
    data.to_numpy(), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

# Tag every question with its running position (as a string), so tag k is
# question k % 2 + 1 of dataframe row k // 2.
counter = 0
for batch in tqdm(dataloader):
    # `emb` is a list of tokens here, not an embedding.
    for emb in batch:
        t = TaggedDocument(words=emb, tags=[str(counter)])
        tagged_data.append(t)
        counter += 1

  0%|          | 0/790 [00:00<?, ?it/s]

In [338]:
# Untrained Doc2Vec model; the vocabulary is built below and training happens
# in the next cell.
doc2vec = Doc2Vec(
    vector_size=VEC_SIZE,
    alpha=ALPHA,
    min_alpha=0.00025,
    # min_count=1: presumably keeps every token, however rare — see gensim docs.
    min_count=1,
    # dm=1 selects gensim's "distributed memory" (PV-DM) training algorithm.
    dm=1
)

# Scan the tagged corpus once to build the vocabulary before training.
doc2vec.build_vocab(tagged_data)

In [339]:
# Train one epoch at a time with a manually stepped learning rate.
# The original referenced an undefined `model`; the only Doc2Vec instance in
# this notebook is `doc2vec`, so its attributes are used throughout.
for epoch in range(MAX_EPOCHS):
    print('iteration {0}'.format(epoch))
    doc2vec.train(
        tagged_data,
        # Corpus size recorded on this model when its vocabulary was built.
        total_examples=doc2vec.corpus_count,
        epochs=1
    )
    # decrease the learning rate
    doc2vec.alpha -= 0.0002
    # fix the learning rate, no decay
    doc2vec.min_alpha = doc2vec.alpha

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14


In [341]:
def get_doc2vec_embedding(q):
    """Infer a Doc2Vec vector for the token list ``q`` as a ``torch.Tensor``.

    Uses the module-level ``doc2vec`` model's ``infer_vector``.
    """
    return torch.Tensor(doc2vec.infer_vector(q))

In [342]:
# NOTE(review): in the Doc2Vec cells `max_words` is the width of the embedding
# tensor built by collate_batch, so it presumably has to equal VEC_SIZE (both
# are 300) — confirm.
max_words = 300
embed_len = max_words

In [343]:
# Unlike the GloVe and fastText versions, the Doc2Vec embedder is used
# directly, without the create_single_emb pad/truncate wrapper.
single_emb = get_doc2vec_embedding

In [344]:
def collate_batch(batch):
    """Collate rows into Doc2Vec embeddings for similarity lookup.

    Args:
        batch: Sequence of rows, each unpacking into the 15 columns named
            below.

    Returns:
        A tensor of shape ``(2 * len(batch), max_words)`` on ``DEVICE`` in
        which row ``i`` of the batch occupies positions ``2 * i`` (question 1)
        and ``2 * i + 1`` (question 2).
    """
    out = torch.zeros(len(batch) * 2, max_words)
    for i, row in enumerate(batch):
        q1, q2, label, q1_length, q2_length, q1_special_chars, q2_special_chars, q1_stopwords, \
            q2_stopwords, common_words, common_words_count, q1_preprocessed, q2_preprocessed, q1_ngrams, q2_ngrams = row

        first = 2 * i
        # Question 1 is embedded before question 2, row by row.
        out[first] = get_doc2vec_embedding(q1_preprocessed)
        out[first + 1] = get_doc2vec_embedding(q2_preprocessed)
    return out.to(DEVICE)

In [345]:
# Rebuild the loader so it picks up the Doc2Vec-embedding `collate_batch`.
# shuffle=False keeps the rows in dataframe order, which
# get_q_from_batch_index relies on to map a flat position back to a row.
dataloader = DataLoader(
    data.to_numpy(), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [346]:
# Example query against the Doc2Vec-backed `dataloader` / `single_emb`.
find_most_similar_question("Do you want to go to a bar?")

  0%|          | 0/790 [00:00<?, ?it/s]

('Which TMT bars are best for construction of residential houses in India? And why? Which brand name is preferred over the other?',
 0.7966447472572327)

In [347]:
# Second Doc2Vec query, same request as in the GloVe and fastText sections.
find_most_similar_question("Do you love me?")

  0%|          | 0/790 [00:00<?, ?it/s]

('Why should you love yourself?', 0.82894366979599)