# Search Process

Take a search query as input and find the most similar questions in the database.

In [24]:
# Mount Google Drive at /content/gdrive (Colab-only); the data, embedding and
# weight files read by the cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [25]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk import RegexpTokenizer
from sklearn import preprocessing

import spacy
from spacy.lang.en import English
import spacy
EN = spacy.load('en_core_web_sm')

from IPython.display import HTML, display
import logging
logging.getLogger('tensorflow').disabled = True 

## Import Database and Word Embedding Vectors

In [26]:
# Load the preprocessed question table from Drive; the bare `data` expression
# on the last line renders the frame as this cell's output.
data = pd.read_csv('/content/gdrive/My Drive/Stackoverflow_VS_extension/Preprocessed_data.csv')
data

Unnamed: 0,original_title,post_corpus,question_content,question_url,tags,overall_scores,answers_content,processed_title
0,Using 'in' to match an attribute of Python obj...,using match attribute python objects array nt ...,using match attribute python objects array nt ...,https://stackoverflow.com/questions/683,python|arrays|iteration,0.011301,Using a list comprehension would build a tempo...,using match attribute python objects array
1,Python version of PHP's stripslashes,python version php stripslashes wrote piece co...,python version php stripslashes wrote piece co...,https://stackoverflow.com/questions/13454,python|string|escaping,0.001115,Python has a built-in escape() function analog...,python version php stripslashes
2,Unicode vs UTF-8 confusion in Python / Django?,unicode vs utf8 confusion python django stumbl...,unicode vs utf8 confusion python django stumbl...,https://stackoverflow.com/questions/22149,python|django|unicode,0.006997,From Wikipedia on UTF-8:,unicode vs utf8 confusion python django
3,Using Django time/date widgets in custom form,using django time date widgets custom form use...,using django time date widgets custom form use...,https://stackoverflow.com/questions/38601,python|django,0.041431,"Starting in Django 1.2 RC1, if you're using th...",using django time date widgets custom form
4,Can parallel traversals be done in MATLAB just...,parallel traversals done matlab python using f...,parallel traversals done matlab python using f...,https://stackoverflow.com/questions/49307,python|arrays|matlab|for-loop,0.002837,should be for example:,parallel traversals done matlab python
...,...,...,...,...,...,...,...,...
147054,How can I insert spaces between words given a ...,insert spaces words given list lists coded let...,insert spaces words given list lists coded let...,https://stackoverflow.com/questions/63758260,python|list|dictionary|spacing,-0.000607,Just append another whitespace in array:,insert spaces words given list lists coded let...
147055,Django creates another media folder inside med...,django creates another media folder inside med...,django creates another media folder inside med...,https://stackoverflow.com/questions/63758482,python|django|python-imaging-library,-0.000176,The parameter [Django-doc] is relative to the...,django creates another media folder inside med...
147056,Options for deploying Flask app that continuou...,options deploying flask app continuously web s...,options deploying flask app continuously web s...,https://stackoverflow.com/questions/63758866,python|flask|heroku|web-scraping|web-applications,-0.000607,Have you tried using Cron? There is no cost no...,options deploying flask app continuously web s...
147057,"Delete ""nan"" in python list",delete nan python list new python simple quest...,delete nan python list new python simple quest...,https://stackoverflow.com/questions/63758902,python|list,-0.000750,You can ... get creative:my_list = ['experienc...,delete nan python list


In [27]:
# Load the saved gensim Word2Vec model from Drive; its word vectors are used
# below for the title/query embeddings and for the embedding matrix.
import gensim
w2v_model = gensim.models.word2vec.Word2Vec.load('/content/gdrive/My Drive/Stackoverflow_VS_extension/SO_word2vec_embeddings.bin')

## Calculate Mean Embedding for a query
A query is represented by the mean of the word embedding vectors of its words; words that are missing from the embedding vocabulary are ignored.

In [28]:
def question_to_vec(question, embeddings, dim=300):
    """Return the mean word-embedding vector of a whitespace-separated text.

    Args:
        question: Text whose words are looked up in ``embeddings``.
        embeddings: Object supporting ``word in embeddings`` and
            ``embeddings[word]`` (for example a dict or gensim
            ``KeyedVectors``). An object exposing such a lookup on a ``wv``
            attribute (for example a gensim ``Word2Vec`` model) is also
            accepted; the lookup then goes through ``embeddings.wv``.
        dim: Length of the embedding vectors.

    Returns:
        ``numpy.ndarray`` of shape ``(dim,)`` holding the average of the
        vectors of all words found in ``embeddings``, or all zeros when no
        word is found.
    """
    # Indexing a Word2Vec model object directly is deprecated in gensim 3.x
    # and no longer supported in gensim 4; the vectors live on `.wv`.
    vectors = getattr(embeddings, 'wv', embeddings)

    question_embedding = np.zeros(dim)
    valid_words = 0
    # split() with no argument collapses runs of whitespace, so repeated
    # spaces never produce empty-string tokens.
    for word in question.split():
        if word in vectors:
            valid_words += 1
            question_embedding += vectors[word]

    # Avoid dividing by zero when none of the words is in the vocabulary.
    if valid_words > 0:
        return question_embedding / valid_words
    return question_embedding

Calculate the mean embedding vector of every title in our database, store the vectors in `all_title_embeddings`, and save them to a CSV file.

In [29]:
# Embed every processed title with question_to_vec and stack the resulting
# vectors into a single 2-D array (one row per title).
all_title_embeddings = np.array(
    [question_to_vec(title, w2v_model) for title in data.processed_title]
)

# Persist the matrix to Drive as CSV so it can be reloaded without recomputing.
embeddings = pd.DataFrame(data=all_title_embeddings)
embeddings.to_csv(
    '/content/gdrive/My Drive/Stackoverflow_VS_extension/title_embeddings.csv',
    index=False,
)


  """
  import sys


In [30]:
# Reload the saved title embeddings from Drive as a plain NumPy array and
# print its shape as a sanity check.
all_title_embeddings = pd.read_csv('/content/gdrive/My Drive/Stackoverflow_VS_extension/title_embeddings.csv').values
print(all_title_embeddings.shape)

(147059, 300)


## Import Word Embedding Vectors for all words

In [31]:
## Embedding Matrix
# Build the weight matrix for the Keras Embedding layer: row i holds the
# word2vec vector of the word whose tokenizer index is i.
import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a tokenizer file that comes from a trusted source.
with open('/content/gdrive/My Drive/Stackoverflow_VS_extension/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
word_index = tokenizer.word_index
vocab_size = len(word_index)
print('Found %s unique tokens.' % len(word_index))
W2V_SIZE = 300
MAX_SEQUENCE_LENGTH = 300

# The matrix has vocab_size+1 rows (presumably because tokenizer indices start
# at 1 and row 0 is left for padding -- confirm). Rows of words that are not
# in the word2vec vocabulary stay all-zero.
embedding_matrix = np.zeros((vocab_size+1, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

Found 471299 unique tokens.
(471300, 300)


## Import and Load the tag predictor model

In [32]:
from tensorflow.keras.layers import Dense, Embedding,GRU
from tensorflow.keras.layers import BatchNormalization
from datetime import datetime
from time import time
from keras.utils.vis_utils import plot_model
from tensorflow.keras.models import Sequential

# Declare the tag-predictor architecture in one place: a frozen embedding
# layer initialised from embedding_matrix, a GRU encoder, and a dense head
# with 500 sigmoid outputs.
model = Sequential([
    Embedding(
        vocab_size+1,
        W2V_SIZE,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    ),
    GRU(500, activation='relu', kernel_initializer='he_normal'),
    BatchNormalization(),
    Dense(150, activation='relu'),
    BatchNormalization(),
    Dense(500, activation='sigmoid'),
])
model.summary()

model.compile(loss="binary_crossentropy", optimizer="adam")

# Restore the saved weights from Drive into the freshly built model.
model.load_weights('/content/gdrive/My Drive/Stackoverflow_VS_extension/Tag_predictor_weights.h5')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          141390000 
_________________________________________________________________
gru_1 (GRU)                  (None, 500)               1203000   
_________________________________________________________________
batch_normalization_2 (Batch (None, 500)               2000      
_________________________________________________________________
dense_2 (Dense)              (None, 150)               75150     
_________________________________________________________________
batch_normalization_3 (Batch (None, 150)               600       
_________________________________________________________________
dense_3 (Dense)              (None, 500)               75500     
Total params: 142,746,250
Trainable params: 1,354,950
Non-trainable params: 141,391,300
________________________________

In [33]:
from sklearn.preprocessing import MultiLabelBinarizer
tag_encoder = MultiLabelBinarizer()

# Returns the predicted tags as a vector of per-tag probabilities
def predict_tags_get_one_hot_vector(text):
    """Run the tag-predictor model on a single piece of text.

    The text is converted to an index sequence with the module-level
    ``tokenizer``, padded/truncated to ``MAX_SEQUENCE_LENGTH`` and passed
    through the module-level ``model``.

    Args:
        text: Query string to predict tags for.

    Returns:
        The model's output row for this one input. The values are the raw
        scores produced by the model, not a hard 0/1 vector, despite the
        function name.
    """
    # pad_sequences is not imported anywhere in the visible notebook, so the
    # call failed with NameError on a fresh kernel; import it where it is used.
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=MAX_SEQUENCE_LENGTH)
    # predict() returns one row per input; a single text was passed, so keep row 0.
    prediction = model.predict([x_test])[0]
    return prediction

## Query Normalization Functions
These functions preprocess the query given to our search: they tokenize the query, convert it to lowercase, remove punctuation, and remove stopwords.

In [34]:
import re
import nltk

from nltk.corpus import stopwords
nltk.download('stopwords')
def tokenize_text(text):
    """Tokenize text with the spaCy tokenizer and return lowercase token strings.

    Tokens that spaCy flags as whitespace are skipped.
    """
    lowered = []
    for token in EN.tokenizer(text):
        if token.is_space:
            continue
        lowered.append(token.text.lower())
    return lowered


def to_lowercase(words):
    """Return a new list containing each word converted to lowercase."""
    return [word.lower() for word in words]

def remove_punctuation(words):
    """Strip punctuation from each word and drop words that become empty.

    Every character that is neither a word character nor whitespace is
    removed from each word.
    """
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [word for word in stripped if word != '']

def remove_stopwords(words):
    """Return the words that are not NLTK English stop words.

    Args:
        words: Iterable of word strings (expected to be lowercase already,
            since the comparison is case-sensitive).

    Returns:
        A new list with the stop words removed, in the original order.
    """
    # Load the stop-word list once per call instead of once per word, and keep
    # it in a set so each membership test is O(1) rather than a list scan.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def normalize(words):
    """Lowercase the words, strip punctuation, then drop English stop words."""
    return remove_stopwords(remove_punctuation(to_lowercase(words)))

def tokenize_code(text):
    """Return the runs of word characters found in text, in order."""
    word_tokenizer = RegexpTokenizer(r'\w+')
    return word_tokenizer.tokenize(text)

def preprocess_text(text):
    """Tokenize and normalize text, returning the remaining tokens joined by single spaces."""
    tokens = tokenize_text(text)
    cleaned = normalize(tokens)
    return ' '.join(cleaned)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Indexing Search Results
We use cosine similarity as our main measure to rank search results. Our search process consists of:
1. Preprocessing the query.
2. Predicting tags for the search query and calculating their cosine similarity with the tags of every question.
3. Calculating the query's mean embedding vector and calculating its cosine similarity with the mean embedding vectors of all questions in our database.

We then add these two cosine similarity values and scale the sum by a factor based on each question's overall score (`1 + 0.6 * overall_score`) to rank our search results.

In [36]:

from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer

# Split the pipe-separated tag strings into lists. The isinstance guard keeps
# the cell re-runnable: rows that are already lists are left untouched, whereas
# an unconditional .split() fails on a second run because lists have no .split.
data.tags = data.tags.apply(lambda x: x.split('|') if isinstance(x, str) else x)

# Count how many times each tag occurs. Counter starts every tag at 1 on its
# first occurrence; the previous hand-written loop started at 0, so every
# count was one too low.
tag_freq_dict = dict(Counter(tag for tags in data.tags for tag in tags))

# Keep the most frequent tags. sorted() is stable, so tags with equal counts
# stay in first-seen order.
tags_to_use = 500
tag_freq_dict_sorted = dict(sorted(tag_freq_dict.items(), key=lambda x: x[1], reverse=True))
final_tags = list(tag_freq_dict_sorted.keys())[:tags_to_use]
print(len(final_tags))

# Drop every tag outside the selected vocabulary; a set makes each membership
# test O(1) instead of scanning the final_tags list.
final_tag_set = set(final_tags)
final_tag_data = [[tag for tag in tags if tag in final_tag_set] for tags in data.tags]

# Multi-hot encode the filtered tag lists: one row per question, one column
# per selected tag.
# NOTE(review): comparing these columns with the tag-predictor output assumes
# the encoder's column order matches the order the model was trained with --
# confirm against the training notebook.
tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(final_tag_data)
tags_encoded.shape

500


(147059, 500)

In [47]:
from IPython.display import HTML
import html
import logging
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')

# Query and number of results to show. results_returned is kept as a string
# and converted with int() where it is used.
search_string = "Index out of bound"
original_string = search_string
search_string = ' '.join(normalize(tokenize_text(search_string)))
results_returned = "5"

# Mean embedding of the normalized query. The vectors are looked up through
# .wv, as in the embedding-matrix cell, instead of indexing the model object.
search_vect = np.array([question_to_vec(search_string, w2v_model.wv)])

# Cosine similarity between the query embedding and every title embedding
cosine_similarities = pd.Series(cosine_similarity(search_vect, all_title_embeddings)[0])

# Cosine similarity between the tag scores predicted for the raw query and
# each question's encoded tags
predicted_tags = predict_tags_get_one_hot_vector(original_string)
predicted_tags = np.array(predicted_tags)[np.newaxis]
cosine_similarities_tags = pd.Series(cosine_similarity(predicted_tags, tags_encoded)[0])

# Combine both similarity signals
cosine_similarities += cosine_similarities_tags

# Scale by the question's overall score (votes) so better-rated questions rank higher
cosine_similarities = cosine_similarities * (1 + 0.6 * data.overall_scores)

output = ""
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for i, score in cosine_similarities.nlargest(int(results_returned)).items():
    # Escape everything that comes from the dataset so characters such as
    # '<', '&' or quotes in a title cannot break or inject markup.
    url = html.escape(str(data.question_url[i]), quote=True)
    title = html.escape(str(data.original_title[i]))
    content = html.escape(str(data.question_content[i]))
    output += '<a target="_blank" href="' + url + '"><h2>' + title + '</h2></a>'
    output += '<p style="font-family:verdana; font-size:110%;"> '
    output += content + '</p><hr>'

output = '<h3>Results:</h3>' + output
display(HTML(output))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  """
  import sys
