## Load data for training word embeddings
<hr>

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# read data

data = pd.read_csv('data/processed_data_model1.csv')
print(data.shape)
data.head()

(58887, 7)


Unnamed: 0,id,title,corpus,score,polarity,subjectivity,processed_title
0,21454,Specifying a mySQL ENUM in a Django model,specifying mysql enum django model go specifyi...,0.044573,0.0,0.0,specifying mysql enum django model
1,22059,"How do content discovery engines, like Zemanta...",content discovery engines like zemanta open ca...,0.002254,0.0,0.0,content discovery engines like zemanta open ca...
2,29856,Install Python to match directory layout in OS...,install python match directory layout os x 105...,-0.000657,0.0,0.0,install python match directory layout os x 105
3,35569,Why does Python's iter() on a mapping return i...,python iter mapping return iterkeys instead it...,0.001134,-0.2,0.85,python iter mapping return iterkeys instead it...
4,39960,Javascript equivalent of Python's locals()?,javascript equivalent python locals python one...,0.003821,0.0,0.0,javascript equivalent python locals


## Training word embeddings
<hr>

In [3]:
import multiprocessing
from gensim.models import Word2Vec

In [4]:
cores = multiprocessing.cpu_count()
print(cores)

8


In [5]:
import gensim

w2v_model = Word2Vec(min_count=2,
                                         window=2,
                                         size=300,
                                         sample=0, 
                                         alpha=0.03, 
                                         min_alpha=0.0007, 
                                         negative=10,
                                         workers=cores-1)

corpus = [text.split() for text in np.array(data.corpus)]

In [6]:
#train word embeddings

w2v_model.build_vocab(corpus)
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("vocab size",vocab_size)
w2v_model.train(corpus, total_examples=len(corpus), epochs=32)

vocab size 74743


(81551392, 87125344)

In [7]:
w2v_model.save('models/related_questions_model.bin') #saving the trained w2v model

## Sanity check on the trained word embeddings
<hr>

In [8]:
print('Terms most similar to "flask" \n')
print(w2v_model.wv.most_similar("flask"))

Terms most similar to "flask" 

[('django', 0.5578969717025757), ('webpy', 0.44376176595687866), ('pyramid', 0.4416426420211792), ('gae', 0.41677266359329224), ('server', 0.41033756732940674), ('app', 0.4036201238632202), ('web2py', 0.40340906381607056), ('bottle', 0.3942393362522125), ('application', 0.3905590772628784), ('web', 0.3869827687740326)]


In [9]:
print('Terms most similar to "django" \n')
print(w2v_model.wv.most_similar("django"))

Terms most similar to "django" 

[('flask', 0.5578969717025757), ('python', 0.4789502024650574), ('template', 0.4479687511920929), ('sqlalchemy', 0.43455255031585693), ('web2py', 0.42582327127456665), ('pyramid', 0.42052891850471497), ('drf', 0.4037160277366638), ('app', 0.40167367458343506), ('wagtail', 0.39279812574386597), ('database', 0.3831038773059845)]


In [10]:
print('Terms most similar to "api" \n')
print(w2v_model.wv.most_similar("api"))

Terms most similar to "api" 

[('apis', 0.4997870922088623), ('interface', 0.4356759786605835), ('endpoint', 0.40410172939300537), ('script', 0.39903372526168823), ('service', 0.39255058765411377), ('app', 0.3874899744987488), ('appengine', 0.38615939021110535), ('function', 0.3830541968345642), ('oauth', 0.382467657327652), ('code', 0.38190385699272156)]


## Relevant questions retrieval model
<hr>

In [11]:
import os
import numpy as np
import pandas as pd

## Load data
<hr>

In [12]:
data = pd.read_csv('data/processed_data_model1.csv')
data

Unnamed: 0,id,title,corpus,score,polarity,subjectivity,processed_title
0,21454,Specifying a mySQL ENUM in a Django model,specifying mysql enum django model go specifyi...,0.044573,0.000000,0.000000,specifying mysql enum django model
1,22059,"How do content discovery engines, like Zemanta...",content discovery engines like zemanta open ca...,0.002254,0.000000,0.000000,content discovery engines like zemanta open ca...
2,29856,Install Python to match directory layout in OS...,install python match directory layout os x 105...,-0.000657,0.000000,0.000000,install python match directory layout os x 105
3,35569,Why does Python's iter() on a mapping return i...,python iter mapping return iterkeys instead it...,0.001134,-0.200000,0.850000,python iter mapping return iterkeys instead it...
4,39960,Javascript equivalent of Python's locals()?,javascript equivalent python locals python one...,0.003821,0.000000,0.000000,javascript equivalent python locals
...,...,...,...,...,...,...,...
58882,65163096,Comparing values with __eq__ in Python,comparing values _ _ eq _ _ python good day ni...,-0.001329,-0.083333,0.283333,comparing values _ _ eq _ _ python
58883,65163335,How to get PYQT5 text box to be taken as an in...,get pyqt5 text box taken input sqlite trying c...,-0.001105,0.000000,0.000000,get pyqt5 text box taken input sqlite
58884,65163681,How to assign one argument to function and mak...,assign one argument function make list later q...,-0.001329,0.000000,0.000000,assign one argument function make list later
58885,65163947,Iterate over a list based on list based on a l...,iterate list based list based list steps want ...,0.000686,-0.055556,0.307937,iterate list based list based list steps


In [13]:
import re
import nltk
import inflect
from nltk.corpus import stopwords

#process text

import spacy
en = spacy.load('en_core_web_sm')

import nltk
nltk.download('stopwords')

#tokenizer

def tokenize(text):
    tokens = en.tokenizer(text)
    return [token.text.lower() for token in tokens if not token.is_space]

#remove punctuations

def remove_punctuation(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words 

#remove stop words

def remove_stopwords(words):
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def normalize(words):
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    return words

def process_text(text):
    return ' '.join(normalize(tokenize(text)))

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhavana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import saved word embeddings
<hr>

In [14]:
#getting the saved word embeddings

from gensim.models import Word2Vec

w2v_model = Word2Vec.load('models/related_questions_model.bin')
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)

print("vocab size",vocab_size)

vocab size 74743


## Calculate sentence embeddings
<hr>

In [15]:
#word to numerical vector using the trained word embeddings

def question_to_vec(question, embeddings, dim = 300):
    question_embedding = np.zeros(dim)
    valid_words = 0
    for word in str(question).split(' '):
        if embeddings.wv.__contains__(word):
            valid_words+=1
            question_embedding += embeddings.wv.__getitem__(word)
        if valid_words>0:
            return question_embedding/valid_words
        else:
            return question_embedding

In [16]:
#converting all the question titles to vectors and saving them

title_embeddings = []

for title in data.processed_title:
    title_embeddings.append(question_to_vec(question = title, embeddings = w2v_model))
    
title_embeddings = np.array(title_embeddings)
embeddings = pd.DataFrame(data = title_embeddings)
embeddings[0:15000].to_csv('models/title_embeddings1.csv', index=False)
embeddings[15000:30000].to_csv('models/title_embeddings2.csv', index=False)
embeddings[30000:45000].to_csv('models/title_embeddings3.csv', index=False)
embeddings[45000:].to_csv('models/title_embeddings4.csv', index=False)

print(w2v_model)

Word2Vec(vocab=74743, size=300, alpha=0.03)


## Cosine similarity
<hr>

In [32]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import HTML

em1 = pd.read_csv('models/title_embeddings1.csv')
em2 = pd.read_csv('models/title_embeddings2.csv')
em3 = pd.read_csv('models/title_embeddings3.csv')
em4 = pd.read_csv('models/title_embeddings4.csv')

em = em1.append([em2, em3, em4])

title_embeddings = np.array(em)

query = 'merge two lists in python'
processed_query = process_text(query)

results_returned = 1000

query_vect = np.array([question_to_vec(processed_query, w2v_model)]) # Vectorize the user query

cosine_similarities = pd.Series(cosine_similarity(query_vect, title_embeddings)[0])

relevant_questions = []
max_cosine_score = max(cosine_similarities)

cos_weight = 30
freq_weight = 60
ans_weight = 6
polar_weight = 2
subj_weight = 2

for index, cosine_score in cosine_similarities.nlargest(results_returned).iteritems():
    
    freq_score = 0
    word_count = 0
    score = 0
    
    for word in data.processed_title[index].split():
        if word.lower() in processed_query:
            freq_score+=1
        word_count+=1
        
    freq_score/=word_count
    
    score =  (cos_weight*(cosine_score/max_cosine_score)+freq_weight*freq_score+ans_weight*data.score[index]+polar_weight*data.polarity[index]+subj_weight*data.subjectivity[index])/100
        
    relevant_questions.append((index, data.id[index], score))
    
relevant_questions.sort(key = lambda x : x[2], reverse = True)

output = ''

for index, qid, score in relevant_questions:
    output += '<p style="font-family:verdana; font-size:110%;"> '
    output += 'question id : '+str(qid)+'<br>similarity score : ' + str(score) + '<br>'
    for word in data.title[index].split():
        if word.lower() in processed_query:
            output += " <b>"+str(word)+"</b>"
        else:
            output += " "+str(word)
    output += "</p><hr>"

output = '<h3>Results:</h3>'+output
display(HTML(output))

df = pd.DataFrame(relevant_questions).iloc[:,1:]
df.to_csv('data/relevant_questions.csv', index=False)

In [42]:
qdf = pd.read_csv('data/relevant_questions.csv')
qdf.head(10)

Unnamed: 0,1,2
0,48641350,0.799961
1,46362972,0.785947
2,1158128,0.750189
3,64812701,0.749934
4,44476206,0.74992
5,58971955,0.709934
6,35526224,0.70117
7,49734759,0.674987
8,34761978,0.660068
9,30620323,0.660041
