In [1]:
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
import gensim  # for LDA
import nltk  # NLP lib
import math
import re
import numpy as np  # numpy for arrays
import pandas as pd  # pandas lib for data handling
import itertools
import random
from nltk.corpus import stopwords  # common eng sstopwords
from nltk.stem import PorterStemmer  # porter stemmer algo
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup  # HTML to txt
import os
import sys
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
ps = PorterStemmer()
sw_nltk = stopwords.words('english')  # bag of all common english stop words

import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to /home/abhishek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/abhishek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/abhishek/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /home/abhishek/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~``'''
contract_strings = ["'s", "\"", "'ve","'m",]


def TokStemming(text, j):  # Tokenise and Stem the given text
    words = [word for word in text.split() if word.lower() not in sw_nltk]  # bag of tokenised words which are not stopwords
    final = []
    for w in words:
        if w not in punctuations and w not in contract_strings:
            final.append(ps.stem(w))
    if j == 1:  # give a string
        return " ".join(final)
    return final

def hbt(text):  # convert HTML body to text
    x = BeautifulSoup(text).get_text()
    return x

def rmTags(text):  
    text2 = text.replace('<', '')
    text3 = text2.replace('>', ' ')
    return text3


In [3]:
df = pd.read_csv('../Dataset/QueryResults.csv')
N1 = 1000    # past questions
N2 = 300    # dup questions
# duplicate questions for training (with sorting)
past_df = df[['PastQuesId', 'PastQuesTitle', 'PastQuesBody', 'PastQuesTags']].loc[0:N1-1].copy()  # original qns
dup_df = df[['DuplicateQuesId', 'DuplicateQuesTitle', 'DuplicateQuesBody', 'DuplicateQuesTags']].loc[0:N2-1].copy()  # doop qns
print(past_df.head())
print(dup_df.head())


   PastQuesId                                      PastQuesTitle  \
0      205853  Why would a JavaScript variable start with a d...   
1      104799  Why aren't Java Collections remove methods gen...   
2      255815  How can I fix my regex to not match too much w...   
3      190740               Setting ruby hash .default to a list   
4      256277  What is a good reference documenting patterns ...   

                                        PastQuesBody  \
0  <p>I quite often see JavaScript with variables...   
1  <p>Why isn't <a href="http://java.sun.com/java...   
2  <p>I have the following line:</p>\n\n<pre><cod...   
3  <p>I thought I understood what the default met...   
4  <p>"C Interfaces and Implementations"  shows s...   

                                   PastQuesTags  
0              <javascript><naming-conventions>  
1                 <java><generics><collections>  
2  <regex><perl><parsing><greedy><regex-greedy>  
3                               <ruby><hashmap>  
4   

In [4]:
# preprocess the data

dup_df['DuplicateQuesTitle']=dup_df['DuplicateQuesTitle'].apply(lambda x:TokStemming(hbt(x),1))
dup_df['DuplicateQuesTags']=dup_df['DuplicateQuesTags'].apply(lambda x:rmTags(x))
dup_df['DuplicateQuesBody']=dup_df['DuplicateQuesBody'].apply(lambda x:TokStemming(hbt(x),1))

past_df['PastQuesTitle']=past_df['PastQuesTitle'].apply(lambda x:TokStemming(hbt(x),1))
past_df['PastQuesTags']=past_df['PastQuesTags'].apply(lambda x:rmTags(x))
past_df['PastQuesBody']=past_df['PastQuesBody'].apply(lambda x:TokStemming(hbt(x),1))

print(dup_df.head())
print(past_df.head())


   DuplicateQuesId                                 DuplicateQuesTitle  \
0           846585                     purpos dollar sign javascript?   
1           857420           reason map.get(object key) (fully) gener   
2            22444                       regex match much. make stop?   
3          2698460  strange, unexpect behavior (disappearing/chang...   
4          1804486                               use #includ headers?   

                                   DuplicateQuesBody  \
0  code question here: var $item = $(this).parent...   
1  reason behind decis fulli gener get method int...   
2  gigant ugli string: j0000000: transact a000140...   
3  consid code: h = hash.new(0) new hash pair def...   
4  necessari #includ file, insid header (*.h), ty...   

                     DuplicateQuesTags  
0       javascript naming-conventions   
1       java generics collections map   
2                               regex   
3                           ruby hash   
4  c c-preprocessor

In [5]:
def cosine_similarity(a, b):
    c = list(set(a).union(set(b)))
    freq_a = np.array([a.count(i)/(a.count(i)+b.count(i)) for i in c])
    freq_b = np.array([b.count(i)/(a.count(i)+b.count(i)) for i in c])

    final = np.dot(freq_a, freq_b)/(np.linalg.norm(freq_a)*np.linalg.norm(freq_b))
    if(np.isnan(final)):
        return 0
    return final


In [6]:
#train lda model
body_title = []
for i in range(N1):
    body_title.append((past_df.loc[i, 'PastQuesBody']+past_df.loc[i, 'PastQuesTitle']).split())

id2word = corpora.Dictionary(body_title)

corpus = []
for text in body_title:
    corpus.append(id2word.doc2bow(text))

lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=100, minimum_probability=0.0)

lda_score_past = []
lda_score_dup = []
for i in range(N1):
    corpus_past = id2word.doc2bow((past_df.loc[i, 'PastQuesBody']+past_df.loc[i, 'PastQuesTitle']).split())
    lda_score_past.append(np.array(lda_model[corpus_past])[:, 1])
for i in range(N2):
    corpus_dup = id2word.doc2bow((dup_df.loc[i, 'DuplicateQuesBody']+dup_df.loc[i, 'DuplicateQuesTitle']).split())
    lda_score_dup.append(np.array(lda_model[corpus_dup])[:, 1])


In [16]:

similarity_matrix = np.zeros([N1, N2, 4])  # store all the component scores [ori,doop,4]
for i in range(N1):
  for j in range(N2):
    # find cosine similarity for title, body and tags of i and j
    similarity_matrix[i, j, 0] = cosine_similarity(past_df.loc[i, 'PastQuesTitle'].split(), dup_df.loc[j, 'DuplicateQuesTitle'].split())
    similarity_matrix[i, j, 1] = cosine_similarity(past_df.loc[i, 'PastQuesBody'].split(), dup_df.loc[j, 'DuplicateQuesBody'].split())
    similarity_matrix[i, j, 2] = cosine_similarity(past_df.loc[i, 'PastQuesTags'].split(), dup_df.loc[j, 'DuplicateQuesTags'].split())
    # similarity_matrix[i, j, 3] = np.dot(val_past, val_dup)/(np.linalg.norm(val_past)*np.linalg.norm(val_dup))
    similarity_matrix[i, j, 3] = np.dot(lda_score_past[i], lda_score_dup[j])/(np.linalg.norm(lda_score_past[i])*np.linalg.norm(lda_score_dup[j]))
    if np.isnan(similarity_matrix[i, j, 3]):
      similarity_matrix[i, j, 3] = 0 

# save the similarity matrix
np.save('similarity_matrix.npy', similarity_matrix)


#### Evaluation Criteria

In [17]:
def evaluate(abcd, k):
    score_matrix = np.zeros([N1, N2])
    abcd = np.array(abcd)
    cur_val=0
    for i in range(N2):
        scores=[]
        for j in range(N1):
            scores.append(np.dot(similarity_matrix[j,i,:],abcd))
        score_matrix[:,i]=scores
    ans = 0
    for i in range(N2):
        top_k = []
        for j in range(N1):
            top_k.append((j, score_matrix[j, i]))
        top_k.sort(key=lambda x: x[1], reverse=True)
        top_k = top_k[:k]
        for j in range(k):
            if top_k[j][0] == i:
                ans += 1
                break
    return ans

#### Approach 1: check different values of a,b,c,d iteratively

In [111]:
similarity_matrix = np.load('similarity_matrix.npy')
divisions = 10
maximum = 0
possibility = np.linspace(0, 1, divisions)
best = np.array([0, 0, 0, 0])
for abcd in itertools.product(possibility,possibility,possibility,possibility):
    if sum(abcd) == 0:
        continue
    cur_val = evaluate(abcd, 1)
    if cur_val>maximum:
        maximum=cur_val
        best=abcd
print(best, maximum)


(0.7777777777777777, 1.0, 0.3333333333333333, 0.1111111111111111) 144


#### Approach 2: check different values of a,b,c,d randomly

In [22]:

similarity_matrix = np.load('similarity_matrix.npy')
iter = 5
alpha = 0.01
k=20
final_abcd = np.array([0, 0, 0, 0])
final_score = 0
iter_score = []
for i in range(iter):
  abcd = np.random.rand(4)
  abcd[1]=0
  maximum = evaluate(abcd, k)
  for j in range(4):
    if(j==1):
      continue
    best_j = abcd[j]
    abcd[j]=0
    while(abcd[j]<=1):
      cur_val = evaluate(abcd, k)
      if cur_val>maximum:
          maximum=cur_val
          best_j=abcd[j]
      abcd[j]+=alpha
    abcd[j] = best_j
  iter_score.append([abcd, maximum])
  print(abcd, maximum)
  if maximum>final_score:
    final_score = maximum
    final_abcd = abcd
print(final_abcd, final_score)

# print(iter_score)


[0.98 0.   0.97 0.06] 255
[0.56 0.   0.49 0.09] 258
[0.95 0.   0.99 0.06] 255
[0.90344218 0.         0.93       0.06      ] 255
[0.99 0.   0.99 0.22] 256
[0.56 0.   0.49 0.09] 258


### LDA set for testing data

In [24]:
def train_lda_model(data):
    body_title = []
    for i in range(len(data)):
        body_title.append(
            (data.loc[i, 'PastQuesBody']+data.loc[i, 'PastQuesTitle']).split())
    id2word = corpora.Dictionary(body_title)

    corpus = []
    for text in body_title:
        corpus.append(id2word.doc2bow(text))
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus, id2word=id2word, num_topics=100, minimum_probability=0.0)

    final = []
    for i in range(len(data)):
        corpus_past = id2word.doc2bow(
            (data.loc[i, 'PastQuesBody']+data.loc[i, 'PastQuesTitle']).split())
        final.append(np.array(lda_model[corpus_past])[:, 1])
    return lda_model, final


In [36]:
cur_df = df[['PastQuesId', 'PastQuesTitle',
             'PastQuesBody', 'PastQuesTags']].loc[:2500]
trained_lda_model, trained_lda_data = train_lda_model(cur_df)
final_abcd = [0.56, 0 ,0.49 ,0.09]


## Top K Questions

In [38]:

# print(trained_lda_data)
def get_top_k(title, body, tags, k):
    title = TokStemming(hbt(title), 0)
    body = TokStemming(hbt(body), 0)
    tags = rmTags(tags)
    corp = id2word.doc2bow(body+title)
    cur_lda_score = np.array(trained_lda_model[corp])[:, 1]
    similarity_matrix = np.zeros([len(trained_lda_data), 4])
    final_scores = []
    for i in range(len(trained_lda_data)):
        similarity_matrix[i][0] = cosine_similarity(
            title, TokStemming(hbt(cur_df.loc[i, 'PastQuesTitle']), 0))
        similarity_matrix[i][1] = cosine_similarity(
            body, TokStemming(hbt(cur_df.loc[i, 'PastQuesBody']), 0))
        similarity_matrix[i][2] = cosine_similarity(
            tags, rmTags(cur_df.loc[i, 'PastQuesTags']))
        similarity_matrix[i][3] = np.dot(trained_lda_data[i], cur_lda_score)/(
            np.linalg.norm(trained_lda_data[i])*np.linalg.norm(cur_lda_score))
        if(np.isnan(similarity_matrix[i][3])):
            similarity_matrix[i][3] = 0
        final_scores.append([i, np.dot(similarity_matrix[i], final_abcd)])
    final_scores.sort(key=lambda x: x[1], reverse=True)
    return final_scores


In [26]:
def get_top_k_helper(title, body, tags, k):
    indices = get_top_k(title, body, tags, k)
    output = ''
    for i in range(len(indices)):
        output += "\n \n ----- Question " + str(i+1) + "----\n"
        output += "\n Score---> " + str(round(indices[i][1], 4))
        output += "\n Title ---> " + \
            str(df.loc[indices[i][0], 'PastQuesTitle'])
        output += "\n Body ---> " + str(df.loc[indices[i][0], 'PastQuesBody'])
        output += "\n Tags ---> " + str(df.loc[indices[i][0], 'PastQuesTags'])
        output += "\n---------------------------------------------------------------"
    return output


In [40]:
offset = 1000
count = 100
recall1 = 0
recall2 = 0
recall3 = 0
for i in range(offset, offset+count):
    ans = get_top_k(df.loc[i, 'DuplicateQuesTitle'], df.loc[i,
                    'DuplicateQuesBody'], df.loc[i, 'DuplicateQuesTags'], 5)
    ans1 = ans[0:5]
    ans2 = ans[0:10]
    ans3 = ans[0:20]
    for j in ans1:
        if j[0] == i:
            recall1 += 1
            break
    for j in ans2:
        if j[0] == i:
            recall2 += 1
            break
    for j in ans3:
        if j[0] == i:
            recall3 += 1
            break

print("Recall@5 = " + str(recall1/count))
print("Recall@10 = " + str(recall2/count))
print("Recall@20 = " + str(recall3/count))


Recall@5 = 0.4
Recall@10 = 0.49
Recall@20 = 0.59


## Example Question for Testing

In [29]:
title = 'Why would a JavaScript variable start with a dollar sign?'
body = "<p>I quite often see JavaScript with variables that start with a dollar sign. When/why would you choose to prefix a variable in this way?</p><p>(I'm not asking about <code>$('p.foo')</code> syntax that you see in jQuery and others, but normal variables like <code>$name</code> and <code>$order</code>)</p>"
tags = "<javascript><naming-conventions>"
final_abcd = [0.76, 0.98, 0.29, 0.07]

print(get_top_k_helper(title, body, tags, 20))



 
 ----- Question 1----

 Score---> 1.8843
 Title ---> Why would a JavaScript variable start with a dollar sign?
 Body ---> <p>I quite often see JavaScript with variables that start with a dollar sign. When/why would you choose to prefix a variable in this way?</p>

<p>(I'm not asking about <code>$('p.foo')</code> syntax that you see in jQuery and others, but normal variables like <code>$name</code> and <code>$order</code>)</p>

 Tags ---> <javascript><naming-conventions>
---------------------------------------------------------------
 
 ----- Question 2----

 Score---> 0.3066
 Title ---> JavaScript large number library?
 Body ---> <p>Is there a library available in javascript to handle large numbers with accuracy (sort of like what oracle does by storing the number as a string) and allows for operations like add/subtract/multiply/divide/mod/etc ?</p>

<p>Basically I need to add large numbers like 1234567890.1234567890 + 1234567890.987654321 and get a precise result.</p>

 Tags ---> <

## Interface

In [10]:
import tkinter as tk
from tkinter import *
from tkinter import ttk
from tkinter import filedialog as fd
from tkinter.messagebox import showinfo
from PIL import ImageTk, Image
import matplotlib.pyplot as plt
import tkinter.scrolledtext as scrolledtext
import PIL
import time


In [42]:

rootWindow = tk.Tk()
rootWindow.title('StackOverFlow Duplicate Question Detector')
rootWindow.resizable(True, True)
rootWindow.geometry('1920x1080')

my_frame = Frame(rootWindow, width=576, height=324, bg='black')
my_frame.pack(fill="both", expand=True)

# imgaddress = PIL.Image.open(
#     "/home/abhishek/Desktop/3-1/smai/StackOverFlow_Duplicate_Question_Detection/images/bg7.jpg")
# img = ImageTk.PhotoImage(imgaddress)
# label = Label(
#     my_frame,
#     image=img
# )
# label.place(x=0, y=0)

# global &/ initialisations
title, body, tag = "Title", "Body", "Tag"
titles = ["one", "two", "three"]
kval = 5
abcd = [0.8,0.51,0.37,0.01]
#######################

# changing the title and retrieving the title


def onChangeTitle(tite):
    global title, titles
    title = tite.widget.get()
    print("Title Entered:", title)

# changing the body and retrieving the body


def onChangeBody(bode):
    global body
    body = bode.widget.get()
    print("Body Entered:", body)

# changing the tag and retrieving the tag


def onChangetags(tags):
    global tag
    tag = tags.widget.get()
    print("Tags Entered:", tag)

# changing the k value and retrieving the k value


def onChangek(k):
    global kval
    kval = int(k.widget.get())
    print("K-value Entered:", kval)

# show (top-k questions) results in output section


def showResults():
    global title, body, tag
    print(title,body,tag)
    fs = get_top_k_helper(title, body, tag, kval)
    print(fs)
    txt.delete('1.0', END)
    txt.insert('insert', fs, "\n")


#=============Title Box =============#
L2 = Label(rootWindow, text="Enter the Title:", fg='black', bg='white')
L2.pack(side=LEFT)
L2.place(x=70, y=55)
EntryTitle = Entry(rootWindow, bd=2, textvariable=title)
EntryTitle.pack()
EntryTitle.place(x=200, y=50, width=1200, height=35)
EntryTitle.bind("<Return>", onChangeTitle)

#=============Body Box =============#
L3 = Label(rootWindow, text="Enter the Body:", fg='black', bg='white')
L3.pack(side=LEFT)
L3.place(x=70, y=110)
EntryBody = Entry(rootWindow, bd=2, textvariable=body)
EntryBody.pack()
EntryBody.place(x=200, y=105, width=1200, height=35)
EntryBody.bind("<Return>", onChangeBody)

# =============Tags Box =============#
L4 = Label(rootWindow, text="Enter the Tags:", fg='black', bg='white')
L4.pack(side=LEFT)
L4.place(x=70, y=165)
EntryTags = Entry(rootWindow, bd=2, textvariable=tag)
EntryTags.pack()
EntryTags.place(x=200, y=160, width=1200, height=35)
EntryTags.bind("<Return>", onChangetags)

# =============K-value Box =============#
L5 = Label(rootWindow, text="Enter k-value:", fg='black', bg='white')
L5.pack(side=LEFT)
L5.place(x=70, y=220)
ke = Entry(rootWindow, bd=2, textvariable=kval)
ke.pack()
ke.place(x=200, y=215, width=100, height=35)
ke.bind("<Return>", onChangek)

# ================submit button=================#
submitButton = Button(rootWindow, bg='black', fg='white',
                      text='Submit', command=showResults)
submitButton.pack(expand=True)
submitButton.place(x=950, y=245)

txt = scrolledtext.ScrolledText(
    rootWindow, undo=True, wrap='word', width=150, height=20)
txt['font'] = ('consolas', '12')
txt.pack(expand=True, fill='both')
txt.place(x=200, y=290)

rootWindow.mainloop()
