In [1]:
import re
import os
import copy
import math
import random
import string
import pathlib
import itertools

import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer

# Set the path to the datasets here.
base_path = pathlib.Path("topublish/")
tqdm.pandas()

In [2]:
import pickle
import pathlib


def load_text_file(filename):
    """Read ``filename`` as text and return its lines (line endings kept).

    Undecodable bytes are skipped (``errors="ignore"``). When the path does
    not exist, a notice is printed and ``None`` is returned instead.
    """
    if pathlib.Path(filename).exists():
        with open(filename, "r", errors="ignore") as fp:
            return list(fp)

    print("FILE NOT EXISTENT")
    return None


def save_pickle_file(data, filename):
    """Serialize ``data`` with pickle into ``filename`` (overwriting it)."""
    with open(filename, "wb") as fp:
        pickle.Pickler(fp).dump(data)


def load_pickle_file(filename):
    """Return the object unpickled from ``filename``.

    NOTE(review): unpickling executes code embedded in the file; only use on
    files this project produced.
    """
    with open(filename, "rb") as fp:
        return pickle.load(fp)

In [3]:
def split_text(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens


def load_corpus(lang="java", verbose=False):
    """Load ``<lang>/<lang>_qid2all.txt`` under ``base_path`` into a DataFrame.

    Each line is split on tabs and its fields are mapped, in order, to the
    columns ``qid``, ``title``, ``body`` and ``answer`` with surrounding
    whitespace stripped. A line with fewer than four fields simply yields a
    record without the missing keys. ``verbose`` turns the progress bar on.
    """
    column_names = ["qid", "title", "body", "answer"]
    corpus_path = base_path / pathlib.Path(f"{lang}/{lang}_qid2all.txt")

    rows = []
    for raw_line in tqdm(load_text_file(corpus_path), disable=not verbose):
        row = {}
        for name, text in zip(column_names, raw_line.split("\t")):
            row[name] = text.strip(string.whitespace)
        rows.append(row)

    return pd.DataFrame(rows)

def load_corpus1(lang="java", verbose=False):
    """Load ``<lang>/<lang>_cosidf.txt`` under ``base_path`` into a DataFrame.

    Each line is split on tabs and its fields are mapped, in order, to the
    columns ``qid1``, ``qid2``, ``score`` and ``label`` with surrounding
    whitespace stripped. Every line becomes a row, including the first one.
    ``verbose`` turns the progress bar on.
    """
    column_names = ["qid1", "qid2", "score", "label"]
    pairs_path = base_path / pathlib.Path(f"{lang}/{lang}_cosidf.txt")

    rows = []
    for raw_line in tqdm(load_text_file(pairs_path), disable=not verbose):
        row = {}
        for name, text in zip(column_names, raw_line.split("\t")):
            row[name] = text.strip(string.whitespace)
        rows.append(row)

    return pd.DataFrame(rows)

In [4]:
# Cap the number of columns pandas prints.
pd.options.display.max_columns = 10

# Read the Java question corpus (progress bar on) and show it.
java_corpus_dataframe = load_corpus("java", verbose=True)
print(java_corpus_dataframe)

100%|██████████| 700552/700552 [00:02<00:00, 322006.26it/s]


             qid                                              title  \
0        8388608  java swing update component content independently   
1       31424546            eclipse mars starts exit code using jdk   
2       23068676  get current timestamp string format java yyyy ...   
3        4194310  java string indexof handle regular expression ...   
4       31457289           efficient method updating observablelist   
...          ...                                                ...   
700547  33554430  save room reservation using angularjs spring h...   
700548  14680054  java regular expression escape regular expression   
700549  29360123                        server side redirect logged   
700550   8388604        without arguments proguard assume arguments   
700551  41943038                         confused integer char type   

                                                     body  \
0       could update content several visible one time ...   
1       plan moving eclip

In [5]:
#getting required jsons for java
import json

# The corpus is spread over this many bulk files (java1.json ... java6.json).
java_bulk_file_count = 6
java_doc_count = len(java_corpus_dataframe)

for file_number in range(1, java_bulk_file_count + 1):
    # Split on whole documents so an action line is never separated from its
    # source line, whatever the corpus size. The original split the flat line
    # list and relied on hand-placed "-1" adjustments tuned to one row count.
    first_doc = (file_number - 1) * java_doc_count // java_bulk_file_count
    last_doc = file_number * java_doc_count // java_bulk_file_count
    chunk = java_corpus_dataframe.iloc[first_doc:last_doc]

    # "w" rather than "a": re-running the cell must not duplicate every line.
    with open(f'java{file_number}.json', 'w') as outfile:
        for qid, title, body, answer in zip(chunk["qid"], chunk["title"], chunk["body"], chunk["answer"]):
            # Bulk format: one action line followed by one document line.
            json.dump({"index": {"_id": qid}}, outfile)
            outfile.write('\n')
            json.dump({"title": title, "body": body, "answer": answer}, outfile)
            outfile.write('\n')


In [6]:
# Loader for the Python question corpus used to build the bulk JSON files.
def load_corpus12(lang="python", verbose=False):
    """Load ``<lang>/<lang>_qid2all.txt`` under ``base_path`` into a DataFrame.

    Tab-separated fields of each line are mapped, in order, to ``qid``,
    ``title``, ``body`` and ``answer`` with surrounding whitespace stripped.
    ``verbose`` turns the progress bar on.
    """
    column_names = ["qid", "title", "body", "answer"]
    corpus_path = base_path / pathlib.Path(f"{lang}/{lang}_qid2all.txt")

    rows = []
    for raw_line in tqdm(load_text_file(corpus_path), disable=not verbose):
        row = {}
        for name, text in zip(column_names, raw_line.split("\t")):
            row[name] = text.strip(string.whitespace)
        rows.append(row)

    return pd.DataFrame(rows)

In [7]:
# Cap the number of columns pandas prints.
pd.options.display.max_columns = 10

# Read the Python question corpus (progress bar on) and show it.
python_corpus_dataframe = load_corpus12("python", verbose=True)
print(python_corpus_dataframe)

100%|██████████| 485827/485827 [00:01<00:00, 384492.44it/s]


             qid                                              title  \
0       27787264  pandas query throws error column name starts n...   
1       35651586                      psycopg cursor already closed   
2       30342145                    redemption setting display name   
3       45088774            removing words numbers text file python   
4       30408713                 python using x values draw picture   
...          ...                                                ...   
485822   3145720              accumulate state across tests py test   
485823  37748729                                 use tkinter method   
485824  19922939                       finding middle three numbers   
485825  35362859                   python convert string list split   
485826   8388607             way make work vim reindent python code   

                                                     body  \
0       trying perform query following dataframe works...   
1       using psycopg bun

In [8]:
#getting required jsons for python
import json

# The corpus is spread over this many bulk files (python1t.json ... python4t.json).
python_bulk_file_count = 4
python_doc_count = len(python_corpus_dataframe)

for file_number in range(1, python_bulk_file_count + 1):
    # Split on whole documents so an action line is never separated from its
    # source line, whatever the corpus size. The original split the flat line
    # list and relied on hand-placed "-1" adjustments tuned to one row count.
    first_doc = (file_number - 1) * python_doc_count // python_bulk_file_count
    last_doc = file_number * python_doc_count // python_bulk_file_count
    chunk = python_corpus_dataframe.iloc[first_doc:last_doc]

    # "w" rather than "a": re-running the cell must not duplicate every line.
    with open(f'python{file_number}t.json', 'w') as outfile:
        for qid, title, body, answer in zip(chunk["qid"], chunk["title"], chunk["body"], chunk["answer"]):
            # Bulk format: one action line followed by one document line.
            json.dump({"index": {"_id": qid}}, outfile)
            outfile.write('\n')
            json.dump({"title": title, "body": body, "answer": answer}, outfile)
            outfile.write('\n')




In [9]:
# Loader for the JavaScript question corpus used to build the bulk JSON files.
def load_corpus13(lang="javascript", verbose=False):
    """Load ``<lang>/<lang>_qid2all.txt`` under ``base_path`` into a DataFrame.

    Tab-separated fields of each line are mapped, in order, to ``qid``,
    ``title``, ``body`` and ``answer`` with surrounding whitespace stripped.
    ``verbose`` turns the progress bar on.
    """
    column_names = ["qid", "title", "body", "answer"]
    corpus_path = base_path / pathlib.Path(f"{lang}/{lang}_qid2all.txt")

    rows = []
    for raw_line in tqdm(load_text_file(corpus_path), disable=not verbose):
        row = {}
        for name, text in zip(column_names, raw_line.split("\t")):
            row[name] = text.strip(string.whitespace)
        rows.append(row)

    return pd.DataFrame(rows)

In [10]:
# Cap the number of columns pandas prints.
pd.set_option("display.max_columns", 10)

# Use the JavaScript loader defined for this purpose; the original called the
# Python-named load_corpus12 and only worked because `lang` was overridden.
javascript_corpus_dataframe = load_corpus13(lang="javascript", verbose=True)
print(javascript_corpus_dataframe)

100%|██████████| 1319382/1319382 [00:04<00:00, 282634.50it/s]


              qid                                              title  \
0        47457739                   delay checkform using javascript   
1        23068675             bind function elements dom page loaded   
2        44040197  calling javascript class method another method...   
3        30307229                              html calculate button   
4        25165834                    ajax call controller method mvc   
...           ...                                                ...   
1319377  44734296        regex allow capital letters beginning words   
1319378  37748727  javascript moving image created programmatical...   
1319379   6640980  click button html page direct user another htm...   
1319380  41943034         using switch statement create new variable   
1319381   8388605                    preventing js file called twice   

                                                      body  \
0        tried delay sec submit form like work still re...   
1        co

In [11]:
# getting required jsons for javascript
import json

# The corpus is spread over this many bulk files (javascript1.json ... javascript11.json).
javascript_bulk_file_count = 11
javascript_doc_count = len(javascript_corpus_dataframe)

for file_number in range(1, javascript_bulk_file_count + 1):
    # Split on whole documents so an action line is never separated from its
    # source line, whatever the corpus size. The original split the flat line
    # list and relied on hand-placed "-1" adjustments tuned to one row count.
    first_doc = (file_number - 1) * javascript_doc_count // javascript_bulk_file_count
    last_doc = file_number * javascript_doc_count // javascript_bulk_file_count
    chunk = javascript_corpus_dataframe.iloc[first_doc:last_doc]

    # "w" rather than "a": re-running the cell must not duplicate every line.
    with open(f'javascript{file_number}.json', 'w') as outfile:
        for qid, title, body, answer in zip(chunk["qid"], chunk["title"], chunk["body"], chunk["answer"]):
            # Bulk format: one action line followed by one document line.
            json.dump({"index": {"_id": qid}}, outfile)
            outfile.write('\n')
            json.dump({"title": title, "body": body, "answer": answer}, outfile)
            outfile.write('\n')

In [12]:
# Push every generated JSON file to its index with the bulk API, one file at a time, from a terminal, e.g.:
#   curl -s -H "Content-Type: application/json" -XPOST http://localhost:9200/index_name/_bulk --data-binary @/Users/aaryamantpkatoch/Desktop/jsons/java6.json

In [13]:
# Load the Java cosidf pairs file (qid1, qid2, score, label) and show it.
pd.options.display.max_columns = 10

javacosidf_corpus_dataframe = load_corpus1("java", verbose=True)
print(javacosidf_corpus_dataframe)

100%|██████████| 253441/253441 [00:00<00:00, 739358.75it/s]


            qid1      qid2           score  label
0           qid1      qid2           score  label
1        4252472  15194804  0.941838972542      0
2        4252472  18264178  0.801707464484      0
3        4252472  16225177  0.801707464484      1
4        4252472  16445238  0.769576496381      0
...          ...       ...             ...    ...
253436  15615881  18456100  0.537130501589      0
253437  15615881  31407382  0.536373912746      0
253438  15615881  26271786  0.532262831449      0
253439  15615881  11008033  0.524612763032      0
253440  15615881  42065871  0.522841280226      0

[253441 rows x 4 columns]


In [14]:
#fixing the index
# The cosidf file starts with a header line that load_corpus1 parsed as row 0.
# Drop row 0 only while it is still that non-numeric header, so re-running the
# cell no longer removes a real data row each time.
if not javacosidf_corpus_dataframe["qid1"].iloc[0].isdigit():
    javacosidf_corpus_dataframe = javacosidf_corpus_dataframe.drop(labels=0, axis=0).reset_index(drop=True)

In [15]:
# Show the pairs table again after the header-row cleanup in the previous cell.
print(javacosidf_corpus_dataframe)

            qid1      qid2           score label
0        4252472  15194804  0.941838972542     0
1        4252472  18264178  0.801707464484     0
2        4252472  16225177  0.801707464484     1
3        4252472  16445238  0.769576496381     0
4        4252472  17233226  0.720987113025     0
...          ...       ...             ...   ...
253435  15615881  18456100  0.537130501589     0
253436  15615881  31407382  0.536373912746     0
253437  15615881  26271786  0.532262831449     0
253438  15615881  11008033  0.524612763032     0
253439  15615881  42065871  0.522841280226     0

[253440 rows x 4 columns]


In [16]:
conda install elasticsearch -y


Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Retrieving notices: ...working... done

Note: you may need to restart the kernel to use updated packages.


In [17]:
from elasticsearch import Elasticsearch
# Client created with the library's default connection settings (no hosts passed).
# NOTE(review): presumably the local node at localhost:9200 targeted by the bulk upload — confirm.
es=Elasticsearch()

In [19]:
#get all the indexes 
index_list = []

for index in es.indices.get('*'):
    # The original `"0" and "." not in index` only ever tested for ".", because
    # the non-empty literal "0" is always truthy. Test both substrings.
    if "0" not in index and "." not in index:
        index_list.append(index)

index_list

['javabm',
 'javadlm',
 'javascriptbm',
 'javascriptdlm',
 'javascripttf',
 'javatf',
 'pythonbm',
 'pythondlm',
 'pythontf',
 'test']

In [20]:
# Distinct qid1 values of the Java pairs table, in order of first appearance.
each_qid1_cosidf = pd.unique(javacosidf_corpus_dataframe["qid1"])
print(each_qid1_cosidf)

['4252472' '1662766' '3302177' ... '15405288' '22932451' '15615881']


In [21]:
#define ranking function
def ranking(qid1,qid1_title,ratings):
    """Build the request body for one rank-evaluation query.

    The query excludes the document whose ``_id`` matches ``qid1`` and scores
    the remaining documents by matching ``qid1_title`` against the ``title``
    (boost 3.0), ``body`` (boost 0.5) and ``answer`` (boost 0.5) fields with
    the ``my_analyzer`` analyzer. ``ratings`` is attached unchanged, and the
    metric is normalized DCG at k=10.

    Returns the request body as a nested dict.
    """
    field_boosts = (("title", 3.0), ("body", 0.5), ("answer", 0.5))
    should_clauses = [
        {"match": {field: {"query": qid1_title, "boost": boost, "analyzer": "my_analyzer"}}}
        for field, boost in field_boosts
    ]
    bool_query = {
        "must_not": {"match": {"_id": qid1}},
        "should": should_clauses,
    }
    request_entry = {
        "id": str(qid1),
        "request": {"query": {"bool": bool_query}},
        "ratings": ratings,
    }
    return {
        "requests": [request_entry],
        "metric": {"dcg": {"k": 10, "normalize": True}},
    }
        
                                    

In [22]:
#java bm25
#create ratings json for each qid1
import json

# One pass over the pairs table collects, per qid1, its first 30 rated rows.
# The original rescanned every row once per qid1, which is quadratic.
ratings_by_qid1 = {}
for qid1, qid2, label in zip(javacosidf_corpus_dataframe["qid1"],
                             javacosidf_corpus_dataframe["qid2"],
                             javacosidf_corpus_dataframe["label"]):
    bucket = ratings_by_qid1.setdefault(qid1, [])
    if len(bucket) < 30:
        bucket.append({"_index":"javabm","_id":qid2,"rating":int(label)})

# One ratings file per query id, written as a JSON array with one entry per line.
for k in each_qid1_cosidf:
    ratings = ratings_by_qid1.get(k, [])
    with open(f'{k}.json', 'w') as fp:
        fp.write(
            '[\n' +
            ',\n'.join(json.dumps(l) for l in ratings) +
            '\n]\n')
   


In [23]:
#java bm25
#get result and ndcg
import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
ndcg_list=[];result_list=[]
for i in each_qid1_cosidf:
    # The query text is the title of the indexed question itself.
    qid1_title=es.get("javabm", doc_type='_doc', id=i)['_source']['title']

    # `with` closes each ratings file; the original left one handle open per query.
    with open(f'{i}.json') as f:
        data = json.load(f)
    _search = ranking(i, qid1_title, data) # Figure 2
    result = es.rank_eval(index="javabm",body=_search)
    result_list.append(result)

    ndcg = result['metric_score']
    ndcg_list.append(ndcg)

print(ndcg_list)

[0.38685280723454163, 1.0, 1.0, 0.3333333333333333, 0.6309297535714574, 1.0, 0.0, 0.6309297535714574, 0.6309297535714574, 0.3562071871080222, 0.0, 0.7122630665145961, 0.6309297535714574, 0.0, 0.19342640361727081, 0.0, 1.0, 1.0, 1.0, 0.5, 0.0, 0.3706656904013185, 1.0, 0.0, 0.6309297535714574, 0.0, 1.0, 1.0, 1.0, 0.3154648767857287, 0.30102999566398114, 0.6309297535714574, 1.0, 0.38685280723454163, 0.5706417189553201, 0.30102999566398114, 0.5, 0.0, 0.0, 0.0, 0.6309297535714574, 0.6309297535714574, 0.0, 0.5, 0.0, 0.0, 1.0, 1.0, 0.0, 0.3562071871080222, 0.0, 0.5, 0.0, 0.6309297535714574, 0.0, 0.0, 0.5, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.38685280723454163, 1.0, 0.3154648767857287, 1.0, 1.0, 0.38685280723454163, 0.7039180890341348, 0.0, 1.0, 0.0, 1.0, 0.6131471927654585, 0.0, 1.0, 0.0, 0.3562071871080222, 0.2184074368181642, 0.5, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.6131471927654585, 1.0, 1.0, 0.0, 0.0, 0.43067655807339306, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.3562

In [24]:
# java bm25
# Save the raw rank-eval responses as a JSON array, one response per line.
import json

with open('result_list_java_bm25.json', 'w') as fp:
    fp.writelines(['[\n', ',\n'.join(map(json.dumps, result_list)), '\n]\n'])
   

In [25]:
# java bm25
# Save the per-query scores as a JSON array on a single comma-separated line.
import json

with open("ndcg_score_java_bm25.json", 'w') as fp:
    fp.writelines(['[\n', ','.join(map(json.dumps, ndcg_list)), '\n]'])


In [26]:
#java bm25
#calculate avg score for java bm25
# Accumulate under a dedicated name: the original bound the total to `sum`,
# which shadowed the builtin for the rest of the kernel session.
ndcg_total=0
for i in ndcg_list:
    ndcg_total=ndcg_total+i
average_ndcg=ndcg_total/len(ndcg_list)
print(average_ndcg)

0.4261403376059393


In [27]:
#java tf-idf
#create ratings json for each qid1
import json

# One pass over the pairs table collects, per qid1, its first 30 rated rows.
# The original rescanned every row once per qid1, which is quadratic.
ratings_by_qid1 = {}
for qid1, qid2, label in zip(javacosidf_corpus_dataframe["qid1"],
                             javacosidf_corpus_dataframe["qid2"],
                             javacosidf_corpus_dataframe["label"]):
    bucket = ratings_by_qid1.setdefault(qid1, [])
    if len(bucket) < 30:
        bucket.append({"_index":"javatf","_id":qid2,"rating":int(label)})

# One ratings file per query id, written as a JSON array with one entry per line.
for k in each_qid1_cosidf:
    ratings = ratings_by_qid1.get(k, [])
    with open(f'{k}tf.json', 'w') as fp:
        fp.write(
            '[\n' +
            ',\n'.join(json.dumps(l) for l in ratings) +
            '\n]\n')

In [28]:
#java tf-idf
#get result and ndcg
import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
ndcg_list_tf=[];result_list_tf=[]
for i in each_qid1_cosidf:
    # The query text is the title of the indexed question itself.
    qid1_title=es.get("javatf", doc_type='_doc', id=i)['_source']['title']

    # `with` closes each ratings file; the original left one handle open per query.
    with open(f'{i}tf.json') as f:
        data = json.load(f)
    _search = ranking(i, qid1_title, data) # Figure 2
    result = es.rank_eval(index="javatf",body=_search)
    result_list_tf.append(result)

    ndcg = result['metric_score']
    ndcg_list_tf.append(ndcg)

print(ndcg_list_tf) 

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2640681225725909, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.43067655807339306, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [29]:
#java tf-idf
#calculate avg score for java tf-idf
# Accumulate under a dedicated name: the original bound the total to `sum`,
# which shadowed the builtin for the rest of the kernel session.
ndcg_total=0
for i in ndcg_list_tf:
    ndcg_total=ndcg_total+i
average_ndcg_tf=ndcg_total/len(ndcg_list_tf)
print(average_ndcg_tf)

0.012715244110966339


In [30]:
# java tf-idf
# Save the raw rank-eval responses as a JSON array, one response per line.
import json

with open('result_list_java_tf.json', 'w') as fp:
    fp.writelines(['[\n', ',\n'.join(map(json.dumps, result_list_tf)), '\n]\n'])

In [31]:
# java tf-idf
# Save the per-query scores as a JSON array, one score per line.
import json

with open('ndcg_list_java_tfidf.json', 'w') as fp:
    fp.writelines(['[\n', ',\n'.join(map(json.dumps, ndcg_list_tf)), '\n]\n'])

In [32]:
#java Dirichlet
#create ratings json for each qid1
import json

# One pass over the pairs table collects, per qid1, its first 30 rated rows.
# The original rescanned every row once per qid1, which is quadratic.
ratings_by_qid1 = {}
for qid1, qid2, label in zip(javacosidf_corpus_dataframe["qid1"],
                             javacosidf_corpus_dataframe["qid2"],
                             javacosidf_corpus_dataframe["label"]):
    bucket = ratings_by_qid1.setdefault(qid1, [])
    if len(bucket) < 30:
        bucket.append({"_index":"javadlm","_id":qid2,"rating":int(label)})

# One ratings file per query id, written as a JSON array with one entry per line.
for k in each_qid1_cosidf:
    ratings = ratings_by_qid1.get(k, [])
    with open(f'{k}dlmjava.json', 'w') as fp:
        fp.write(
            '[\n' +
            ',\n'.join(json.dumps(l) for l in ratings) +
            '\n]\n')

KeyboardInterrupt: 

In [None]:
#java Dirichlet
#get result and ndcg

import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
ndcg_list_java_lmd=[];result_list_java_lmd=[]
for i in each_qid1_cosidf:
    # The query text is the title of the indexed question itself.
    qid1_title=es.get("javadlm", doc_type='_doc', id=i)['_source']['title']

    # `with` closes each ratings file; the original left one handle open per query.
    with open(f'{i}dlmjava.json') as f:
        data = json.load(f)
    _search = ranking(i, qid1_title, data) # Figure 2
    result = es.rank_eval(index="javadlm",body=_search)
    result_list_java_lmd.append(result)

    ndcg = result['metric_score']
    ndcg_list_java_lmd.append(ndcg)

print(ndcg_list_java_lmd) 

In [None]:
# java Dirichlet
# Save the raw rank-eval responses as a JSON array, one response per line.
import json

with open('result_list_java_DLM.json', 'w') as fp:
    fp.writelines(['[\n', ',\n'.join(map(json.dumps, result_list_java_lmd)), '\n]\n'])

In [None]:
# java Dirichlet
# Save the per-query scores as a JSON array on a single comma-separated line.
import json

with open("ndcg_score_java_DLM.json", 'w') as fp:
    fp.writelines(['[\n', ','.join(map(json.dumps, ndcg_list_java_lmd)), '\n]'])

In [None]:
# get average ndcg score for java Dirichlet

# Accumulate under a dedicated name: the original bound the total to `sum`,
# which shadowed the builtin for the rest of the kernel session.
ndcg_total=0
for i in ndcg_list_java_lmd:
    ndcg_total=ndcg_total+i
average_ndcg_lmd=ndcg_total/len(ndcg_list_java_lmd)
print(average_ndcg_lmd)

In [None]:
#javascript

In [None]:
###javascript
def load_corpus2(lang="javascript", verbose=False):
    """Load ``<lang>/<lang>_cosidf.txt`` under ``base_path`` into a DataFrame.

    Tab-separated fields of each line are mapped, in order, to ``qid1``,
    ``qid2``, ``score`` and ``label`` with surrounding whitespace stripped.
    Every line becomes a row, including the first one. ``verbose`` turns the
    progress bar on.
    """
    column_names = ["qid1", "qid2", "score", "label"]
    pairs_path = base_path / pathlib.Path(f"{lang}/{lang}_cosidf.txt")

    rows = []
    for raw_line in tqdm(load_text_file(pairs_path), disable=not verbose):
        row = {}
        for name, text in zip(column_names, raw_line.split("\t")):
            row[name] = text.strip(string.whitespace)
        rows.append(row)

    return pd.DataFrame(rows)

In [None]:
# Load the JavaScript cosidf pairs file (qid1, qid2, score, label) and show it.
pd.options.display.max_columns = 10

javascriptcosidf_corpus_dataframe = load_corpus2("javascript", verbose=True)
print(javascriptcosidf_corpus_dataframe)

In [None]:
# Row 0 is dropped as a header line that load_corpus2 parsed as data.
# NOTE(review): assumes the file starts with a non-numeric header row and that real qid1 values are numeric — confirm.
# Drop it only while row 0 is still non-numeric, so re-running the cell no
# longer removes a real data row each time.
if not javascriptcosidf_corpus_dataframe["qid1"].iloc[0].isdigit():
    javascriptcosidf_corpus_dataframe = javascriptcosidf_corpus_dataframe.drop(labels=0, axis=0).reset_index(drop=True)

In [None]:
# Show the pairs table again after the row-0 cleanup in the previous cell.
print(javascriptcosidf_corpus_dataframe)

In [None]:
# Distinct qid1 values of the JavaScript pairs table, in order of first appearance.
each_qid1_cosidf_javascript = pd.unique(javascriptcosidf_corpus_dataframe["qid1"])
print(each_qid1_cosidf_javascript)

In [None]:
# javascript bm25
#get ratings json for each qid1
import json

# One pass over the pairs table collects, per qid1, its first 30 rated rows.
# The original rescanned every row once per qid1, which is quadratic.
ratings_by_qid1 = {}
for qid1, qid2, label in zip(javascriptcosidf_corpus_dataframe["qid1"],
                             javascriptcosidf_corpus_dataframe["qid2"],
                             javascriptcosidf_corpus_dataframe["label"]):
    bucket = ratings_by_qid1.setdefault(qid1, [])
    if len(bucket) < 30:
        bucket.append({"_index":"javascriptbm","_id":qid2,"rating":int(label)})

# One ratings file per query id, written as a JSON array with one entry per line.
for k in each_qid1_cosidf_javascript:
    ratings = ratings_by_qid1.get(k, [])
    with open(f'{k}bmjavascript.json', 'w') as fp:
        fp.write(
            '[\n' +
            ',\n'.join(json.dumps(l) for l in ratings) +
            '\n]\n')

In [None]:
# javascript bm25
#get result and ndcg scores for each qid1
import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
ndcg_list_javascript_bm=[];result_list_javascript_bm=[]
for i in each_qid1_cosidf_javascript:
    # The query text is the title of the indexed question itself.
    qid1_title=es.get("javascriptbm", doc_type='_doc', id=i)['_source']['title']

    # `with` closes each ratings file; the original left one handle open per query.
    with open(f'{i}bmjavascript.json') as f:
        data = json.load(f)
    _search = ranking(i, qid1_title, data) # Figure 2
    result = es.rank_eval(index="javascriptbm",body=_search)
    result_list_javascript_bm.append(result)

    ndcg = result['metric_score']
    ndcg_list_javascript_bm.append(ndcg)

print(ndcg_list_javascript_bm) 

In [None]:
# javascript bm25
# Save the raw rank-eval responses as a JSON array, one response per line.
import json

with open('result_list_javascript_bm.json', 'w') as fp:
    fp.writelines(['[\n', ',\n'.join(map(json.dumps, result_list_javascript_bm)), '\n]\n'])

In [None]:
# javascript bm25
# Save the per-query scores as a JSON array on a single comma-separated line.
import json

with open("ndcg_score_javascript_bm.json", 'w') as fp:
    fp.writelines(['[\n', ','.join(map(json.dumps, ndcg_list_javascript_bm)), '\n]'])

In [None]:
# javascript bm25
#get average ndcg score
# Accumulate under a dedicated name: the original bound the total to `sum`,
# which shadowed the builtin for the rest of the kernel session.
ndcg_total=0
for i in ndcg_list_javascript_bm:
    ndcg_total=ndcg_total+i
average_ndcg_js_bm=ndcg_total/len(ndcg_list_javascript_bm)
print(average_ndcg_js_bm)

In [None]:
#javascript tfidf
#get ratings json for each qid1
import json

# One pass over the pairs table collects, per qid1, its first 30 rated rows.
# The original rescanned every row once per qid1, which is quadratic.
ratings_by_qid1 = {}
for qid1, qid2, label in zip(javascriptcosidf_corpus_dataframe["qid1"],
                             javascriptcosidf_corpus_dataframe["qid2"],
                             javascriptcosidf_corpus_dataframe["label"]):
    bucket = ratings_by_qid1.setdefault(qid1, [])
    if len(bucket) < 30:
        bucket.append({"_index":"javascripttf","_id":qid2,"rating":int(label)})

# One ratings file per query id, written as a JSON array with one entry per line.
for k in each_qid1_cosidf_javascript:
    ratings = ratings_by_qid1.get(k, [])
    with open(f'{k}tfjavascript.json', 'w') as fp:
        fp.write(
            '[\n' +
            ',\n'.join(json.dumps(l) for l in ratings) +
            '\n]\n')

In [None]:
# javascript tfidf
#get result and ndcg scores for each qid1

import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
ndcg_list_javascript_tf=[];result_list_javascript_tf=[]
for i in each_qid1_cosidf_javascript:
    # The query text is the title of the indexed question itself.
    qid1_title=es.get("javascripttf", doc_type='_doc', id=i)['_source']['title']

    # `with` closes each ratings file; the original left one handle open per query.
    with open(f'{i}tfjavascript.json') as f:
        data = json.load(f)
    _search = ranking(i, qid1_title, data) # Figure 2
    result = es.rank_eval(index="javascripttf",body=_search)
    result_list_javascript_tf.append(result)

    ndcg = result['metric_score']
    ndcg_list_javascript_tf.append(ndcg)

print(ndcg_list_javascript_tf)

In [None]:
# javascript tfidf
# Save the raw rank-eval responses as a JSON array, one response per line.
import json

with open('result_list_javascript_tf.json', 'w') as fp:
    fp.writelines(['[\n', ',\n'.join(map(json.dumps, result_list_javascript_tf)), '\n]\n'])

In [None]:
# javascript tfidf
# Save the per-query scores as a JSON array on a single comma-separated line.
import json

with open("ndcg_score_javascript_tf.json", 'w') as fp:
    fp.writelines(['[\n', ','.join(map(json.dumps, ndcg_list_javascript_tf)), '\n]'])

In [None]:
# javascript tfidf
#get average ndcg score
# Accumulate under a dedicated name: the original bound the total to `sum`,
# which shadowed the builtin for the rest of the kernel session.
ndcg_total=0
for i in ndcg_list_javascript_tf:
    ndcg_total=ndcg_total+i
average_ndcg_js_tf=ndcg_total/len(ndcg_list_javascript_tf)
print(average_ndcg_js_tf)

In [None]:
#javascript Dirichlet
#get ratings json for each qid1
import json

# One pass over the pairs table collects, per qid1, its first 30 rated rows.
# The original rescanned every row once per qid1, which is quadratic.
ratings_by_qid1 = {}
for qid1, qid2, label in zip(javascriptcosidf_corpus_dataframe["qid1"],
                             javascriptcosidf_corpus_dataframe["qid2"],
                             javascriptcosidf_corpus_dataframe["label"]):
    bucket = ratings_by_qid1.setdefault(qid1, [])
    if len(bucket) < 30:
        bucket.append({"_index":"javascriptdlm","_id":qid2,"rating":int(label)})

# One ratings file per query id, written as a JSON array with one entry per line.
for k in each_qid1_cosidf_javascript:
    ratings = ratings_by_qid1.get(k, [])
    with open(f'{k}dlmjavascript.json', 'w') as fp:
        fp.write(
            '[\n' +
            ',\n'.join(json.dumps(l) for l in ratings) +
            '\n]\n')

In [None]:
# javascript Dirichlet
#get result and ndcg scores for each qid1

import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
ndcg_list_javascript_dlm=[];result_list_javascript_dlm=[]
for i in each_qid1_cosidf_javascript:
    # The query text is the title of the indexed question itself.
    qid1_title=es.get("javascriptdlm", doc_type='_doc', id=i)['_source']['title']

    # `with` closes each ratings file; the original left one handle open per query.
    with open(f'{i}dlmjavascript.json') as f:
        data = json.load(f)
    _search = ranking(i, qid1_title, data) # Figure 2
    result = es.rank_eval(index="javascriptdlm",body=_search)
    result_list_javascript_dlm.append(result)

    ndcg = result['metric_score']
    ndcg_list_javascript_dlm.append(ndcg)

print(ndcg_list_javascript_dlm)

In [None]:
# javascript Dirichlet
# Save the raw rank-eval responses as a JSON array, one response per line.
import json

with open('result_list_javascript_dlm.json', 'w') as fp:
    fp.writelines(['[\n', ',\n'.join(map(json.dumps, result_list_javascript_dlm)), '\n]\n'])

In [None]:
# javascript Dirichlet
# Save the per-query scores as a JSON array on a single comma-separated line.
import json

with open("ndcg_score_javascript_dlm.json", 'w') as fp:
    fp.writelines(['[\n', ','.join(map(json.dumps, ndcg_list_javascript_dlm)), '\n]'])

In [None]:
# javascript Dirichlet
#get average ndcg score
# Accumulate under a dedicated name: the original bound the total to `sum`,
# which shadowed the builtin for the rest of the kernel session.
ndcg_total=0
for i in ndcg_list_javascript_dlm:
    ndcg_total=ndcg_total+i
average_ndcg_js_dlm=ndcg_total/len(ndcg_list_javascript_dlm)
print(average_ndcg_js_dlm)

In [None]:
#python


In [None]:
def load_corpus3(lang="python", verbose=False):
    """Load ``<lang>/<lang>_cosidf.txt`` under ``base_path`` into a DataFrame.

    Tab-separated fields of each line are mapped, in order, to ``qid1``,
    ``qid2``, ``score`` and ``label`` with surrounding whitespace stripped.
    Every line becomes a row, including the first one. ``verbose`` turns the
    progress bar on.
    """
    column_names = ["qid1", "qid2", "score", "label"]
    pairs_path = base_path / pathlib.Path(f"{lang}/{lang}_cosidf.txt")

    rows = []
    for raw_line in tqdm(load_text_file(pairs_path), disable=not verbose):
        row = {}
        for name, text in zip(column_names, raw_line.split("\t")):
            row[name] = text.strip(string.whitespace)
        rows.append(row)

    return pd.DataFrame(rows)


In [None]:
# Load the Python cosidf pairs file (qid1, qid2, score, label) and show it.
pd.options.display.max_columns = 10

pythoncosidf_corpus_dataframe = load_corpus3("python", verbose=True)
print(pythoncosidf_corpus_dataframe)

In [None]:
#fixing corpus index
# Row 0 is dropped as a header line that load_corpus3 parsed as data.
# NOTE(review): assumes the file starts with a non-numeric header row and that real qid1 values are numeric — confirm.
# Drop it only while row 0 is still non-numeric, so re-running the cell no
# longer removes a real data row each time.
if not pythoncosidf_corpus_dataframe["qid1"].iloc[0].isdigit():
    pythoncosidf_corpus_dataframe = pythoncosidf_corpus_dataframe.drop(labels=0, axis=0).reset_index(drop=True)

In [None]:
# Show the pairs table again after the row-0 cleanup in the previous cell.
print(pythoncosidf_corpus_dataframe)

In [None]:
# Distinct qid1 values of the Python pairs table, in order of first appearance.
each_qid1_cosidf_python = pd.unique(pythoncosidf_corpus_dataframe["qid1"])
print(each_qid1_cosidf_python)

In [None]:
#python bm25
# Write one ratings file per query id (qid1): a JSON array of
# {"_index", "_id", "rating"} objects, one object per line, built from the
# qid2 / label columns of the rows that carry that qid1.
import json

# Bucket the rows by qid1 in a single pass instead of rescanning the whole
# frame in Python for every query id. groupby keeps the original row order
# inside each group, and head(30) keeps the same "first 30 matches" cap.
first_rows_by_qid1 = {
    qid1: rows.head(30)
    for qid1, rows in pythoncosidf_corpus_dataframe.groupby("qid1", sort=False)
}

for k in each_qid1_cosidf_python:
    rows = first_rows_by_qid1.get(k)
    if rows is None:
        # No row carries this qid1: emit an empty ratings list, as a full scan would.
        ratings = []
    else:
        ratings = [
            {"_index": "pythonbm", "_id": qid2, "rating": int(label)}
            for qid2, label in zip(rows["qid2"], rows["label"])
        ]
    with open(f'{k}bmpython.json', 'w') as fp:
        fp.write(
            '[\n' +
            ',\n'.join(json.dumps(l) for l in ratings) +
            '\n]\n')

In [None]:
# python bm25
# For every query id (qid1): fetch its title from the "pythonbm" index, load
# the ratings file written for it, run Elasticsearch rank_eval and keep both
# the full response and its 'metric_score'.
# NOTE(review): ranking() is defined elsewhere in this notebook; presumably it
# builds the rank_eval request body - confirm.
import json
from elasticsearch import Elasticsearch

es = Elasticsearch()
ndcg_list_python_bm = []
result_list_python_bm = []
for i in each_qid1_cosidf_python:
    qid1_title = es.get("pythonbm", doc_type='_doc', id=i)['_source']['title']

    # Context manager so the ratings file is closed right after it is parsed;
    # previously the handle was opened once per query and never closed.
    with open(f'{i}bmpython.json') as f:
        data = json.load(f)

    _search = ranking(i, qid1_title, data) # Figure 2
    result = es.rank_eval(index="pythonbm", body=_search)
    result_list_python_bm.append(result)

    ndcg = result['metric_score']
    ndcg_list_python_bm.append(ndcg)

print(ndcg_list_python_bm)

In [None]:
# python bm25
# Persist every collected rank_eval response to disk as a JSON array,
# one serialized response per line.
import json
with open('result_list_python_bm.json', 'w') as out_file:
    out_file.write("".join((
        '[\n',
        ',\n'.join(map(json.dumps, result_list_python_bm)),
        '\n]\n',
    )))

In [None]:
# python bm25
# Persist the per-query ndcg scores as a JSON array: all scores on a single
# comma-separated line between the brackets, no trailing newline.
import json
with open("ndcg_score_python_bm.json", 'w') as out_file:
    out_file.write("".join((
        '[\n',
        ','.join(map(json.dumps, ndcg_list_python_bm)),
        '\n]',
    )))

In [None]:
# python bm25
# Average ndcg score over all evaluated python queries.
# The accumulator is deliberately NOT called `sum`: binding that name at
# notebook scope hides the built-in sum() from every cell run afterwards in
# the same kernel. The additions happen in the same order as before, so the
# printed value is unchanged.
total_ndcg_python_bm = 0
for score in ndcg_list_python_bm:
    total_ndcg_python_bm = total_ndcg_python_bm + score
average_ndcg_python_bm = total_ndcg_python_bm / len(ndcg_list_python_bm)
print(average_ndcg_python_bm)

In [None]:
#python tfidf
# Write one ratings file per query id (qid1): a JSON array of
# {"_index", "_id", "rating"} objects, one object per line, built from the
# qid2 / label columns of the rows that carry that qid1.
import json

# Bucket the rows by qid1 in a single pass instead of rescanning the whole
# frame in Python for every query id. groupby keeps the original row order
# inside each group, and head(30) keeps the same "first 30 matches" cap.
first_rows_by_qid1 = {
    qid1: rows.head(30)
    for qid1, rows in pythoncosidf_corpus_dataframe.groupby("qid1", sort=False)
}

for k in each_qid1_cosidf_python:
    rows = first_rows_by_qid1.get(k)
    if rows is None:
        # No row carries this qid1: emit an empty ratings list, as a full scan would.
        ratings = []
    else:
        ratings = [
            {"_index": "pythontf", "_id": qid2, "rating": int(label)}
            for qid2, label in zip(rows["qid2"], rows["label"])
        ]
    with open(f'{k}tfpython.json', 'w') as fp:
        fp.write(
            '[\n' +
            ',\n'.join(json.dumps(l) for l in ratings) +
            '\n]\n')

In [None]:
# python tfidf
# For every query id (qid1): fetch its title from the "pythontf" index, load
# the ratings file written for it, run Elasticsearch rank_eval and keep both
# the full response and its 'metric_score'.
# NOTE(review): ranking() is defined elsewhere in this notebook; presumably it
# builds the rank_eval request body - confirm.
import json
from elasticsearch import Elasticsearch

es = Elasticsearch()
ndcg_list_python_tf = []
result_list_python_tf = []
for i in each_qid1_cosidf_python:
    qid1_title = es.get("pythontf", doc_type='_doc', id=i)['_source']['title']

    # Context manager so the ratings file is closed right after it is parsed;
    # previously the handle was opened once per query and never closed.
    with open(f'{i}tfpython.json') as f:
        data = json.load(f)

    _search = ranking(i, qid1_title, data) # Figure 2
    result = es.rank_eval(index="pythontf", body=_search)
    result_list_python_tf.append(result)

    ndcg = result['metric_score']
    ndcg_list_python_tf.append(ndcg)

print(ndcg_list_python_tf)

In [None]:
# python tfidf
# Persist every collected rank_eval response to disk as a JSON array,
# one serialized response per line.
import json
with open('result_list_python_tf.json', 'w') as out_file:
    out_file.write("".join((
        '[\n',
        ',\n'.join(map(json.dumps, result_list_python_tf)),
        '\n]\n',
    )))

In [None]:
# python tfidf
# Persist the per-query ndcg scores as a JSON array: all scores on a single
# comma-separated line between the brackets, no trailing newline.
import json
with open("ndcg_score_python_tf.json", 'w') as out_file:
    out_file.write("".join((
        '[\n',
        ','.join(map(json.dumps, ndcg_list_python_tf)),
        '\n]',
    )))

In [None]:
# python tfidf
# Average ndcg score over all evaluated python queries.
# The accumulator is deliberately NOT called `sum`: binding that name at
# notebook scope hides the built-in sum() from every cell run afterwards in
# the same kernel. The additions happen in the same order as before, so the
# printed value is unchanged.
total_ndcg_python_tf = 0
for score in ndcg_list_python_tf:
    total_ndcg_python_tf = total_ndcg_python_tf + score
average_ndcg_python_tf = total_ndcg_python_tf / len(ndcg_list_python_tf)
print(average_ndcg_python_tf)

In [None]:
#python Dirichlet
# Write one ratings file per query id (qid1): a JSON array of
# {"_index", "_id", "rating"} objects, one object per line, built from the
# qid2 / label columns of the rows that carry that qid1.
import json

# Bucket the rows by qid1 in a single pass instead of rescanning the whole
# frame in Python for every query id. groupby keeps the original row order
# inside each group, and head(30) keeps the same "first 30 matches" cap.
first_rows_by_qid1 = {
    qid1: rows.head(30)
    for qid1, rows in pythoncosidf_corpus_dataframe.groupby("qid1", sort=False)
}

for k in each_qid1_cosidf_python:
    rows = first_rows_by_qid1.get(k)
    if rows is None:
        # No row carries this qid1: emit an empty ratings list, as a full scan would.
        ratings = []
    else:
        ratings = [
            {"_index": "pythondlm", "_id": qid2, "rating": int(label)}
            for qid2, label in zip(rows["qid2"], rows["label"])
        ]
    with open(f'{k}dlmpython.json', 'w') as fp:
        fp.write(
            '[\n' +
            ',\n'.join(json.dumps(l) for l in ratings) +
            '\n]\n')

In [None]:
# python Dirichlet
# For every query id (qid1): fetch its title from the "pythondlm" index, load
# the ratings file written for it, run Elasticsearch rank_eval and keep both
# the full response and its 'metric_score'.
# NOTE(review): ranking() is defined elsewhere in this notebook; presumably it
# builds the rank_eval request body - confirm.
import json
from elasticsearch import Elasticsearch

es = Elasticsearch()
ndcg_list_python_dlm = []
result_list_python_dlm = []
for i in each_qid1_cosidf_python:
    qid1_title = es.get("pythondlm", doc_type='_doc', id=i)['_source']['title']

    # Context manager so the ratings file is closed right after it is parsed;
    # previously the handle was opened once per query and never closed.
    with open(f'{i}dlmpython.json') as f:
        data = json.load(f)

    _search = ranking(i, qid1_title, data) # Figure 2
    result = es.rank_eval(index="pythondlm", body=_search)
    result_list_python_dlm.append(result)

    ndcg = result['metric_score']
    ndcg_list_python_dlm.append(ndcg)

print(ndcg_list_python_dlm)

In [None]:
# python Dirichlet
# Persist every collected rank_eval response to disk as a JSON array,
# one serialized response per line.
import json
with open('result_list_python_dlm.json', 'w') as out_file:
    out_file.write("".join((
        '[\n',
        ',\n'.join(map(json.dumps, result_list_python_dlm)),
        '\n]\n',
    )))

In [None]:
# python Dirichlet
# Persist the per-query ndcg scores as a JSON array: all scores on a single
# comma-separated line between the brackets, no trailing newline.
import json
with open("ndcg_score_python_dlm.json", 'w') as out_file:
    out_file.write("".join((
        '[\n',
        ','.join(map(json.dumps, ndcg_list_python_dlm)),
        '\n]',
    )))

In [None]:
# python Dirichlet
# Average ndcg score over all evaluated python queries.
# The accumulator is deliberately NOT called `sum`: binding that name at
# notebook scope hides the built-in sum() from every cell run afterwards in
# the same kernel. The additions happen in the same order as before, so the
# printed value is unchanged.
total_ndcg_python_dlm = 0
for score in ndcg_list_python_dlm:
    total_ndcg_python_dlm = total_ndcg_python_dlm + score
average_ndcg_python_dlm = total_ndcg_python_dlm / len(ndcg_list_python_dlm)
print(average_ndcg_python_dlm)