In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Load the metadata table; later cells read its 'title', 'abstract' and
# 'cord_uid' columns.
df_meta = pd.read_csv(filepath_or_buffer="metadata/metadata.csv")

In [None]:
# Build the searchable text field from title + abstract.
# Plain string concatenation propagates NaN, so a record with a title but no
# abstract (or the reverse) would end up with no text at all and be dropped
# from the index.  Missing parts are treated as empty strings instead; rows
# where BOTH parts are missing stay NaN so the later "is this a str?" filter
# still excludes them.
title_abstract = (
    df_meta['title'].fillna('') + ' ' + df_meta['abstract'].fillna('')
).str.strip()
df_meta['title_abstract'] = title_abstract.where(title_abstract.ne(''))

In [None]:
from nltk.tokenize import word_tokenize
from __future__ import division 
import math

In [None]:
def tokenize(line, tokenizer=word_tokenize):
    """Lower-case `line` and return the tokens produced by `tokenizer` as a list.

    Args:
        line: text to split; must support ``.lower()``.
        tokenizer: callable taking the lower-cased text and returning an
            iterable of tokens (defaults to NLTK's ``word_tokenize``).

    Returns:
        list of tokens, in the order the tokenizer yields them.
    """
    lowered = line.lower()
    return list(tokenizer(lowered))

In [None]:
import nltk
import re

stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()

# A usable token consists only of ASCII letters and hyphens.  (This single
# pattern also rejects digits, so no separate digit test is needed.)
_TOKEN_CHARS = re.compile(r'[A-Za-z-]+')


def extract_and_tokenize_terms(doc):
    """Turn raw text into the list of stemmed terms used by the index.

    Steps: tokenise with `tokenize` (which lower-cases), drop English
    stopwords, drop tokens containing anything other than ASCII letters or
    hyphens, drop tokens made of hyphens only, then Porter-stem what is left.

    Args:
        doc: text to analyse.

    Returns:
        list of stemmed terms, in document order (duplicates kept so that
        term frequencies can be counted).
    """
    terms = []
    for token in tokenize(doc):
        if token in stopwords:
            continue
        if _TOKEN_CHARS.fullmatch(token) is None:
            continue
        # Tokens such as "-" or "--" pass the character test but carry no
        # content; without this check they would be indexed as terms.
        if not token.strip('-'):
            continue
        terms.append(stemmer.stem(token.lower()))
    return terms

In [None]:
# with open('corpusData.json') as f:
#     documents = json.load(f)

In [None]:
# Map each cord_uid to its combined title/abstract text.  If a cord_uid occurs
# more than once, the last row wins (normal dict semantics).
documents = dict(zip(df_meta['cord_uid'], df_meta['title_abstract']))

In [None]:
# Analyse every document into its list of stemmed terms.
documents_tokenize = {}
for doc_id, text in documents.items():
    # Entries whose text is not a plain str (e.g. NaN) or whose id is missing
    # are skipped.
    if type(text) is not str or pd.isna(doc_id):
        continue
    documents_tokenize[doc_id] = extract_and_tokenize_terms(text)

In [None]:
from collections import defaultdict
    
# term -> set of ids of the documents that contain the term
inverted_index = defaultdict(set)
term_postings = (
    (term, docid)
    for docid, doc_terms in documents_tokenize.items()
    for term in doc_terms
)
for term, docid in term_postings:
    inverted_index[term].add(docid)

In [None]:
# Corpus statistics used by the scoring functions below: number of indexed
# documents and mean document length measured in terms.
num_docs = len(documents_tokenize)
avg_doc_len = sum(map(len, documents_tokenize.values())) / num_docs


def tf_idf_score(param_k1,param_b,term,docid):
    """Return the BM25-style weight of `term` in document `docid`.

    Args:
        param_k1: term-frequency saturation parameter.
        param_b: weight of the document-length normalisation.
        term: either a key of `inverted_index` (already stemmed) or a raw
            word, which is then lower-cased and stemmed before lookup.
        docid: key of `documents_tokenize`.

    Returns:
        float: inverse-document-frequency component multiplied by the
        length-normalised term-frequency component.

    Raises:
        KeyError: if `docid` is not in `documents_tokenize`.
    """
    # Index keys are already stemmed.  Stemming them a second time is not
    # guaranteed to return the same string, which would make the in-document
    # count below come out as 0.  Only normalise terms that are not index keys.
    if term not in inverted_index:
        term = stemmer.stem(term.lower())

    # Document frequency, read AFTER normalisation.  `.get` is used so that an
    # unknown term does not insert an empty posting set into the defaultdict.
    ft = len(inverted_index.get(term, ()))

    doc_terms = documents_tokenize[docid]
    fdt = doc_terms.count(term)

    # NOTE(review): this IDF form is negative for terms that occur in more
    # than half of the documents; left unchanged to keep scores comparable.
    inverse_doc_freq = math.log((num_docs - ft + 0.5)/(ft + 0.5))

    length_norm = (1 - param_b) + param_b*(len(doc_terms)/avg_doc_len)
    tf_comp = ((param_k1 + 1)*fdt)/(param_k1*length_norm + fdt)

    return inverse_doc_freq * tf_comp

def create_tf_idf(param_k1,param_b):
    """Score every (term, document) pair present in `inverted_index`.

    Args:
        param_k1: forwarded to `tf_idf_score`.
        param_b: forwarded to `tf_idf_score`.

    Returns:
        defaultdict(dict) mapping term -> {docid: score}.
    """
    weights = defaultdict(dict)
    # Iterate over a snapshot of the vocabulary, pairing each term with every
    # document in its posting set.
    pairs = (
        (index_term, doc)
        for index_term in set(inverted_index)
        for doc in inverted_index[index_term]
    )
    for index_term, doc in pairs:
        weights[index_term][doc] = tf_idf_score(param_k1, param_b, index_term, doc)
    return weights

# Precompute all term/document weights once, with k1 = 1.5 and b = 0.5.
tf_idf = create_tf_idf(param_k1=1.5, param_b=0.5)

In [None]:
def get_qtf_comp(k3,term,fqt):
    """Query-term-frequency component: ((k3 + 1) * f) / (k3 + f), f = fqt[term].

    Args:
        k3: saturation parameter for repeated query terms; with k3 = 0 the
            result is 1 for any f > 0.
        term: key to look up in `fqt`.
        fqt: mapping of query term -> number of occurrences in the query.

    Returns:
        The component as a number.

    Raises:
        KeyError: if `term` is not in `fqt`.
    """
    query_freq = fqt[term]
    numerator = (k3 + 1) * query_freq
    denominator = k3 + query_freq
    return numerator / denominator

def retr_docs(query,result_count):
    """Rank indexed documents against a free-text query.

    The query goes through `extract_and_tokenize_terms`, i.e. exactly the
    analysis that produced the index terms.  A plain whitespace split would
    leave punctuation attached to words and would test stopwords before
    lower-casing, so such query words could never match the index.

    Args:
        query: query text.
        result_count: maximum number of results to return.

    Returns:
        list of (docid, score) tuples, highest score first, at most
        `result_count` long.  Ties are ordered by str(docid) so that repeated
        runs produce the same ranking.
    """
    fqt = {}
    for term in extract_and_tokenize_terms(query):
        fqt[term] = fqt.get(term, 0) + 1

    scores = {}
    for word in fqt:
        # `.get` keeps unknown query words from inserting empty posting sets
        # into the defaultdict-based index.
        postings = inverted_index.get(word)
        if not postings:
            continue
        qtf = get_qtf_comp(0, word, fqt)
        for document in postings:
            scores[document] = scores.get(document, 0) + tf_idf[word][document]*qtf

    ranked = sorted(scores.items(), key=lambda item: (-item[1], str(item[0])))
    return ranked[:result_count]

In [None]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET

# NOTE(review): machine-specific absolute path, kept verbatim so the cell still
# runs where it was written; point TOPICS_PATH at your own copy of the file.
TOPICS_PATH = 'C:/Users/user/Downloads/Practicum_Test/rnd5_topics.xml'
RESULTS_PER_TOPIC = 1000  # maximum number of documents retrieved per topic
RUN_TAG = 'dcu'

tree = ET.parse(TOPICS_PATH)
root = tree.getroot()

# Topic id and rank are recorded while retrieving, instead of being rebuilt
# afterwards as "every 1000 rows is a new topic".  That reconstruction shifts
# every following topic as soon as one query returns fewer than
# RESULTS_PER_TOPIC documents.
# Topic ids are assigned by position (1-based), as before; this assumes the
# <query> elements appear in topic order - TODO confirm against the topics file.
run = []   # flat list of (docid, score) pairs, kept for any cell that reads it
rows = []
for topic_id, element in enumerate(root.iter('query'), start=1):
    # An empty <query/> element has text None; treat it as an empty query.
    hits = retr_docs(element.text or '', RESULTS_PER_TOPIC)
    run.extend(hits)  # extend, not `run = run + hits`, to avoid re-copying
    for rank, (docid, score) in enumerate(hits, start=1):
        rows.append((docid, score, rank, topic_id, 'Q0', RUN_TAG))

# Built in one step: element-wise chained assignment (df['col'][i] = value)
# does not write through to the frame under pandas Copy-on-Write.
df = pd.DataFrame(
    rows,
    columns=['docid', 'score', 'rank', 'topicid', 'Q0', 'run_tag'],
)

In [None]:
# Reorder the columns into the layout written to the result file.
run_columns = ['topicid', 'Q0', 'docid', 'rank', 'score', 'run_tag']
df1 = df[run_columns]

In [None]:
# Persist the run (header row included, index column omitted).
df1.to_csv(path_or_buf='TA1000_result.csv', index=False)

In [None]:
# Re-read the saved run and keep only the first row for each
# (topicid, docid) pair, then save the result under a separate name.
df2 = pd.read_csv('TA1000_result.csv')

dedup_keys = ['topicid', 'docid']
df_deduplicated = df2.drop_duplicates(subset=dedup_keys, keep='first')

print(df_deduplicated)
df_deduplicated.to_csv('deduplicated_table_TA_1000.csv', index=False)