In [1]:
from elasticsearch import Elasticsearch
import re
import os
import pandas as pd
import string
from elasticsearch_dsl import Search
import json

In [2]:
# Shared Elasticsearch client used by the term-vector helpers below.
# Points at a node on localhost:9200; timeout=60 is passed to the client
# (seconds, per the elasticsearch-py client documentation).
es = Elasticsearch("http://localhost:9200", timeout=60)

In [3]:
# Root directory of the AP data set; the working directory is switched to it.
AP_DATA_DIR = "/Users/Dibble/Desktop/homework-1-Evan-Chan-NEU-main/IR_data/AP_DATA"
os.chdir(AP_DATA_DIR)
# Names of every entry in the collection directory except its "readme".
# Filtering (rather than list.remove) avoids a ValueError when no readme exists.
files = [
    name
    for name in os.listdir(os.path.join(AP_DATA_DIR, "ap89_collection"))
    if name != "readme"
]

In [4]:
# function (also used in parser/indexer) to get list of docnums
def get_docnums():
    """Collect the DOCNO values found in the first 364 collection files.

    Reads each file named in the module-level ``files`` list (latin-1 encoded)
    and extracts whatever text sits between ``<DOCNO>`` and ``</DOCNO>`` on a
    line, exactly as written (surrounding whitespace is not stripped).

    Returns:
        list[str]: the captured DOCNO strings, in file order.
    """
    collection_dir = "/Users/Dibble/Desktop/homework-1-Evan-Chan-NEU-main/IR_data/AP_DATA/ap89_collection/"
    test_keys = []
    for file_name in files[:364]:
        # Context manager closes each file promptly instead of leaking
        # one open handle per collection file.
        with open(collection_dir + file_name, encoding="latin-1") as current:
            text = current.read()
        test_keys.extend(re.findall("<DOCNO>(.*)</DOCNO>", text))
    return test_keys

In [5]:
# helper function for get_queries_df() to return the number of each query
def get_queries_nums(queries_list):
    """Return the first whitespace-separated token of every query line.

    Args:
        queries_list: iterable of query strings; each must contain at least
            one token (an empty line raises IndexError).

    Returns:
        list[str]: the leading token (the query number) of each line.
    """
    return [entry.split()[0] for entry in queries_list]

In [6]:
# helper function for get_queries_df() to return the necessary text of each query, with query numbers and query stop words blanked out
def get_queries_text(queries_list):
    """Blank out query numbers and query stop words in every query line.

    Each line is split on whitespace; any token that equals one of the query
    numbers (from get_queries_nums) or one of the stop words below is replaced
    by an empty string, and the tokens are re-joined with single spaces. The
    blanked tokens therefore leave extra spaces in the result.

    Args:
        queries_list: list of query strings, each starting with its number.

    Returns:
        list[str]: one cleaned string per input line.
    """
    excluded = set(get_queries_nums(queries_list))
    excluded.update((
        "Document", "will", "must", "discuss",
        "report", "include", "describe", "identify", "a", "an", "as",
        "and", "the", "to", "or", "either", "of", "by", "in", "with",
        "about", "some", "any", "its", "even", "other", "which",
        "being", "certain", "has",
    ))

    def blank_if_excluded(token):
        # Matching is exact and case-sensitive.
        return '' if token in excluded else token

    return [
        " ".join(map(blank_if_excluded, raw_line.split()))
        for raw_line in queries_list
    ]

In [7]:
# function to get dataframe with query number and query text
def get_queries_df(queries_path="/Users/Dibble/Desktop/homework-1-Evan-Chan-NEU-main/IR_data/AP_DATA/query_desc.51-100.short.txt"):
    """Load the query file into a DataFrame of query numbers and cleaned text.

    The file text is normalised before being split into lines: hyphens become
    spaces, then every character in ``string.punctuation`` is deleted. Blank
    lines (including one produced by a trailing newline) are skipped, since
    they have no leading token for get_queries_nums to read.

    Args:
        queries_path: path of the query file; defaults to the original
            hard-coded location.

    Returns:
        pandas.DataFrame: columns ``QueryNumber`` and ``QueryText``, one row
        per non-blank line of the file.
    """
    with open(queries_path) as queries_file:
        raw_text = queries_file.read()

    # Hyphens are replaced by spaces *before* punctuation is stripped so that
    # hyphenated words split into separate tokens rather than fusing.
    raw_text = raw_text.replace("-", " ")
    raw_text = raw_text.translate(str.maketrans('', '', string.punctuation))

    queries_list = [line for line in raw_text.split("\n") if line.strip()]

    return pd.DataFrame({
        "QueryNumber": get_queries_nums(queries_list),
        "QueryText": get_queries_text(queries_list),
    })

#queries_df = get_queries_df()
#print(queries_df)

In [8]:
# function to get list of queries 
def get_queries_words(queries_df):
    """Return the values of the DataFrame's second column as a plain list.

    Args:
        queries_df: DataFrame whose column at position 1 holds the query text.

    Returns:
        list: the second-column values in row order.
    """
    second_column = queries_df.iloc[:, 1]
    return list(second_column)

#queries_df = get_queries_df()
#query_words_list = get_queries_words(queries_df)
#print(query_words_list)

In [9]:
# helper function to search term vector api result for query word and return its frequency within a document
def get_tf_num(term, results):
    """Look up a term's frequency in a term-vectors result.

    Args:
        term: the term to look up.
        results: mapping of field name to term-vector data; the ``TEXT``
            entry, when present, is read as ``results["TEXT"]["terms"]``.

    Returns:
        The term's ``term_freq`` value, or 0 when the ``TEXT`` field or the
        term itself is absent.
    """
    if "TEXT" not in results:
        return 0
    field_terms = results["TEXT"]["terms"]
    if term not in field_terms:
        return 0
    return field_terms[term]["term_freq"]

In [10]:
# helper function to retrieve dictionary of query words tf within a document
def get_docnum_tf_dict(docnum, query):
    """Map each query term to its term frequency in one document.

    Issues a single term-vectors request against index ``ap89_index4`` for the
    document's ``TEXT`` field and looks every term up via get_tf_num.

    Args:
        docnum: document id (converted with ``str``) to request.
        query: iterable of query terms.

    Returns:
        dict: ``{term: tf}``; terms missing from the document map to 0.
    """
    request_body = {
        "fields": ["TEXT"],
        "term_statistics": True,
        "field_statistics": True
    }
    term_vectors = es.termvectors(
        index="ap89_index4",
        id=str(docnum),
        body=request_body,
    )["term_vectors"]

    return {term: get_tf_num(term, term_vectors) for term in query}

In [11]:
# function to retrieve term frequency values of words in all 25 queries in each of the 84678 documents
# returns dictionary of {docnum : {q_word : tf, q_word : tf}, docnum : {q_word : tf, q_word : tf}, etc.}
def create_tf_dict(max_queries=25, max_docs=84678):
    """Build per-query term-frequency tables for every document.

    Term vectors are fetched once per document (for the union of all query
    terms) and then sliced per query, instead of once per (query, document)
    pair. The returned structure is the same either way.

    Args:
        max_queries: number of leading queries to process (default 25).
        max_docs: number of leading DOCNOs to process (default 84678).

    Returns:
        list[dict]: one dict per query, in query order, shaped
        ``{docnum: {q_word: tf, ...}, ...}``.
    """
    queries_df = get_queries_df()
    test_keys = get_docnums()

    # Tokenise each query text once.
    queries = [text.split() for text in queries_df.iloc[:, 1].iloc[:max_queries]]
    if not queries:
        return []

    # Order-preserving de-duplication of every term used by any query.
    all_terms = list(dict.fromkeys(term for query in queries for term in query))

    tf_dict_list = [{} for _ in queries]
    for docnum in test_keys[:max_docs]:
        # One request per document covers all queries.
        doc_tfs = get_docnum_tf_dict(docnum, all_terms)
        for tf_dict, query in zip(tf_dict_list, queries):
            tf_dict[docnum] = {term: doc_tfs[term] for term in query}

    return tf_dict_list

In [17]:
# helper function to get the doc length of a given doc
def get_doc_length(doc):
    """Return a document's length as the sum of its terms' frequencies.

    Requests the ``TEXT`` term vectors for the document from index
    ``ap89_index4`` and adds up every term's ``term_freq``.

    Args:
        doc: document id (converted with ``str``) to request.

    Returns:
        int: total of all term frequencies, or 0 when the response has no
        ``TEXT`` field.
    """
    request_body = {
        "fields": ["TEXT"],
        "term_statistics": True,
        "field_statistics": True
    }
    term_vectors = es.termvectors(
        index="ap89_index4",
        id=str(doc),
        body=request_body,
    )["term_vectors"]

    if "TEXT" not in term_vectors:
        return 0

    return sum(
        term_info["term_freq"]
        for term_info in term_vectors["TEXT"]["terms"].values()
    )

In [13]:
# helper function to get dataframe of okapi_tf scores for 1 query for all documents
def get_okapi_df(tf_dict, count, top_k=1000):
    """Score every document for one query with Okapi TF and rank them.

    For each document the score is the sum, over the query terms, of::

        tf / (tf + 0.5 + 1.5 * (doc_length / avg_doc_length))

    Each term contributes exactly once to the total.

    Args:
        tf_dict: ``{docnum: {q_word: tf, ...}}`` for a single query.
        count: position of the query in the ``QueryNumber`` column of
            get_queries_df(); used to label the rows.
        top_k: number of highest-scoring rows to keep (default 1000).

    Returns:
        pandas.DataFrame: columns ``QUERYNUM``, ``DOCNO`` and ``OKAPI``,
        sorted by ``OKAPI`` descending and truncated to ``top_k`` rows.
    """
    doc_id_list = list(tf_dict.keys())
    doc_length_list = [get_doc_length(doc) for doc in doc_id_list]

    # Average length = total length of the scored documents divided by the
    # number of DOCNOs in the collection; guarded against an empty corpus.
    corpus_size = len(get_docnums())
    avg_doc_length = sum(doc_length_list) / corpus_size if corpus_size else 0.0

    q_n_list = get_queries_df()["QueryNumber"].tolist()

    okapi_tf_score = []
    for doc_length, term_tfs in zip(doc_length_list, tf_dict.values()):
        # A zero average (no text anywhere) would otherwise divide by zero.
        length_ratio = doc_length / avg_doc_length if avg_doc_length else 0.0
        okapi_tf_score.append(
            sum(tf / (tf + 0.5 + 1.5 * length_ratio) for tf in term_tfs.values())
        )

    okapi_df = pd.DataFrame({
        "QUERYNUM": [q_n_list[count]] * len(doc_id_list),
        "DOCNO": doc_id_list,
        "OKAPI": okapi_tf_score,
    })

    return okapi_df.sort_values(by="OKAPI", ascending=False).iloc[:top_k]

In [18]:
def okapi_tf(output_path='/Users/Dibble/Desktop/homework-1-Evan-Chan-NEU-main/Okapi_TF_results.txt'):
    """Run Okapi TF for every query and write the ranked results to a file.

    Each output line has the form
    ``<query number> Q0 <docno> <rank> <score> Exp``, with ranks starting at 1
    within each query and following the order of the rows returned by
    get_okapi_df.

    The file is opened once in write mode, so re-running replaces the previous
    results instead of appending a duplicate copy of every ranking.

    Args:
        output_path: destination file; defaults to the original hard-coded
            location.
    """
    tf_dict_list = create_tf_dict()

    okapi_results_list = [
        get_okapi_df(tf_dict, count)
        for count, tf_dict in enumerate(tf_dict_list)
    ]

    with open(output_path, 'w') as queryResults:
        for df in okapi_results_list:
            rows = zip(
                df["QUERYNUM"].tolist(),
                df["DOCNO"].tolist(),
                df["OKAPI"].tolist(),
            )
            for rank, (query_number, docnum, score) in enumerate(rows, start=1):
                queryResults.write('%s Q0 %s %s %s Exp\n' % (query_number, docnum, rank, score))

# Run the whole pipeline: build the term-frequency tables, score every query
# and write the ranked results file. This issues Elasticsearch requests for
# every document, so expect it to take a long time.
okapi_tf()