In [2]:
import nltk
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re
from argparse import ArgumentParser
from collections import defaultdict


In [3]:
#removes stop words from a given string
def remove_stopwords(text, lst_stopwords):
    words = text.split()
    filtered_sentence = [w for w in words if not w.lower() in lst_stopwords]
    clean_text = ' '.join(filtered_sentence).strip()
    return clean_text

In [4]:
#
def preprocess_answers(answer_dict):
    stop_words = stopwords.words('english')
    for item in answer_dict:
        text = item['Text']
        text = remove_stopwords(text, stop_words)
        text = text.lower()
        text = remove_tags(BeautifulSoup(text, "html.parser"))
        text = re.sub(r'[^\w\s]', '',text)
        
        item['Text'] = text
    return answer_dict

In [5]:
def remove_tags(soup):
    for data in soup:
        if hasattr(data, 'decompose'):
            data.decompose()
    return ' '.join(soup.stripped_strings)

In [6]:
def preprocess_query_remove_html(data):
    for item in data:
        text = item['Text']
        #print(item['Id'])
        text = remove_tags(BeautifulSoup(text, "html.parser"))
        item['Text'] = text
    print("Tags removed")

In [7]:
def tokenize_doc(doc):
    #turn text into list of tokens
    text = doc['Text']
    text = text.split()
    return text


In [8]:
def doc2vec(doc):
    #combine title and body for a better search space. keep doc id intact
    
    text = doc['Text']
    text = text.split()
    return {doc['Id'] : text}


In [9]:
#returns doc id : string representation of the question.
def answer_reduce(data):
    answers = {}
    for doc in data:
        doc2vec(doc)
        answers.update(doc2vec(doc))
    return answers

In [10]:
def generate_corpus(answers):
    corpus = []
    #generates the set of all words in the search space
    for item in answers:
        corpus += tokenize_doc(item)
    return set(corpus)

In [11]:
def index_answers(corpus, docs):
    inverted_index = {term: [] for term in corpus}
    
    for doc_id, terms in docs.items():
        for term in terms:
            if term in corpus:
                inverted_index[term].append(doc_id)
            
                
    return inverted_index

In [12]:
import math

def idf(index,term,docs):
    doc_freq = len(index[term])
    return math.log(doc_freq+1/len(docs)+1)

In [13]:
def tf(term, doc):
    freq = doc.count(term)
    doc_len = len(doc)
    return freq / doc_len if doc_len > 0 else 0

In [14]:
def tf_idf(index,term,docs,doc):
    if term in doc:
        return tf(term,doc)*idf(index,term,docs)
    else:
        return 0

In [15]:
def avg_len(doc_list):
    sum=0
    for doc in doc_list:
        sum+=len(doc)
    
    return sum

In [16]:
def bm25(term_freq, doc_len, avg_doc_len, idf, k1=1.75, b=0.75):
    return idf * ((term_freq * (k1 + 1)) / (term_freq + k1 * (1 - b + b * (doc_len / avg_doc_len))))

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/calebwentworth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
parser = ArgumentParser()
parser.add_argument('-i', '--input', required=True, help='search domain file e.g. Answers.json',default="Answers.json")
parser.add_argument('-q', '--query', required=True, help='query source files e.g. topics_1.json',default="topics_1.json")
parser.add_argument('-e', '--eval', required=False, help='qrel evaluation file (optional)')

args = parser.parse_args()
input_file = args.input
query_file = args.query
eval_file = args.eval

with open(input_file,'r') as file:
    answers = json.load(file)

with open(query_file,'r') as file:
    querys = json.load(file)

usage: ipykernel_launcher.py [-h] -i INPUT -q QUERY [-e EVAL]
ipykernel_launcher.py: error: the following arguments are required: -i/--input, -q/--query


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [19]:
input_file = "Answers.json"
query_file = "topics_1.json"
eval_file = "qrel_1.tsv"

with open(input_file,'r') as file:
    answers = json.load(file)

with open(query_file,'r') as file:
    querys = json.load(file)

In [20]:
#preprocess_query_remove_html(answers) #unneeded combined...

In [21]:
preprocess_answers(answers)

[{'Id': '3', 'Text': 'examples httpcontikicom', 'Score': '15'},
 {'Id': '12',
  'Text': 'cruises larger cruises often get close land comforts cruising big ships may regulated soon smaller ships often let get right onto land costs start least 5k for ultra cheap airborne you get fly antarctica ecological way see experience actually walking it also cheapest option',
  'Score': '52'},
 {'Id': '17',
  'Text': 'then get country take cash periodically much case issues assume often may always able find cash machine depending are edit  since said us cant suggest best card get avoidminimise foreign transaction fees asking personal finance se site likely best bet finding card is',
  'Score': '126'},
 {'Id': '18', 'Text': 'i draw 23 goes', 'Score': '20'},
 {'Id': '19', 'Text': 'they go far east poland bulgaria that', 'Score': '11'},
 {'Id': '20', 'Text': 'httpseat61com', 'Score': '64'},
 {'Id': '22', 'Text': '', 'Score': '11'},
 {'Id': '23',
  'Text': 'when using debitcredit card remember become b

In [22]:
print (tokenize_doc(answers[1]))

['cruises', 'larger', 'cruises', 'often', 'get', 'close', 'land', 'comforts', 'cruising', 'big', 'ships', 'may', 'regulated', 'soon', 'smaller', 'ships', 'often', 'let', 'get', 'right', 'onto', 'land', 'costs', 'start', 'least', '5k', 'for', 'ultra', 'cheap', 'airborne', 'you', 'get', 'fly', 'antarctica', 'ecological', 'way', 'see', 'experience', 'actually', 'walking', 'it', 'also', 'cheapest', 'option']


In [23]:
doc_list = answer_reduce(answers)

In [24]:
corpus = generate_corpus(answers)
print(corpus)



In [25]:
print(len(corpus))

108967


In [26]:
def bm_search(query, index, doc_list, idf_values, k1=1.75, b=0.75):
    query_terms = query.split()
    scores = defaultdict(float)
    avg_doc_len = sum(len(doc) for doc in doc_list.values()) / len(doc_list)
    
    for term in query_terms:
        if term in index:
            idf = idf_values[term]
            for doc_id in index[term]:
                doc = doc_list[doc_id]
                term_freq = doc.count(term)
                doc_len = len(doc)
                scores[doc_id] += bm25(term_freq, doc_len, avg_doc_len, idf, k1, b)
    
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

In [27]:
def precompute_idf(index, num_docs):
    idf = {}
    for term, doc_ids in index.items():
        df = len(doc_ids)
        idf[term] = math.log((num_docs - df + 0.5) / (df + 0.5) + 1)
    return idf

In [28]:
def tf_search(query, index, doc_list, idf_values):
    query_terms = query.split()
    scores = defaultdict(float)
    
    for doc_id, doc in doc_list.items():
        score = sum(tf(term, doc) * idf_values[term] for term in query_terms if term in index)
        scores[doc_id] = score
    
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

In [29]:
index = index_answers(corpus,doc_list)

idf_vals = precompute_idf(index,len(doc_list))

In [30]:
#test
#print(bm_search("Big cruises are fun ",index,doc_list, idf_vals))
#print(tf_search("Big cruises are fun",index,doc_list))

In [31]:
def parse_query(query_dict):
    stop_words = stopwords.words('english')
    querys = {}
    for item in query_dict:
        id = item['Id']
        title = item['Title']
        body = item['Body']

        title = remove_stopwords(title,stop_words)
        
        title = title.lower()
        title = remove_tags(BeautifulSoup(title, "html.parser"))
        title = re.sub(r'[^\w\s]', '',title)
        
        body = remove_stopwords(body,stop_words)
        body = body.lower()
        body = remove_tags(BeautifulSoup(body, "html.parser"))
        body = re.sub(r'[^\w\s]', '',body)
        
        text = title+' '+body
        querys[id] = text

    return querys

In [32]:
from tqdm import tqdm

'''def run_querys(querys, idf_vals):
    bm_results = []
    tf_results = []
    
    for query_id, query_text in tqdm(querys.items(), desc="Processing Queries"):
        bm_search_results = bm_search(query_text, index, doc_list, idf_vals)
        tf_search_results = tf_search(query_text, index, doc_list, idf_vals)
        
        bm_search_results_top5 = bm_search_results[:5]
        tf_search_results_top5 = tf_search_results[:5]
        
        bm_ranked = {doc_id: rank for rank, (doc_id, _) in enumerate(bm_search_results_top5, start=1)}
        tf_ranked = {doc_id: rank for rank, (doc_id, _) in enumerate(tf_search_results_top5, start=1)}
        
        bm_results.append({query_id: bm_ranked})
        tf_results.append({query_id: tf_ranked})
    
    return bm_results, tf_results'''

'def run_querys(querys, idf_vals):\n    bm_results = []\n    tf_results = []\n    \n    for query_id, query_text in tqdm(querys.items(), desc="Processing Queries"):\n        bm_search_results = bm_search(query_text, index, doc_list, idf_vals)\n        tf_search_results = tf_search(query_text, index, doc_list, idf_vals)\n        \n        bm_search_results_top5 = bm_search_results[:5]\n        tf_search_results_top5 = tf_search_results[:5]\n        \n        bm_ranked = {doc_id: rank for rank, (doc_id, _) in enumerate(bm_search_results_top5, start=1)}\n        tf_ranked = {doc_id: rank for rank, (doc_id, _) in enumerate(tf_search_results_top5, start=1)}\n        \n        bm_results.append({query_id: bm_ranked})\n        tf_results.append({query_id: tf_ranked})\n    \n    return bm_results, tf_results'

In [33]:
def process_query(query_id, query_text, index, doc_list, idf_vals):
    bm_search_results = bm_search(query_text, index, doc_list, idf_vals)
    tf_search_results = tf_search(query_text, index, doc_list, idf_vals)
    
    bm_search_results_top5 = bm_search_results[:5]
    tf_search_results_top5 = tf_search_results[:5]
    
    bm_ranked = {doc_id: rank for rank, (doc_id, _) in enumerate(bm_search_results_top5, start=1)}
    tf_ranked = {doc_id: rank for rank, (doc_id, _) in enumerate(tf_search_results_top5, start=1)}
    
    return query_id, bm_ranked, tf_ranked

In [34]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def run_querys(querys, idf_vals, index, doc_list):
    bm_results = []
    tf_results = []
    
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_query, query_id, query_text, index, doc_list, idf_vals)
            for query_id, query_text in querys.items()
        ]
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Queries"):
            query_id, bm_ranked, tf_ranked = future.result()
            bm_results.append({query_id: bm_ranked})
            tf_results.append({query_id: tf_ranked})
    
    return bm_results, tf_results

In [35]:
q_input = parse_query(querys)



In [None]:
result = run_querys(q_input,idf_vals,index,doc_list)

Processing Queries:  10%|█         | 102/1000 [02:25<16:47,  1.12s/it]

In [35]:
def result_gen(results,f_name):
    bm_result = results[0]
    tf_result = results[1]
    print(bm_result)
    q0="Q0"

In [None]:
result_gen(result,query_file)