# Readme

This is a baseline model, which only uses BM25+ with pre-processing

# Preparation and indexing

In [2]:
import re
import xml.etree.ElementTree as ET
import warnings
import math
from stemming.porter2 import stem
import pandas as pd
from rank_bm25 import BM25Plus
import numpy as np


# preprocessing method
def string_tokenise(string):  # return list
    result = re.findall(r"\w+", string)
    return result


def case_fold(list1):  # return list
    result = [word.lower() for word in list1]
    #     string = ' '.join([str(elem) for elem in list1])
    #     result = string.lower().split() #lower() is the same as casefold()
    return result


def stopping(list1):  # return list
    stopfile = open("englishST.txt", 'r')
    stopwords = stopfile.read().split()
    result = [items for items in list1 if items not in stopwords]
    return result


def normalise(list1):  # return list
    result = []
    for item in list1:
        result.append(stem(item))
    return result


'''
=======================================Preparation: indexing======================================
'''

englishST = open('englishST.txt', 'r')

FILE = 'v3.0/verified_claims.docs.tsv'

record = {}  # {{string:{int:string}}} {{term:{docID:position}}}dic of dic, every insider dic records a term
docID_list = []  # list just for recording docID

fields = ['vclaim_id', 'vclaim', 'title']
dataframe = pd.read_csv(FILE, usecols = fields, sep = '\t')

docID = 0

preprocessed_corpus = [] #a list of lists of strings, which are the document tokens

for index, row in dataframe.iterrows():

    docID = row['vclaim_id']
    docID_list.append(docID)

    # step1: tokenise
    doc_in_str = row['vclaim'] # doc is in string format
    pos_in_doc = 0  # describe the position of terms in one doc

    term_in_list = normalise(
        stopping(case_fold(string_tokenise(doc_in_str))))  # pre-processing

    preprocessed_corpus.append(term_in_list) # add a list of cocument terms into the list of list of doc terms


bm25_indices = BM25Plus(preprocessed_corpus)

# np.save('03_BM25indices.npy', bm25_indices) # save the indices to avoid reproduce it every time

# Implement retrieval

In [4]:
# load the BM25Plus index data
# bm25_indices = np.load('03_BM25indices.npy', allow_pickle=True).copy

'''
========================================Document Retrieval=======================================
'''
import csv
from operator import itemgetter

RankedIROutput = open('04_results.tsv', 'w')
results_fields = ['tweet_id','Q0','vclaim_id','rank','score','tag']
writer = csv.DictWriter(RankedIROutput, fieldnames = results_fields)
writer.writeheader()

# NO HEADER NEEDED!!

# step1: extract the original queries from file
tweets_directory = 'v3.0/train/tweets.queries.tsv'
tweets_fields = ['tweet_id', 'tweet_content']
df_t = pd.read_csv(tweets_directory, usecols = tweets_fields, sep = '\t')
tweets_list = df_t.tweet_content.tolist().copy()
tweets_id = df_t.tweet_id.tolist().copy()


for query_id, query in zip(tweets_id,tweets_list):
    query_term = normalise(
        stopping(case_fold(string_tokenise(query))))  # same preprocessing as for indexing
    vclaim_scores = bm25_indices.get_scores(query_term)
    
    search_result = sorted(zip(docID_list, vclaim_scores),key = itemgetter(1), reverse=True) # sorted by the second column in the tuple

    # handle the case that cannot find any matches
    if vclaim_scores == []:
        search_result = []

    # write into submitted file
    count = 0  # provide up to 150 result
    for matched_docID, matched_score in search_result:
        count = count + 1
        if count > 30: # output top 1 result
            break
            
#         RankedIROutput.write(str(query_id) + ',' + str(matched_entry[0]) + ',' + str(round(matched_entry[1],4)) +
#                             ' ||' + dataframe.loc[dataframe['docID']==matched_entry[0]]['content'].item() + '\n')

        tweet_id = query_id
        vclaim_id = matched_docID
        score = matched_score
        tag = 'DC'
        return_data = {'tweet_id':tweet_id, 'Q0':'Q0', 'vclaim_id':vclaim_id, 'rank': '1',
                      'score': score, 'tag': tag}
        writer = csv.DictWriter(RankedIROutput, fieldnames = results_fields, delimiter='\t')
        writer.writerow(return_data)
        
        
RankedIROutput.close()

  if vclaim_scores == []:


In [50]:
# experiment 
a = [22,10000,3232323]
b = ['zerer','fdfdf','zzdfdfdfdfd']
from operator import itemgetter
print(sorted(zip(a,b), key = itemgetter(0), reverse=True))

[(3232323, 'zzdfdfdfdfd'), (10000, 'fdfdf'), (22, 'zerer')]
