In [1]:
import os
import sys
import numpy as np
import pandas as pd
sys.path.append("../")
from nltk.stem import PorterStemmer
from data_extraction.utils import data_extraction
ps = PorterStemmer()

In [2]:
forward_index = data_extraction.load_json_file('../output/forward_index.txt')
inverted_index = data_extraction.load_json_file('../output/inverted_index.txt')

# Loading Stopwords

In [4]:
stopwords_path = '../stopwordlist.txt'
with open(stopwords_path, 'r') as file:
    stopwords = file.read()
    stopwords = stopwords.split('\n')
    stopwords_list = [word.strip().lower() for word in stopwords if word]

print("Sample stop words : ", stopwords_list[:10])

Sample stop words :  ['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards']


# Query Extraction

In [None]:
# extract text from all folders and files
query_path = '../input_query/topics.txt'
def retrieve_relevant_docs(title=True, description=False, narrative=False):
    result = []
    with open(query_path, 'r') as file:
        content = file.read()
        sub_files = content.split('<top>')
        sub_files = [file for file in sub_files if len(file) > 0]
        for file in sub_files:
            # Extracting topic number
            topic_num = data_extraction.extract_text_in_tag(file, 'num')
            topic_num = topic_num.removeprefix('Number: ')
            if title:
                # Extracting title
                query_title = data_extraction.extract_text_in_tag(file, 'title')
                query = query_title
            if description:
                description = data_extraction.extract_text_in_tag(file, 'desc')
                description = description.removeprefix('Description:')
                query += description
            if narrative:
                narrative = data_extraction.extract_text_in_tag(file, 'narr')
                query += narrative
            _, unique_words = data_extraction.tokenizer(query, stopwords_list)
            # stemming with Porter Stemmer
            stemmed_words = [ps.stem(word) for word in unique_words]
            related_documents = []
            for word in stemmed_words:
                try:
                    # getting all the document ids in which the query term appears
                    doc_ids = list(inverted_index[word].keys())
                except Exception as e:
                    continue
                related_documents.extend(doc_ids)
                unique_related_documents = list(set(related_documents))

            # Creating forward index for the query
            query_forward_index, unique_stemmed_words = data_extraction.get_forward_index(stemmed_words)
            total_documents = len(unique_related_documents)
            for docId in unique_related_documents:
                # iterate over each word in the query
                tf_idf_list = []
                query_term_freq_list = []
                for unique_word in unique_stemmed_words:
                    doc_freq = len(inverted_index.get(unique_word,[]))
                    if doc_freq > 0:
                        # calcuating tf-idf weight of the term in document    
                        term_freq = forward_index[docId].get(unique_word, 0)
                        # tf-idf weight
                        tf_idf = term_freq * np.log10(total_documents/doc_freq)
                        tf_idf_list.append(tf_idf)

                        # query vector
                        query_term_freq = query_forward_index[unique_word]
                        query_term_freq_list.append(query_term_freq)
                        
                # normalizing tf_idf vector
                tf_idf_vector = np.array(tf_idf_list)
                document_magnitude = np.linalg.norm(tf_idf_vector)
                normalized_tf_idf_vector = tf_idf_vector/document_magnitude

                # normalizing query vector
                query_vector = np.array(query_term_freq_list)
                query_magnitude = np.linalg.norm(query_vector)
                normalized_query_vector = query_vector/query_magnitude

                # dot_product of query weight and document weight of the term
                doc_similarity = np.dot(normalized_query_vector, normalized_tf_idf_vector)

                result.append([topic_num, docId, doc_similarity])
        df = pd.DataFrame(result, columns=['Topic', 'Document', 'Cosine_value'])
            
    return df

In [7]:
# Loading main.qrels as a dataframe
ground_truth = pd.read_csv('../main.qrels', sep='\s+', header=None, names=['Topic', 'Sequence', 'Document', 'Relevance'])
ground_truth

Unnamed: 0,Topic,Sequence,Document,Relevance
0,351,0,FT911-1098,0
1,351,0,FT911-1338,0
2,351,0,FT911-1396,0
3,351,0,FT911-18,0
4,351,0,FT911-1943,0
...,...,...,...,...
8309,400,0,FT923-5131,0
8310,400,0,FT923-5695,0
8311,400,0,FT923-5801,0
8312,400,0,FT923-7802,0


# Document retrieval with title

In [27]:
df = retrieve_relevant_docs(title=True, description=False, narrative=False)
df_v1 = data_extraction.sort_df_and_add_sequence(df)
df_v1

Unnamed: 0,Topic,Document,Sequence,Cosine_value
0,352,FT911-2661,1,0.997095
1,352,FT911-4067,2,0.986478
2,352,FT911-3370,3,0.984695
3,352,FT911-1768,4,0.964887
4,352,FT911-3432,5,0.964887
...,...,...,...,...
2552,359,FT911-1962,869,0.707107
2553,359,FT911-2754,870,0.707107
2554,359,FT911-1241,871,0.707107
2555,359,FT911-5253,872,0.707107


In [26]:
# saving results as a text file
df_v1.to_csv('../output/retieval_wt_title.txt', sep='\t', index=False)

Precision & Recall

In [28]:
true_positives, true_negatives, false_positives, false_negatives = data_extraction.compare_retrieved_docs_wt_gt(df_v1, ground_truth)

# Precision
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0

# Recall
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

print(f"Precision: {precision}")
print(f"Recall: {recall}")

print('\ntrue_positives :', true_positives)
print('true_negatives :', true_negatives)
print('false_positives :', false_positives)
print('false_negatives :', false_negatives)

Precision: 0.024247164646069613
Recall: 1.0

true_positives : 62
true_negatives : 0
false_positives : 2495
false_negatives : 0


# Document retrieval with title and description

In [10]:
df = retrieve_relevant_docs(title=True, description=True, narrative=False)
df_v2 = data_extraction.sort_df_and_add_sequence(df)
df_v2

Unnamed: 0,Topic,Document,Sequence,Cosine_value
0,352,FT911-241,1,0.848310
1,352,FT911-3645,2,0.843514
2,352,FT911-4059,3,0.805587
3,352,FT911-4561,4,0.776927
4,352,FT911-3050,5,0.767200
...,...,...,...,...
7620,359,FT911-5338,1482,0.447214
7621,359,FT911-852,1483,0.447214
7622,359,FT911-4697,1484,0.447214
7623,359,FT911-4889,1485,0.447214


In [18]:
# saving results as a text file
df_v2.to_csv('../output/retieval_wt_title_description.txt', sep='\t', index=False)

Precision & Recall

In [22]:
true_positives, true_negatives, false_positives, false_negatives = data_extraction.compare_retrieved_docs_wt_gt(df_v2, ground_truth)

# Precision
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0

# Recall
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

print(f"Precision: {precision}")
print(f"Recall: {recall}")

print('\ntrue_positives :', true_positives)
print('true_negatives :', true_negatives)
print('false_positives :', false_positives)
print('false_negatives :', false_negatives)

Precision: 0.010885245901639345
Recall: 1.0

true_positives : 83
true_negatives : 0
false_positives : 7542
false_negatives : 0


# Document retrieval with title and narrative

In [29]:
df = retrieve_relevant_docs(title=True, description=False, narrative=True)
df_v3 = data_extraction.sort_df_and_add_sequence(df)
df_v3

Unnamed: 0,Topic,Document,Sequence,Cosine_value
0,352,FT911-241,1,0.688846
1,352,FT911-1699,2,0.680730
2,352,FT911-3409,3,0.661184
3,352,FT911-4095,4,0.649259
4,352,FT911-4561,5,0.648331
...,...,...,...,...
11859,359,FT911-3220,2719,0.258199
11860,359,FT911-2237,2720,0.258199
11861,359,FT911-5338,2721,0.258199
11862,359,FT911-4697,2722,0.258199


In [19]:
# saving results as a text file
df_v3.to_csv('../output/retieval_wt_title_narrative.txt', sep='\t', index=False)

Precision & Recall

In [30]:
true_positives, true_negatives, false_positives, false_negatives = data_extraction.compare_retrieved_docs_wt_gt(df_v3, ground_truth)

# Precision
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0

# Recall
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

print(f"Precision: {precision}")
print(f"Recall: {recall}")

print('\ntrue_positives :', true_positives)
print('true_negatives :', true_negatives)
print('false_positives :', false_positives)
print('false_negatives :', false_negatives)

Precision: 0.008007417397167902
Recall: 1.0

true_positives : 95
true_negatives : 0
false_positives : 11769
false_negatives : 0


# Document retrieval with title, description and narrative

In [31]:
df = retrieve_relevant_docs(title=True, description=True, narrative=True)
df_v4 = data_extraction.sort_df_and_add_sequence(df)
df_v4

Unnamed: 0,Topic,Document,Sequence,Cosine_value
0,352,FT911-241,1,0.715232
1,352,FT911-5249,2,0.646455
2,352,FT911-4020,3,0.642432
3,352,FT911-1492,4,0.640868
4,352,FT911-370,5,0.623753
...,...,...,...,...
12596,359,FT911-2237,2777,0.235702
12597,359,FT911-5338,2778,0.235702
12598,359,FT911-852,2779,0.235702
12599,359,FT911-4697,2780,0.235702


In [20]:
# saving results as a text file
df_v4.to_csv('../output/retieval_wt_title_description_narrative.txt', sep='\t', index=False)

Precision & Recall

In [32]:
true_positives, true_negatives, false_positives, false_negatives = data_extraction.compare_retrieved_docs_wt_gt(df_v4, ground_truth)

# Precision
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0

# Recall
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

print(f"Precision: {precision}")
print(f"Recall: {recall}")

print('\ntrue_positives :', true_positives)
print('true_negatives :', true_negatives)
print('false_positives :', false_positives)
print('false_negatives :', false_negatives)

Precision: 0.0075390841996666935
Recall: 1.0

true_positives : 95
true_negatives : 0
false_positives : 12506
false_negatives : 0
