# **Text, Web, & Media Analytics Assignment 2**

# Setup

In [1]:
import os
import pandas as pd

from ir_evaluations import calculate_precision, calculate_dcg, compare_models
from ir_models import BM25, JM_LM, My_PRM
from ir_tools import write_scores_to_file, get_top_15
from parsing_functions import parse_stop_words, parse_collection, parse_query, parse_query_set, parse_evaluations, parse_ranking_files

# Document & Query Parsing

See `parsing_functions.py` for the relevant functions.

In [2]:
# Parse in stop words
stop_words = parse_stop_words('common-english-words.txt')

# Load the document set: one parsed collection per entry of `input_path`,
# keyed by the text that follows '_C' in the entry name.
# os.listdir() returns names in arbitrary order, so sort them to give every
# downstream loop and printout a reproducible order, and skip stray entries
# without the '_C' marker (e.g. '.DS_Store') instead of failing on an IndexError.
document_set = {}
input_path = 'Data_Collection'
for collection_path in sorted(os.listdir(input_path)):
    if '_C' not in collection_path:
        continue
    data_key = collection_path.split('_C', 1)[1]
    document_set[data_key] = parse_collection(stop_words, os.path.join(input_path, collection_path))

# Parse in the query set and turn every title into a parsed query
# (a term -> weight dict, as the `.items()` calls below rely on)
query_frame = parse_query_set('the50Queries.txt')
query_frame['parsed_title'] = query_frame['title'].apply(lambda row: parse_query(row, stop_words))

# Experiment to see if adding quarter-weighted frequency of description element helps.
# Queries whose description is pd.NA keep pd.NA through both steps.
query_frame['parsed_description'] = query_frame['description'].apply(lambda row: parse_query(row, stop_words) if row is not pd.NA else pd.NA)
query_frame['parsed_description'] = query_frame['parsed_description'].apply(lambda row: {k:v/4 for k,v in row.items()} if row is not pd.NA else pd.NA)

# Combine title and description terms: title weights always win, description
# terms are only added when the title does not already contain them.
query_frame['parsed_query'] = query_frame.apply(
    lambda row: {**row['parsed_title'], **{k: v for k, v in row['parsed_description'].items() if k not in row['parsed_title']}} if row['parsed_description'] is not pd.NA else row['parsed_title'], 
    axis=1
)

# Pull query topic and weighted query into a single dict
query_set = dict(zip(query_frame['number'], query_frame['parsed_query']))

# Task 1, 2, & 3: Model Design
Relevant implementations for **Task 1 (BM25)**, **Task 2 (JM_LM)**, and **Task 3 (My_PRM)** are available in the `ir_models.py` script.

# Task 4: Model Testing

In [3]:
# One result dict per model, each keyed by the collection/query key
BM25_results, JM_LM_results, My_PRM_results = {}, {}, {}

# Every collection is ranked against the query that shares its key
for query_key, collection in document_set.items():
    query = query_set[query_key]

    # Score the collection with each of the three models
    BM25_results[query_key] = BM25(collection=collection, query=query)
    JM_LM_results[query_key] = JM_LM(collection=collection, query=query)
    My_PRM_results[query_key] = My_PRM(
        weighting_function=BM25,
        collection=collection,
        query=query,
        threshold=1.3,
        theta=7,
    )

    # Persist this query's rankings, one file per model
    write_scores_to_file(BM25_results[query_key], f"BM25_R{query_key}Ranking")
    write_scores_to_file(JM_LM_results[query_key], f"JM_LM_R{query_key}Ranking")
    write_scores_to_file(My_PRM_results[query_key], f"My_PRM_R{query_key}Ranking")

In [4]:
# Show the top-ranked documents per query for each model, in the order
# BM25, JM_LM, My_PRM (see `get_top_15` in ir_tools.py for the exact format).
get_top_15(BM25_results)
get_top_15(JM_LM_results)
get_top_15(My_PRM_results)

Query101 (DocID Weight):
46547: 2.9777644656627484
46974: 2.9777644656627484
62325: 2.213369045529717
6146: 2.0419656581689014
22170: 1.8272527085782253
61780: 1.5196732848313443
82330: 0.9988487146894036
61329: 0.9829990176611005
22513: 0.6534727222114181
39496: 0.45253605614331166
18586: 0
26642: 0
26847: 0
27577: 0
30647: 0

Query102 (DocID Weight):
73038: 5.597536439007499
58476: 5.396003226086574
65414: 4.941821411455464
26061: 4.808367316992574
12769: 4.651702045541497
12767: 4.5644997672490994
33203: 4.188500348145242
82227: 3.847318435025257
76635: 3.7809029428924603
78836: 3.6656773882611375
57914: 3.5473999795785858
11922: 3.3180196986160233
25096: 3.021622734291996
29908: 2.97304932509468
28662: 2.9616720781903387

Query103 (DocID Weight):
14314: 4.570225534356681
54533: 3.4893394752976032
81463: 3.2676958770583466
27426: 3.2367949906083924
27106: 3.1137687580958056
59459: 2.8206742490549734
83370: 2.622425264273093
20159: 2.01699583660008
80988: 1.4052056878367924
9272: 1.3

# Task 5: Model Evaluation

In [5]:
# Optional reload path: uncomment the three lines below to rebuild the result
# dicts from the saved ranking files instead of re-running the models.
# They stay commented out because the results are already in memory when the
# notebook is run from the top.
# BM25_results = parse_ranking_files('RankingOutputs', 'BM25')
# JM_LM_results = parse_ranking_files('RankingOutputs', 'JM_LM')
# My_PRM_results = parse_ranking_files('RankingOutputs', 'My_PRM')

# Parse in evaluation benchmarks (the relevance judgements every metric below is scored against)
evaluations = parse_evaluations('EvaluationBenchmark/')

## Average Precision (MAP)

In [6]:
# Per-model score thresholds, passed to calculate_precision (average precision
# and precision@10) and to calculate_dcg (DCG@10).
# The magnitudes differ by model because each model's scores are on a different
# scale - presumably a document counts as retrieved only when its score exceeds
# the threshold; confirm in ir_evaluations.py.
bm25_threshold = 0.6
jm_lm_threshold = 0.000000000000000004
prm_threshold = 0.005

In [7]:
# Per-query precision for each model without a rank cut-off (top_k=None);
# the generic 'precision' column is renamed so the three frames can sit side by side.
bm25_precision = (
    calculate_precision(evaluations, BM25_results, bm25_threshold, top_k=None)
    .rename(columns={'precision': 'bm25_precision'})
)
jm_lm_precision = (
    calculate_precision(evaluations, JM_LM_results, jm_lm_threshold, top_k=None)
    .rename(columns={'precision': 'jm_lm_precision'})
)
prm_precision = (
    calculate_precision(evaluations, My_PRM_results, prm_threshold, top_k=None)
    .rename(columns={'precision': 'prm_precision'})
)

# One row per topic, one precision column per model
average_precision = (
    bm25_precision
    .merge(jm_lm_precision, on='topic')
    .merge(prm_precision, on='topic')
)
average_precision

Unnamed: 0,topic,bm25_precision,jm_lm_precision,prm_precision
0,101,0.444444,0.304348,0.625
1,102,0.669014,0.0,0.619048
2,103,0.181818,1.0,0.428571
3,104,0.594937,0.743056,0.421053
4,105,0.321429,0.0,0.0
5,106,0.25,0.090909,0.25
6,107,0.111111,0.0,0.133333
7,108,0.054054,0.0,0.0625
8,109,0.4,0.5,0.35
9,110,0.089286,0.333333,0.192308


## Precision @ 10

In [8]:
# Rank cut-off used for precision@k
top_k = 10

# Precision within the top_k ranked documents, per query and per model
bm25_precision_10 = (
    calculate_precision(evaluations, BM25_results, bm25_threshold, top_k=top_k)
    .rename(columns={'precision': f'bm25_precision@{top_k}'})
)
jm_lm_precision_10 = (
    calculate_precision(evaluations, JM_LM_results, jm_lm_threshold, top_k=top_k)
    .rename(columns={'precision': f'jm_lm_precision@{top_k}'})
)
prm_precision_10 = (
    calculate_precision(evaluations, My_PRM_results, prm_threshold, top_k=top_k)
    .rename(columns={'precision': f'prm_precision@{top_k}'})
)

# One row per topic, one precision@k column per model
precision_10 = (
    bm25_precision_10
    .merge(jm_lm_precision_10, on='topic')
    .merge(prm_precision_10, on='topic')
)
precision_10

Unnamed: 0,topic,bm25_precision@10,jm_lm_precision@10,prm_precision@10
0,101,0.5,0.5,0.5
1,102,0.6,0.6,0.4
2,103,0.5,0.6,0.5
3,104,0.1,1.0,0.3
4,105,0.0,0.8,0.0
5,106,0.2,0.3,0.2
6,107,0.2,0.2,0.2
7,108,0.0,0.2,0.1
8,109,0.1,0.6,0.3
9,110,0.2,0.3,0.4


## DCG @ 10

In [9]:
# Rank position at which DCG is evaluated
p = 10

# DCG at rank p, per query and per model
bm25_dcg_10 = (
    calculate_dcg(evaluations, BM25_results, bm25_threshold, p=p)
    .rename(columns={'DCG': f'bm25_DCG_p{p}'})
)
jm_lm_dcg_10 = (
    calculate_dcg(evaluations, JM_LM_results, jm_lm_threshold, p=p)
    .rename(columns={'DCG': f'jm_lm_DCG_p{p}'})
)
prm_dcg_10 = (
    calculate_dcg(evaluations, My_PRM_results, prm_threshold, p=p)
    .rename(columns={'DCG': f'prm_DCG_p{p}'})
)

# One row per topic, one DCG column per model
dcg_10 = (
    bm25_dcg_10
    .merge(jm_lm_dcg_10, on='topic')
    .merge(prm_dcg_10, on='topic')
)
dcg_10

Unnamed: 0,topic,bm25_DCG_p10,jm_lm_DCG_p10,prm_DCG_p10
0,101,2.987137,3.087914,3.120217
1,102,3.306035,0.0,2.173737
2,103,2.304666,1.0,3.39494
3,104,1.0,5.254495,1.116495
4,105,0.0,0.0,0.0
5,106,0.817529,1.93196,0.817529
6,107,1.356207,0.0,0.817529
7,108,0.0,0.0,0.386853
8,109,0.430677,3.804666,1.318813
9,110,0.786884,0.817529,1.972702


# Task 6: Recommendation

In [10]:
# Pairwise model comparisons (t-tests) on the per-topic average precision scores;
# the name list selects the model columns of `average_precision` to compare
precision_names = ['prm_precision', 'bm25_precision', 'jm_lm_precision']
precision_ttests = compare_models(average_precision, precision_names, 'avg_precision')

# Same comparisons with precision@10 as the evaluation metric
precision_10_names = ['prm_precision@10', 'bm25_precision@10', 'jm_lm_precision@10']
precision_10_ttests = compare_models(precision_10, precision_10_names, 'precision@10')

# Same comparisons with DCG10 as the evaluation metric
dcg10_names = ['prm_DCG_p10', 'bm25_DCG_p10', 'jm_lm_DCG_p10']
dcg10_ttests = compare_models(dcg_10, dcg10_names, 'DCG10')

# Line the three result sets up on their shared 'model_comparison' column
ttest_results = (
    precision_ttests
    .merge(precision_10_ttests, on='model_comparison')
    .merge(dcg10_ttests, on='model_comparison')
)

# Three decimal places keeps the table readable
ttest_results_rounded = ttest_results.round(3)
ttest_results_rounded

Unnamed: 0,model_comparison,t-statistic_avg_precision,p-value_avg_precision,t-statistic_precision@10,p-value_precision@10,t-statistic_DCG10,p-value_DCG10
0,PRM vs BM25,0.738,0.464,1.414,0.164,0.01,0.992
1,PRM vs JM_LM,1.233,0.224,-2.068,0.044,1.781,0.081
2,BM25 vs JM_LM,1.025,0.31,-2.791,0.007,1.821,0.075


## Interpretations

**Average precision**
- No significant difference between PRM and BM25 (p=0.464)
- No significant difference between PRM and JM_LM (p=0.224)
- No significant difference between BM25 and JM_LM (p=0.310)

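**Precision @ 10**
- No significant difference between PRM and BM25 (p=0.164)
- JM_LM outperforms PRM (t=-2.068, p=0.044)
- JM_LM outperforms BM25 (t=-2.791, p=0.007)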

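**DCG10**
- No significant difference between PRM and BM25 (p=0.992)
- PRM scores higher than JM_LM, but not significantly at the 0.05 level (t=1.781, p=0.081)
- BM25 scores higher than JM_LM, but not significantly at the 0.05 level (t=1.821, p=0.075)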