# Dependencies:

In [None]:
!pip install transformers torch

import os
import json
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt

import numpy as np
from tqdm.auto import tqdm




Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# 0. Loading data files

In [None]:
# PATHS
train_queries_file = "train_queries.json"
test_queries_file = "test_queries.json"
train_gold_mapping_file = "train_gold_mapping.json"
shuffled_pre_ranking_file = "shuffled_pre_ranking.json"
queries_content_file = "queries_content_with_features.json"
documents_content_file = "documents_content_with_features.json"

test_predictions_file = "prediction2.json"

if not all(os.path.exists(f) for f in [test_queries_file, shuffled_pre_ranking_file, queries_content_file, documents_content_file]):
    print("Error: One or more necessary data files for the test set are missing.")
    exit()
else:
    print("Necessary data files for the test set found.")

Necessary data files for the test set found.


In [None]:
# LOAD

def load_json_file(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

test_queries = load_json_file(test_queries_file)
print(f"Loaded {len(test_queries)} test queries.")

pre_ranking_test = load_json_file(shuffled_pre_ranking_file)
# filter pre-ranking to include only test queries (important!)
pre_ranking_test_filtered = {fan: docs for fan, docs in pre_ranking_test.items() if fan in test_queries}
print(f"Filtered pre-ranking to {len(pre_ranking_test_filtered)} test queries.")


queries_content = load_json_file(queries_content_file)
documents_content = load_json_file(documents_content_file)


if not os.path.exists("cross_encoder_reranking_train.py"):
    print("Error: The 'cross_encoder_reranking_train.py' script is missing. Please upload it.")
    exit()
else:
    print("Reranking script found.")

Loaded 10 test queries.
Filtered pre-ranking to 10 test queries.
Reranking script found.


# 1. Re-ranking the results with cross-encoder:

We will use the model ` "intfloat/e5-large-v2" `with different combinations of the documents parts like: `ta`, `tac1`....etc

In [None]:
base_dir ="/content" # adapt this base dir according to your folders  (/content when using google colab and the files are directly in /content)


## 1.1. E5 LARGE TA

In [None]:
# E5 LARGE ta

best_model_name = "intfloat/e5-large-v2"
best_text_type = "TA"
max_length = 512

print("\nRunning reranking on the test set...")
!python cross_encoder_reranking_train.py \
    --model_name "{best_model_name}" \
    --text_type "{best_text_type}" \
    --pre_ranking "{shuffled_pre_ranking_file}" \
    --queries_list "{test_queries_file}" \
    --queries_content "{queries_content_file}" \
    --documents_content "{documents_content_file}" \
    --output "{test_predictions_file}" \
    --max_length {max_length} \
    --base_dir "{base_dir}"

print(f"\nTest set predictions saved to: {test_predictions_file}")


Running reranking on the test set...
Loading training queries from test_queries.json...
Loaded 10 training queries
Loading pre-ranking data from shuffled_pre_ranking.json...
Filtered pre-ranking to 10 training queries
Loading query content from queries_content_with_features.json...
Loading document content from documents_content_with_features.json...
Loading model intfloat/e5-large-v2...
tokenizer_config.json: 100% 314/314 [00:00<00:00, 2.66MB/s]
vocab.txt: 100% 232k/232k [00:00<00:00, 5.93MB/s]
tokenizer.json: 100% 711k/711k [00:00<00:00, 15.3MB/s]
special_tokens_map.json: 100% 125/125 [00:00<00:00, 1.25MB/s]
config.json: 100% 616/616 [00:00<00:00, 4.77MB/s]
model.safetensors: 100% 1.34G/1.34G [00:05<00:00, 245MB/s]
Starting re-ranking process for training queries...
Processing queries:   0% 0/10 [00:00<?, ?it/s]
Re-ranking 30 documents for training query 103964109
Original pre-ranking (first 3): ['94596291', '65451984', '81098918']

Scoring documents:   0% 0/8 [00:00<?, ?it/s][A
Sc

**REMARK**: We rename the `Test set predictions saved to: prediction2.json`to `prediction2_e5Large_TA.json`

## 1.2. E5 LARGE tac1

In [None]:
# E5 LARGE tac1

best_model_name = "intfloat/e5-large-v2"
best_text_type = "tac1"
max_length = 512

print("\nRunning reranking on the test set...")
!python cross_encoder_reranking_train.py \
    --model_name "{best_model_name}" \
    --text_type "{best_text_type}" \
    --pre_ranking "{shuffled_pre_ranking_file}" \
    --queries_list "{test_queries_file}" \
    --queries_content "{queries_content_file}" \
    --documents_content "{documents_content_file}" \
    --output "{test_predictions_file}" \
    --max_length {max_length} \
    --base_dir "{base_dir}"

print(f"\nTest set predictions saved to: {test_predictions_file}")



Running reranking on the test set...
Loading training queries from test_queries.json...
Loaded 10 training queries
Loading pre-ranking data from shuffled_pre_ranking.json...
Filtered pre-ranking to 10 training queries
Loading query content from queries_content_with_features.json...
Loading document content from documents_content_with_features.json...
Loading model intfloat/e5-large-v2...
Starting re-ranking process for training queries...
Processing queries:   0% 0/10 [00:00<?, ?it/s]
Re-ranking 30 documents for training query 103964109
Original pre-ranking (first 3): ['94596291', '65451984', '81098918']

Scoring documents:   0% 0/8 [00:00<?, ?it/s][A
Scoring documents:  12% 1/8 [00:02<00:18,  2.59s/it][A
Scoring documents:  25% 2/8 [00:05<00:15,  2.58s/it][A
Scoring documents:  38% 3/8 [00:07<00:12,  2.57s/it][A
Scoring documents:  50% 4/8 [00:09<00:09,  2.27s/it][A
Scoring documents:  62% 5/8 [00:11<00:06,  2.19s/it][A
Scoring documents:  75% 6/8 [00:13<00:04,  2.04s/it][A
Sc

**REMARK**: We rename the `Test set predictions saved to: prediction2.json`to `prediction2_e5Large_TAC1.json`

## 1.3. E5 LARGE CLAIMS

In [None]:
# E5 LARGE CLAIMS

best_model_name = "intfloat/e5-large-v2"
best_text_type = "claims"
max_length = 512

print("\nRunning reranking on the test set...")
!python cross_encoder_reranking_train.py \
    --model_name "{best_model_name}" \
    --text_type "{best_text_type}" \
    --pre_ranking "{shuffled_pre_ranking_file}" \
    --queries_list "{test_queries_file}" \
    --queries_content "{queries_content_file}" \
    --documents_content "{documents_content_file}" \
    --output "{test_predictions_file}" \
    --max_length {max_length} \
    --base_dir "{base_dir}"

print(f"\nTest set predictions saved to: {test_predictions_file}")


Running reranking on the test set...
Loading training queries from test_queries.json...
Loaded 10 training queries
Loading pre-ranking data from shuffled_pre_ranking.json...
Filtered pre-ranking to 10 training queries
Loading query content from queries_content_with_features.json...
Loading document content from documents_content_with_features.json...
Loading model intfloat/e5-large-v2...
Starting re-ranking process for training queries...
Processing queries:   0% 0/10 [00:00<?, ?it/s]
Re-ranking 30 documents for training query 103964109
Original pre-ranking (first 3): ['94596291', '65451984', '81098918']

Scoring documents:   0% 0/8 [00:00<?, ?it/s][A
Scoring documents:  12% 1/8 [00:02<00:18,  2.68s/it][A
Scoring documents:  25% 2/8 [00:05<00:15,  2.63s/it][A
Scoring documents:  38% 3/8 [00:07<00:12,  2.52s/it][A
Scoring documents:  50% 4/8 [00:10<00:09,  2.46s/it][A
Scoring documents:  62% 5/8 [00:12<00:07,  2.45s/it][A
Scoring documents:  75% 6/8 [00:15<00:05,  2.51s/it][A
Sc

**REMARK**: We rename the `Test set predictions saved to: prediction2.json`to `prediction2_e5Large_CLAIMS.json`

## 1.4. MPNET TA

In [None]:
# MPNET TA

best_model_name = "sentence-transformers/all-mpnet-base-v2"

best_text_type = "TA"
max_length = 512

print("\nRunning reranking on the test set...")
!python cross_encoder_reranking_train.py \
    --model_name "{best_model_name}" \
    --text_type "{best_text_type}" \
    --pre_ranking "{shuffled_pre_ranking_file}" \
    --queries_list "{test_queries_file}" \
    --queries_content "{queries_content_file}" \
    --documents_content "{documents_content_file}" \
    --output "{test_predictions_file}" \
    --max_length {max_length} \
    --base_dir "{base_dir}"

print(f"\nTest set predictions saved to: {test_predictions_file}")


Running reranking on the test set...
Loading training queries from test_queries.json...
Loaded 10 training queries
Loading pre-ranking data from shuffled_pre_ranking.json...
Filtered pre-ranking to 10 training queries
Loading query content from queries_content_with_features.json...
Loading document content from documents_content_with_features.json...
Loading model sentence-transformers/all-mpnet-base-v2...
tokenizer_config.json: 100% 363/363 [00:00<00:00, 3.27MB/s]
vocab.txt: 100% 232k/232k [00:00<00:00, 5.33MB/s]
tokenizer.json: 100% 466k/466k [00:00<00:00, 892kB/s]
special_tokens_map.json: 100% 239/239 [00:00<00:00, 1.43MB/s]
config.json: 100% 571/571 [00:00<00:00, 4.45MB/s]
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
model.safetensors: 100% 438M/438M [00:01<00:00, 255MB/s]
Starting re-ranking proc

**REMARK**: We rename the `Test set predictions saved to: prediction2.json`to `prediction2_MPNET_TA.json`

## 1.5. MPNET CLAIMS

In [None]:
# MPNET CLAIMS

best_model_name = "sentence-transformers/all-mpnet-base-v2"

best_text_type = "claims"
max_length = 512

print("\nRunning reranking on the test set...")
!python cross_encoder_reranking_train.py \
    --model_name "{best_model_name}" \
    --text_type "{best_text_type}" \
    --pre_ranking "{shuffled_pre_ranking_file}" \
    --queries_list "{test_queries_file}" \
    --queries_content "{queries_content_file}" \
    --documents_content "{documents_content_file}" \
    --output "{test_predictions_file}" \
    --max_length {max_length} \
    --base_dir "{base_dir}"

print(f"\nTest set predictions saved to: {test_predictions_file}")


Running reranking on the test set...
Loading training queries from test_queries.json...
Loaded 10 training queries
Loading pre-ranking data from shuffled_pre_ranking.json...
Filtered pre-ranking to 10 training queries
Loading query content from queries_content_with_features.json...
Loading document content from documents_content_with_features.json...
Loading model sentence-transformers/all-mpnet-base-v2...
Starting re-ranking process for training queries...
Processing queries:   0% 0/10 [00:00<?, ?it/s]
Re-ranking 30 documents for training query 103964109
Original pre-ranking (first 3): ['94596291', '65451984', '81098918']

Scoring documents:   0% 0/8 [00:00<?, ?it/s][A
Scoring documents:  12% 1/8 [00:00<00:06,  1.03it/s][A
Scoring documents:  25% 2/8 [00:01<00:05,  1.08it/s][A
Scoring documents:  38% 3/8 [00:02<00:04,  1.11it/s][A
Scoring documents:  50% 4/8 [00:03<00:03,  1.13it/s][A
Scoring documents:  62% 5/8 [00:04<00:02,  1.13it/s][A
Scoring documents:  75% 6/8 [00:05<00:0

**REMARK**: We rename the `Test set predictions saved to: prediction2.json`to `prediction2_MPNET_CLAIMS.json`

## 1.5+. MPNET full (very bad)

In [None]:
# MPNET CLAIMS

best_model_name = "sentence-transformers/all-mpnet-base-v2"

best_text_type = "full"
max_length = 512

print("\nRunning reranking on the test set...")
!python cross_encoder_reranking_train.py \
    --model_name "{best_model_name}" \
    --text_type "{best_text_type}" \
    --pre_ranking "{shuffled_pre_ranking_file}" \
    --queries_list "{test_queries_file}" \
    --queries_content "{queries_content_file}" \
    --documents_content "{documents_content_file}" \
    --output "{test_predictions_file}" \
    --max_length {max_length} \
    --base_dir "{base_dir}"

print(f"\nTest set predictions saved to: {test_predictions_file}")


Running reranking on the test set...
Loading training queries from test_queries.json...
Loaded 10 training queries
Loading pre-ranking data from shuffled_pre_ranking.json...
Filtered pre-ranking to 10 training queries
Loading query content from queries_content_with_features.json...
Loading document content from documents_content_with_features.json...
Loading model sentence-transformers/all-mpnet-base-v2...
2025-04-08 20:40:35.933535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744144835.980489    4422 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744144835.992981    4422 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-08 20:40:36.034571

**REMARK**: We rename the `Test set predictions saved to: prediction2.json`to `prediction2_MPNET_full.json`

Reading truth and prediction
Checking Accuracy
Scores:
{'recall_at_3': 0.08055555555555556, 'recall_at_5': 0.10555555555555556, 'recall_at_10': 0.2595238095238095, 'recall_at_20': 0.7492063492063492, 'mean_rank': 14.783333333333331, 'mean_inv_rank': 0.10828456299601233, 'mean_average_precision': 0.19408953742409024}
Scoring completed

## 1.5++. MPNET description

In [108]:
# MPNET description

best_model_name = "sentence-transformers/all-mpnet-base-v2"

best_text_type = "description"
max_length = 512

print("\nRunning reranking on the test set...")
!python cross_encoder_reranking_train.py \
    --model_name "{best_model_name}" \
    --text_type "{best_text_type}" \
    --pre_ranking "{shuffled_pre_ranking_file}" \
    --queries_list "{test_queries_file}" \
    --queries_content "{queries_content_file}" \
    --documents_content "{documents_content_file}" \
    --output "{test_predictions_file}" \
    --max_length {max_length} \
    --base_dir "{base_dir}"

print(f"\nTest set predictions saved to: {test_predictions_file}")


Running reranking on the test set...
Loading training queries from test_queries.json...
Loaded 10 training queries
Loading pre-ranking data from shuffled_pre_ranking.json...
Filtered pre-ranking to 10 training queries
Loading query content from queries_content_with_features.json...
Loading document content from documents_content_with_features.json...
Loading model sentence-transformers/all-mpnet-base-v2...
2025-04-09 00:40:54.119862: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744159254.153456   61864 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744159254.163426   61864 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 00:40:54.212405

**REMARK**: We rename the `Test set predictions saved to: prediction2.json`to `prediction2_MPNET_description.json`

## 1.5 +++. MPNET description + features


In [130]:
best_model_name = "sentence-transformers/all-mpnet-base-v2"

best_text_type = "description_features"
max_length = 512

print("\nRunning reranking on the test set...")
!python cross_encoder_reranking_train.py \
    --model_name "{best_model_name}" \
    --text_type "{best_text_type}" \
    --pre_ranking "{shuffled_pre_ranking_file}" \
    --queries_list "{test_queries_file}" \
    --queries_content "{queries_content_file}" \
    --documents_content "{documents_content_file}" \
    --output "{test_predictions_file}" \
    --max_length {max_length} \
    --base_dir "{base_dir}"

print(f"\nTest set predictions saved to: {test_predictions_file}")


Running reranking on the test set...
Loading training queries from test_queries.json...
Loaded 10 training queries
Loading pre-ranking data from shuffled_pre_ranking.json...
Filtered pre-ranking to 10 training queries
Loading query content from queries_content_with_features.json...
Loading document content from documents_content_with_features.json...
Loading model sentence-transformers/all-mpnet-base-v2...
2025-04-09 02:12:07.679515: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744164727.715707   83684 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744164727.725714   83684 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 02:12:07.773526

**REMARK**: We rename the `Test set predictions saved to: prediction2.json`to `prediction2_MPNET_DescFeat.json`

## 1.6. BGE TA

In [None]:
# BGE TA

best_model_name = "BAAI/bge-large-en"

best_text_type = "tac1"
max_length = 512

print("\nRunning reranking on the test set...")
!python cross_encoder_reranking_train.py \
    --model_name "{best_model_name}" \
    --text_type "{best_text_type}" \
    --pre_ranking "{shuffled_pre_ranking_file}" \
    --queries_list "{test_queries_file}" \
    --queries_content "{queries_content_file}" \
    --documents_content "{documents_content_file}" \
    --output "{test_predictions_file}" \
    --max_length {max_length}

print(f"\nTest set predictions saved to: {test_predictions_file}")


Running reranking on the test set...
Loading training queries from test_queries.json...
Loaded 10 training queries
Loading pre-ranking data from shuffled_pre_ranking.json...
Filtered pre-ranking to 10 training queries
Loading query content from queries_content_with_features.json...
Loading document content from documents_content_with_features.json...
Loading model BAAI/bge-large-en...
2025-04-07 16:57:57.396788: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744045077.475814  135585 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744045077.496851  135585 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-07 16:57:57.597356: I tensorflow/core/pl

## 1.7. GEMINI try (an attempts)

For this model there are some limitations

In [None]:
# GEMINI TRY

import time
import os
import json
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from google import genai

# Replace with your actual Gemini API key
GEMINI_API_KEY = "ton api"

# Initialize Gemini Client
client = genai.Client(api_key=GEMINI_API_KEY)
embedding_model_name = "gemini-embedding-exp-03-07"

# Define the paths to your data files
train_queries_file = "train_queries.json"
test_queries_file = "test_queries.json"
train_gold_mapping_file = "train_gold_mapping.json"
shuffled_pre_ranking_file = "shuffled_pre_ranking.json"
queries_content_file = "queries_content_with_features.json"
documents_content_file = "documents_content_with_features.json"

# Define the output file for test predictions
test_predictions_file = "predictions_gemini_exp.json"

# Check if necessary data files exist
if not all(os.path.exists(f) for f in [test_queries_file, shuffled_pre_ranking_file, queries_content_file, documents_content_file]):
    print("Error: One or more necessary data files for the test set are missing.")
    exit()
else:
    print("Necessary data files for the test set found.")

def load_json_file(file_path):
    """Load JSON data from a file"""
    with open(file_path, 'r') as f:
        return json.load(f)

def save_json_file(data, file_path):
    """Save data to a JSON file"""
    with open(file_path, 'w') as f:
        json.dump(data, f, indent=2)

def load_content_data(file_path):
    """Load content data from a JSON file"""
    with open(file_path, 'r') as f:
        data = json.load(f)
    # Create a dictionary mapping FAN to Content
    content_dict = {item['FAN']: item['Content'] for item in data}
    return content_dict

def extract_text(content_dict, text_type="full"):
    """Extract text from patent content based on text_type"""
    if text_type == "TA" or text_type == "title_abstract":
        title = content_dict.get("title", "")
        abstract = content_dict.get("pa01", "")
        return f"{title} {abstract}".strip()
    elif text_type == "claims":
        claims = " ".join([v for k, v in content_dict.items() if k.startswith('c-')])
        return claims.strip()
    elif text_type == "description":
        description = " ".join([v for k, v in content_dict.items() if k.startswith('p')])
        return description.strip()
    elif text_type == "full":
        all_text = []
        if "title" in content_dict:
            all_text.append(content_dict["title"])
        if "pa01" in content_dict:
            all_text.append(content_dict["pa01"])
        for key, value in content_dict.items():
            if key not in ["title", "pa01"]:
                all_text.append(value)
        return " ".join(all_text).strip()
    elif text_type == "tac1":
        title = content_dict.get("title", "")
        abstract = content_dict.get("pa01", "")
        first_claim = next((v for k, v in content_dict.items() if k.startswith('c-')), "")
        return f"{title} {abstract} {first_claim}".strip()
    return ""

def get_embedding_gemini(text_list, batch_size=1):  # Adjust batch_size
    """Get embeddings for a list of texts using the specified Gemini embedding model with batching."""
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        try:
            result = client.models.embed_content(
                model=embedding_model_name,
                contents=batch,  # Send a list of texts
            )
            if result.embeddings:
                for embedding in result.embeddings:
                    embeddings.append(embedding.values)
            else:
                print(f"Warning: No embeddings returned for the batch starting at index {i}")
                return None
            time.sleep(1)  # Add a small delay to avoid rate limits
        except Exception as e:
            print(f"Error getting Gemini embeddings for batch starting at index {i}: {e}")
            return None
    return embeddings

# Load test queries and pre-ranking
test_queries = load_json_file(test_queries_file)
print(f"Loaded {len(test_queries)} test queries.")
pre_ranking_test = load_json_file(shuffled_pre_ranking_file)
pre_ranking_test_filtered = {fan: docs for fan, docs in pre_ranking_test.items() if fan in test_queries}
print(f"Filtered pre-ranking to {len(pre_ranking_test_filtered)} test queries.")

# Load content data
queries_content = load_content_data(queries_content_file)
documents_content = load_content_data(documents_content_file)

# Rerank using Gemini embeddings
re_ranked_predictions = {}
best_text_type = "claims"  # You can experiment with other text types

print("\nStarting reranking process for test queries using Gemini Embedding Model...")
for query_fan, pre_ranked_docs in tqdm(pre_ranking_test_filtered.items(), desc="Processing queries"):
    if query_fan not in queries_content:
        print(f"Warning: Query FAN {query_fan} not found in content.")
        re_ranked_predictions[query_fan] = pre_ranked_docs  # Keep original ranking
        continue

    query_text = extract_text(queries_content[query_fan], best_text_type)
    doc_texts = []
    doc_fans = []
    for doc_fan in pre_ranked_docs:
        if doc_fan in documents_content:
            doc_texts.append(extract_text(documents_content[doc_fan], best_text_type))
            doc_fans.append(doc_fan)
        else:
            print(f"Warning: Document FAN {doc_fan} not found in content.")

    if not doc_texts:
        re_ranked_predictions[query_fan] = []
        continue

    all_texts = [query_text] + doc_texts
    embeddings = get_embedding_gemini(all_texts)

    if embeddings and len(embeddings) == len(all_texts):
        query_embedding = embeddings[0]
        doc_embeddings = embeddings[1:]
        similarity_scores = cosine_similarity([query_embedding], doc_embeddings)[0]
        ranked_indices = np.argsort(similarity_scores)[::-1]
        re_ranked_predictions[query_fan] = [doc_fans[i] for i in ranked_indices]
    else:
        print(f"Warning: Could not get embeddings for query {query_fan}. Keeping original ranking.")
        re_ranked_predictions[query_fan] = pre_ranked_docs

# Save the re-ranked predictions
save_json_file(re_ranked_predictions, test_predictions_file)
print(f"\nTest set predictions saved to: {test_predictions_file}")


Necessary data files for the test set found.
Loaded 10 test queries.
Filtered pre-ranking to 10 test queries.

Starting reranking process for test queries using Gemini Embedding Model...


Processing queries:  10%|█         | 1/10 [00:12<01:53, 12.65s/it]

Error getting Gemini embeddings for batch starting at index 6: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}


Processing queries:  40%|████      | 4/10 [00:13<00:12,  2.04s/it]

Error getting Gemini embeddings for batch starting at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}
Error getting Gemini embeddings for batch starting at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}
Error getting Gemini embeddings for batch starting at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}


Processing queries:  60%|██████    | 6/10 [00:13<00:04,  1.11s/it]

Error getting Gemini embeddings for batch starting at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}
Error getting Gemini embeddings for batch starting at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}
Error getting Gemini embeddings for batch starting at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}


Processing queries: 100%|██████████| 10/10 [00:13<00:00,  1.35s/it]

Error getting Gemini embeddings for batch starting at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}
Error getting Gemini embeddings for batch starting at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}
Error getting Gemini embeddings for batch starting at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}

Test set predictions saved to: predictions_gemini_exp.json

Remember to download '{test_predictions_file}' and submit it to Codabench.





In [None]:
!pip install fuzzywuzzy
import os
import json
import math
from collections import Counter
from fuzzywuzzy import fuzz  # pip install fuzzywuzzy
from nltk.corpus import wordnet  # pip install nltk (and download wordnet data: import nltk; nltk.download('wordnet'))

# PATHS
train_queries_file = "train_queries.json"
test_queries_file = "test_queries.json"
train_gold_mapping_file = "train_gold_mapping.json"
shuffled_pre_ranking_file = "shuffled_pre_ranking.json"
queries_content_file = "queries_content_with_features.json"
documents_content_file = "documents_content_with_features.json"
test_predictions_file = "creative_predictions.json"

def load_json_file(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

if not all(os.path.exists(f) for f in [test_queries_file, shuffled_pre_ranking_file, queries_content_file, documents_content_file, train_gold_mapping_file, queries_content_file, documents_content_file, train_queries_file]):
    print("Error: One or more necessary data files are missing.")
    exit()
else:
    print("Necessary data files found.")

test_queries = load_json_file(test_queries_file)
pre_ranking_test = load_json_file(shuffled_pre_ranking_file)
pre_ranking_test_filtered = {fan: docs for fan, docs in pre_ranking_test.items() if fan in test_queries}
train_gold_mapping = load_json_file(train_gold_mapping_file)
train_queries = load_json_file(train_queries_file)

# --- Load and Transform queries_content ---
queries_content_list = load_json_file(queries_content_file)
queries_content = {}
for item in queries_content_list:
    patent_id = item.get("patent_id")
    if patent_id:
        queries_content[patent_id] = item
print(f"Loaded and processed {len(queries_content)} query content items.")

# --- Load and Transform documents_content ---
documents_content_list = load_json_file(documents_content_file)
documents_content = {}
for item in documents_content_list:
    patent_id = item.get("patent_id")
    if patent_id:
        documents_content[patent_id] = item
print(f"Loaded and processed {len(documents_content)} document content items.")

queries_content_train = {k: v for k, v in queries_content.items() if k in train_queries}
documents_content_train = documents_content

def prepare_feature_weights_tfidf(train_gold_mapping, queries_content, documents_content):
    doc_feature_counts = {}
    all_docs = {**queries_content, **documents_content}
    total_num_docs = len(all_docs)

    for doc_id, content in all_docs.items():
        features = content.get('features', [])
        doc_feature_counts[doc_id] = Counter(features)

    feature_doc_frequency = Counter()
    for doc_id, counts in doc_feature_counts.items():
        for feature in counts:
            feature_doc_frequency[feature] += 1

    feature_weights_tfidf = {}
    for doc_id, counts in doc_feature_counts.items():
        for feature, count in counts.items():
            tf = count / (sum(counts.values()) + 1e-6)
            idf = math.log(total_num_docs / (feature_doc_frequency[feature] + 1) + 1e-6)
            feature_weights_tfidf[feature] = feature_weights_tfidf.get(feature, 0) + tf * idf

    return dict(feature_weights_tfidf)

def calculate_feature_similarity_improved(query_features, doc_features, feature_weights_tfidf=None):
    score = 0

    # 1. TF-IDF Weighted Overlap
    if feature_weights_tfidf:
        common_features = set(query_features) & set(doc_features)
        for feature in common_features:
            score += feature_weights_tfidf.get(feature, 0)

    # 2. Fuzzy Matching
    fuzzy_score = 0
    for q_feature in query_features:
        for d_feature in doc_features:
            ratio = fuzz.ratio(q_feature, d_feature)
            if ratio > 85:  # Increased threshold
                fuzzy_score += ratio / 100.0 * 0.2  # Reduced weight

    score += fuzzy_score

    # 3. N-gram Overlap (bi-grams)
    def get_ngrams(text, n):
        n_grams = set()
        words = text.split()
        for i in range(len(words) - n + 1):
            n_grams.add(" ".join(words[i:i+n]))
        return n_grams

    ngram_overlap_score = 0
    for q_feature in query_features:
        for d_feature in doc_features:
            q_2grams = get_ngrams(q_feature, 2)
            d_2grams = get_ngrams(d_feature, 2)
            overlap = len(q_2grams & d_2grams)
            union = len(q_2grams | d_2grams)
            if union > 0:
                ngram_overlap_score += overlap / union * 0.1  # Jaccard-like

    score += ngram_overlap_score

    return score

def creative_reranking(pre_ranking, queries_content, documents_content, feature_weights_tfidf=None):
    ranked_results = {}
    for query_id, initial_ranking in pre_ranking.items():
        if query_id in queries_content:
            query_features = queries_content[query_id].get('features', [])
            scored_documents = []
            for doc_id in initial_ranking:
                if doc_id in documents_content:
                    doc_features = documents_content[doc_id].get('features', [])
                    similarity_score = calculate_feature_similarity_improved(
                        query_features,
                        doc_features,
                        feature_weights_tfidf=feature_weights_tfidf
                    )
                    scored_documents.append((doc_id, similarity_score))
            scored_documents.sort(key=lambda item: item[1], reverse=True)
            ranked_results[query_id] = [doc_id for doc_id, score in scored_documents]
        else:
            ranked_results[query_id] = initial_ranking
    return ranked_results

# --- Prepare Feature Weights using TF-IDF ---
feature_weights_tfidf = prepare_feature_weights_tfidf(train_gold_mapping, queries_content_train, documents_content_train)

# --- Perform Creative Reranking on Test Set ---
reranked_predictions = creative_reranking(
    pre_ranking_test_filtered,
    queries_content,
    documents_content,
    feature_weights_tfidf=feature_weights_tfidf
)

# --- Save Predictions ---
with open(test_predictions_file, 'w') as f:
    json.dump(reranked_predictions, f, indent=4)

print(f"\nCreative test set predictions saved to: {test_predictions_file}")

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0




Necessary data files found.
Loaded and processed 0 query content items.
Loaded and processed 0 document content items.

Creative test set predictions saved to: creative_predictions.json


# 2. Spars ranking with tf-idf vectorizer (from task 1)

## 2.1. Utilis

In [119]:
def load_json_data(file_path):
    with open(file_path, "r") as file:
        contents = json.load(file)
    return contents


def create_tfidf_matrix(citing_dataset, nonciting_dataset, vectorizer=TfidfVectorizer()):
    """
    Creates TF-IDF matrix for the given citing and non-citing datasets based on the specified text column.

    Parameters:
    citing_dataset (json)): DataFrame containing citing patents.
    nonciting_dataset (json): DataFrame containing non-citing patents.
    vectorizer (TfidfVectorizer, optional): TfidfVectorizer object for vectorizing text data.
                                             Defaults to TfidfVectorizer().

    Returns:
    tuple: A tuple containing TF-IDF matrices for citing and non-citing patents respectively.
           (tfidf_matrix_citing, tfidf_matrix_nonciting)
    """
    all_text = [patent['text'] for patent in citing_dataset + nonciting_dataset]

    # Vectorizing descriptions
    print("Vectorizing descriptions...")
    tfidf_matrix = vectorizer.fit_transform(tqdm(all_text, desc="TF-IDF"))

    # Since we're interested in similarities between citing and cited patents,
    # we need to split the TF-IDF matrix back into two parts
    split_index = len(citing_dataset)
    tfidf_matrix_citing = tfidf_matrix[:split_index]
    tfidf_matrix_nonciting = tfidf_matrix[split_index:]

    # Size of vocabulary
    print("Size of vocabulary:", len(vectorizer.vocabulary_))

    return tfidf_matrix_citing, tfidf_matrix_nonciting


# a new function to create the copus
import re

def create_corpus(corpus, text_type):
    """
    Extracts text data from a corpus based on the specified text type.

    Parameters:
    corpus (list): List of dictionaries representing documents.
    text_type (str): Type of text to extract ('title', 'abstract', 'claim1', 'claims', 'description', 'fulltext',
                     'features', 'TAC1', 'TA', 'TAC1F', 'description_F').

    Returns:
    list: List of dictionaries with 'id' and 'text' keys representing each document in the corpus.
    """
    import re  # Ensure re is imported

    valid_app_ids = []
    texts = []
    cnt = 0  # Count of documents without required text
    ids_to_remove = []

    for docBloc in corpus:
        doc_id = docBloc.get('FAN')
        if doc_id is None:
            continue

        doc = docBloc['Content']
        valid = False
        text = ''

        if text_type == 'title':
            if 'title' in doc:
                text = doc['title']
                valid = True
        elif text_type == 'abstract':
            if 'pa01' in doc:
                text = doc['pa01']
                valid = True
        elif text_type == 'claim1':
            if 'c-en-0001' in doc:
                text = doc['c-en-0001']
                valid = True
        elif text_type == 'claims':
            claims = [doc[k] for k in doc if k.startswith('c-en-')]
            if claims:
                text = ' '.join(claims)
                valid = True
        elif text_type == 'description':
            desc_parts = [doc[k] for k in doc if re.match(r'^p\d{4}$', k)]
            if desc_parts:
                text = ' '.join(desc_parts)
                valid = True
        elif text_type == 'fulltext':
            text_parts = [str(doc[k]) for k in doc if k != 'id']
            if text_parts:
                text = ' '.join(text_parts)
                valid = True
        elif text_type == 'features':
            try:
                features_dict = doc['features']
                features = list(features_dict.values())
                if features:
                    text = ' '.join(features)
                    valid = True
                else:
                    raise KeyError  # Treat empty features as missing
            except KeyError:
                print("an error !")
        elif text_type == 'TAC1':
            required = ['title', 'pa01', 'c-en-0001']
            if all(k in doc for k in required):
                text = ' '.join([doc['title'], doc['pa01'], doc['c-en-0001']])
                valid = True
        elif text_type == 'TA':
            required = ['title', 'pa01']
            if all(k in doc for k in required):
                text = ' '.join([doc['title'], doc['pa01']])
                valid = True
        elif text_type == 'TAC1F':
            required = ['title', 'pa01', 'c-en-0001', 'features']
            if all(k in doc for k in required):
                text_parts = [doc['title'], doc['pa01'], doc['c-en-0001']]
                try:
                    features_dict = doc['features']
                    features = list(features_dict.values())
                    if features:
                        text_parts.append(' '.join(features))
                        text = ' '.join(text_parts)
                        valid = True
                    else:
                        raise KeyError
                except KeyError:
                    print("an error !")
        elif text_type == 'description_F':
            desc_parts = [doc[k] for k in doc if re.match(r'^p\d{4}$', k)]
            try:
                features_dict = doc['features']
                features = list(features_dict.values())
                if features:
                    text = ' '.join(desc_parts) + ' ' + ' '.join(features)
                    valid = True
                else:
                    raise KeyError
            except KeyError:
                print("an error !")
        else:
            raise ValueError("Invalid text type")

        if valid:
            valid_app_ids.append(doc_id)
            texts.append(text)
        else:
            cnt += 1
            ids_to_remove.append(doc_id)

    # Print statistics
    print(f"Number of documents without {text_type}: {cnt}")
    if ids_to_remove:
        print(f"Removing {len(ids_to_remove)} documents without required text")

    # Create the corpus data
    corpus_data = [{'id': app_id, 'text': text} for app_id, text in zip(valid_app_ids, texts)]

    return corpus_data



def top_k_ranks(citing, cited, cosine_similarities, k=10):
    # Create a dictionary to store the top k ranks for each citing patent
    top_k_ranks = {}
    for i, content_id in enumerate(citing):
        top_k_ranks[content_id['FAN']] = [cited[j]['FAN'] for j in np.argsort(cosine_similarities[i])[::-1][:k]]
    return top_k_ranks


## 2.2 Upload the data

In [84]:
import json

with open("train_queries.json") as f:
    train_queries = json.load(f)

with open("train_gold_mapping.json") as f:
    train_gold_mapping = json.load(f)

with open("shuffled_pre_ranking.json") as f:
    pre_ranking = json.load(f)

with open("queries_content_with_features.json") as f:
    queries_content = json.load(f)

with open("documents_content_with_features.json") as f:
    docs_content = json.load(f)


**important recall**

The data we need for this task are:

| Dataset                | Role                          | Example Use Case                                                                 |
|------------------------|-------------------------------|----------------------------------------------------------------------------------|
| `json_citing_train`    | Training queries              | Fit TF-IDF/BM25 models and generate rankings for evaluation.                     |
| `json_citing_test`     | Test queries                  | Final evaluation (unseen during training).                                       |
| `json_nonciting`       | Retrieval corpus              | Search space for finding relevant patents.                                       |
| `json_citing_to_cited` | Ground-truth mappings (train) | Validate if top-100 retrieved patents include the true cited patents from `json_nonciting`. |


## 2.3. Creation of the tf-idf vectorizer

### 2.3.1. Description + features

In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# the function

doc_corpus_description_F = create_corpus(docs_content, text_type='description_F')
query_corpus_description_F = create_corpus(queries_content, text_type='description_F')


Number of documents without description_F: 0
Number of documents without description_F: 0


In [120]:
# description + features  Text TF_IDF (training data)

# Create TF-IDF vectorizer
tfidf_vectorizer1 = TfidfVectorizer(
    stop_words='english',
    max_features=60000,
    ngram_range=(1, 1),
    strip_accents='unicode',
    sublinear_tf= True
    )

In [121]:
# apply tfidf
tfidf_matrix_docs, tfidf_matrix_queries = create_tfidf_matrix(doc_corpus_description_F, query_corpus_description_F, tfidf_vectorizer1)

Vectorizing descriptions...


TF-IDF:   0%|          | 0/930 [00:00<?, ?it/s]

Size of vocabulary: 44097


In [122]:
# show the shaps
print(tfidf_matrix_docs.shape)
print(tfidf_matrix_queries.shape)

(900, 44097)
(30, 44097)


In [123]:
# computing the cosin sim
cosine_similarities = cosine_similarity(tfidf_matrix_queries, tfidf_matrix_docs)
cosine_similarities


array([[0.15815984, 0.11145126, 0.15740679, ..., 0.19005678, 0.1404056 ,
        0.1199359 ],
       [0.14495315, 0.12965319, 0.16803329, ..., 0.14603708, 0.11912082,
        0.11616464],
       [0.15546634, 0.1466232 , 0.20957066, ..., 0.15128853, 0.15313469,
        0.15032352],
       ...,
       [0.18621809, 0.18497118, 0.18615221, ..., 0.12256428, 0.13582251,
        0.11543172],
       [0.0850097 , 0.08299926, 0.09220746, ..., 0.101946  , 0.08674313,
        0.06535791],
       [0.19998599, 0.17811146, 0.28175976, ..., 0.2326327 , 0.29343263,
        0.26685374]])

In [124]:
top_k_rank = top_k_ranks(queries_content, docs_content, cosine_similarities, k=10)

In [125]:
with open('prediction2_tfidf_descF.json', 'w') as f:
    json.dump(top_k_rank, f)

### 2.3.2. Descrition

In [131]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# the function

doc_corpus_description_F = create_corpus(docs_content, text_type='description')
query_corpus_description_F = create_corpus(queries_content, text_type='description')

# description + features  Text TF_IDF (training data)
tfidf_matrix_docs, tfidf_matrix_queries = create_tfidf_matrix(doc_corpus_description_F, query_corpus_description_F, tfidf_vectorizer1)

# Create TF-IDF vectorizer
tfidf_vectorizer1 = TfidfVectorizer(
    stop_words='english',
    max_features=60000,
    ngram_range=(1, 1),
    strip_accents='unicode',
    sublinear_tf= True
    )

# computing the cosin sim
cosine_similarities = cosine_similarity(tfidf_matrix_queries, tfidf_matrix_docs)
cosine_similarities

top_k_rank = top_k_ranks(queries_content, docs_content, cosine_similarities, k=10)

with open('prediction2_tfidf_desc.json', 'w') as f:
    json.dump(top_k_rank, f)

Number of documents without description: 0
Number of documents without description: 0
Vectorizing descriptions...


TF-IDF:   0%|          | 0/930 [00:00<?, ?it/s]

Size of vocabulary: 43913


# 3. Algorithim RRF

In [None]:
from collections import defaultdict

def reciprocal_rank_fusion(rankings_list, k=60):
    """
    Combines multiple ranked lists using Reciprocal Rank Fusion

    Args:
        rankings_list (list of dict): List of dictionaries where each dict has
            {query_id: [ordered list of document IDs]}
        k (int): Smoothing parameter (typically 60 by default // a value used by the community)

    Returns:
        dict: {query_id: [ordered list of merged document IDs]}
    """
    fused_rankings = {}

    # Get all unique query IDs
    query_ids = set.intersection(*[set(r.keys()) for r in rankings_list])

    for qid in query_ids:
        doc_scores = defaultdict(float)

        # Calculate RRF scores for each ranking
        for ranking in rankings_list:
            ranked_docs = ranking[qid]
            for rank_pos, doc_id in enumerate(ranked_docs):
                doc_scores[doc_id] += 1 / (k + rank_pos + 1)  # +1 because ranks are 0-indexed

        # Sort documents by descending RRF score
        sorted_docs = sorted(doc_scores.items(), key=lambda x: -x[1])

        # Extract ordered document IDs
        fused_rankings[qid] = [doc_id for doc_id, score in sorted_docs]

    return fused_rankings

def load_ranking(file_path):
    """
    Load a ranking from a JSON file.
    """
    with open(file_path, 'r') as f:
        return json.load(f)


## 3.1. tfidf_descF + MPNET_description

In [None]:

sparse_rankings = load_ranking("prediction2_tfidf_descF.json")
dense_rankings = load_ranking("prediction2_MPNET_description.json")

combined_ranking = reciprocal_rank_fusion([dense_rankings, sparse_rankings], k=10)


# ... (previous code)

# Iterate over the query IDs and their corresponding rankings
for query_id, ranked_docs in combined_ranking.items():
  print(f"Query ID: {query_id}")
  for i, doc_id in enumerate(ranked_docs):
      print(f"Rank {i+1}: {doc_id}") # Print rank and document ID


# Save fused results if needed
output_file = "rrf_prediction2.json"
with open(output_file, 'w') as f:
    json.dump(combined_ranking, f)
print(f"\nSaved fused results to {output_file}")

Reading truth and prediction
Checking Accuracy
Scores:
{'recall_at_3': 0.30873015873015874, 'recall_at_5': 0.34523809523809523, 'recall_at_10': 0.37063492063492065, 'recall_at_20': 0.6797619047619048, 'mean_rank': 14.105952380952383, 'mean_inv_rank': 0.19188916357316593, 'mean_average_precision': 0.27379514070307864}
Scoring completed

## 3.2. tfidf_descF + MPNET_DescFeat

In [132]:

sparse_rankings = load_ranking("prediction2_tfidf_descF.json")
dense_rankings = load_ranking("prediction2_MPNET_DescFeat.json")

combined_ranking = reciprocal_rank_fusion([dense_rankings, sparse_rankings], k=10)


# ... (previous code)

# Iterate over the query IDs and their corresponding rankings
for query_id, ranked_docs in combined_ranking.items():
  print(f"Query ID: {query_id}")
  for i, doc_id in enumerate(ranked_docs):
      print(f"Rank {i+1}: {doc_id}") # Print rank and document ID


# Save fused results if needed
output_file = "rrf_prediction2_tfidfDescF_MPNETDescFeat.json"
with open(output_file, 'w') as f:
    json.dump(combined_ranking, f)
print(f"\nSaved fused results to {output_file}")

Query ID: 103964109
Rank 1: 87488738
Rank 2: 87285519
Rank 3: 104761777
Rank 4: 110338873
Rank 5: 94546339
Rank 6: 101598636
Rank 7: 105078785
Rank 8: 102035322
Rank 9: 84923580
Rank 10: 44437432
Rank 11: 96138054
Rank 12: 92631163
Rank 13: 91358966
Rank 14: 74364787
Rank 15: 89655285
Rank 16: 70494531
Rank 17: 86686331
Rank 18: 93007218
Rank 19: 65451984
Rank 20: 85915967
Rank 21: 81098918
Rank 22: 93196199
Rank 23: 94596291
Rank 24: 1662314
Rank 25: 82807300
Rank 26: 74999904
Rank 27: 73189654
Rank 28: 101974338
Rank 29: 112489610
Rank 30: 91801222
Query ID: 75800075
Rank 1: 34284570
Rank 2: 84214328
Rank 3: 76825949
Rank 4: 75692075
Rank 5: 7588356
Rank 6: 81692381
Rank 7: 74966633
Rank 8: 70999237
Rank 9: 62288211
Rank 10: 43687538
Rank 11: 34173412
Rank 12: 64972313
Rank 13: 73305870
Rank 14: 35300504
Rank 15: 81704710
Rank 16: 87092702
Rank 17: 93085483
Rank 18: 86183849
Rank 19: 22823110
Rank 20: 71238892
Rank 21: 77197418
Rank 22: 62194904
Rank 23: 73750287
Rank 24: 77144269
Ra

## 3.3. tfidf_desc + MPNET_description MPNET_DescFeat

In [136]:

sparse_rankings = load_ranking("prediction2_tfidf_desc.json")
dense_rankings = load_ranking("prediction2_MPNET_description.json")

combined_ranking = reciprocal_rank_fusion([dense_rankings, sparse_rankings], k=10)


# ... (previous code)

# Iterate over the query IDs and their corresponding rankings
for query_id, ranked_docs in combined_ranking.items():
  print(f"Query ID: {query_id}")
  for i, doc_id in enumerate(ranked_docs):
      print(f"Rank {i+1}: {doc_id}") # Print rank and document ID


# Save fused results if needed
output_file = "rrf_prediction2_tfidfDesc_MPNETDesc.json"
with open(output_file, 'w') as f:
    json.dump(combined_ranking, f)
print(f"\nSaved fused results to {output_file}")

Query ID: 103964109
Rank 1: 87488738
Rank 2: 87285519
Rank 3: 104761777
Rank 4: 110338873
Rank 5: 94546339
Rank 6: 101598636
Rank 7: 105078785
Rank 8: 102035322
Rank 9: 84923580
Rank 10: 44437432
Rank 11: 96138054
Rank 12: 92631163
Rank 13: 91358966
Rank 14: 74364787
Rank 15: 89655285
Rank 16: 70494531
Rank 17: 86686331
Rank 18: 93007218
Rank 19: 65451984
Rank 20: 85915967
Rank 21: 81098918
Rank 22: 93196199
Rank 23: 94596291
Rank 24: 1662314
Rank 25: 82807300
Rank 26: 74999904
Rank 27: 73189654
Rank 28: 101974338
Rank 29: 112489610
Rank 30: 91801222
Query ID: 75800075
Rank 1: 34284570
Rank 2: 84214328
Rank 3: 76825949
Rank 4: 75692075
Rank 5: 7588356
Rank 6: 81692381
Rank 7: 74966633
Rank 8: 70999237
Rank 9: 62288211
Rank 10: 43687538
Rank 11: 34173412
Rank 12: 64972313
Rank 13: 73305870
Rank 14: 35300504
Rank 15: 81704710
Rank 16: 87092702
Rank 17: 93085483
Rank 18: 86183849
Rank 19: 22823110
Rank 20: 71238892
Rank 21: 77197418
Rank 22: 62194904
Rank 23: 73750287
Rank 24: 77144269
Ra

## 3.4. tfidf_desc + MPNET_DescFeat

In [135]:

sparse_rankings = load_ranking("prediction2_tfidf_desc.json")
dense_rankings = load_ranking("prediction2_MPNET_DescFeat.json")

combined_ranking = reciprocal_rank_fusion([dense_rankings, sparse_rankings], k=10)

# Iterate over the query IDs and their corresponding rankings
for query_id, ranked_docs in combined_ranking.items():
  print(f"Query ID: {query_id}")
  for i, doc_id in enumerate(ranked_docs):
      print(f"Rank {i+1}: {doc_id}") # Print rank and document ID


# Save fused results if needed
output_file = "rrf_prediction2_tfidfDesc_MPNETDescFeat.json"
with open(output_file, 'w') as f:
    json.dump(combined_ranking, f)
print(f"\nSaved fused results to {output_file}")

Query ID: 103964109
Rank 1: 87488738
Rank 2: 87285519
Rank 3: 104761777
Rank 4: 110338873
Rank 5: 94546339
Rank 6: 101598636
Rank 7: 105078785
Rank 8: 102035322
Rank 9: 84923580
Rank 10: 44437432
Rank 11: 96138054
Rank 12: 92631163
Rank 13: 91358966
Rank 14: 74364787
Rank 15: 89655285
Rank 16: 70494531
Rank 17: 86686331
Rank 18: 93007218
Rank 19: 65451984
Rank 20: 85915967
Rank 21: 81098918
Rank 22: 93196199
Rank 23: 94596291
Rank 24: 1662314
Rank 25: 82807300
Rank 26: 74999904
Rank 27: 73189654
Rank 28: 101974338
Rank 29: 112489610
Rank 30: 91801222
Query ID: 75800075
Rank 1: 34284570
Rank 2: 84214328
Rank 3: 76825949
Rank 4: 75692075
Rank 5: 7588356
Rank 6: 81692381
Rank 7: 74966633
Rank 8: 70999237
Rank 9: 62288211
Rank 10: 43687538
Rank 11: 34173412
Rank 12: 64972313
Rank 13: 73305870
Rank 14: 35300504
Rank 15: 81704710
Rank 16: 87092702
Rank 17: 93085483
Rank 18: 86183849
Rank 19: 22823110
Rank 20: 71238892
Rank 21: 77197418
Rank 22: 62194904
Rank 23: 73750287
Rank 24: 77144269
Ra