In [1]:
!pip install python-terrier
!pip install gradio
import pyterrier as pt
import pandas as pd
pt.init()

Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [2]:
!rm -rf ./index

# Make sure this dataset exist before running
df = pd.read_csv("cleaned_dataset.csv")

# Make sure the dataset contains columns of 'docno' and 'text'
if 'docno' not in df.columns or 'text' not in df.columns:
    raise ValueError("CSV file must include columns of 'docno' 和 'text' ")

# Transfer dataframe to dictionary form
doc_iter = df.to_dict(orient="records")

# Create Indexer
indexer = pt.IterDictIndexer("./index", meta={'docno': 20, 'text': 4096}, fields=['text'])

# Genrate Index
indexref = indexer.index(doc_iter)

# load index and print it
index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

Number of documents: 3009
Number of terms: 11620
Number of postings: 164596
Number of fields: 1
Number of tokens: 253171
Field names: [text]
Positions:   false



In [3]:
import gradio as gr

# The search function
def search_bm25(query):
    """Rank documents for ``query`` with BM25 and return the first 10 rows.

    Parameters
    ----------
    query : str or None
        Free-text query. ``None``, empty and whitespace-only values are
        treated as "no query".

    Returns
    -------
    pandas.DataFrame
        The first 10 rows of the retrieval result, or a one-cell frame with
        the column ``message`` when no query was given.
    """
    if not (query or "").strip():
        # Always hand back a DataFrame: the result ends up in a Gradio
        # dataframe output, which treats a bare string as a path to a CSV
        # file rather than as a message to display.
        return pd.DataFrame({"message": ["Please enter a query."]})
    result_df = pt.BatchRetrieve(indexref, wmodel="BM25").search(query)
    return result_df.head(10)  # only the first 10 rows are shown in the interface

def search_bm25f(query):
    """Rank documents for ``query`` with BM25F and return the first 10 rows.

    Parameters
    ----------
    query : str or None
        Free-text query. ``None``, empty and whitespace-only values are
        treated as "no query".

    Returns
    -------
    pandas.DataFrame
        The first 10 rows of the retrieval result, or a one-cell frame with
        the column ``message`` when no query was given.
    """
    if not (query or "").strip():
        # Always hand back a DataFrame: the result ends up in a Gradio
        # dataframe output, which treats a bare string as a path to a CSV
        # file rather than as a message to display.
        return pd.DataFrame({"message": ["Please enter a query."]})
    # NOTE(review): the index built earlier in this notebook has a single
    # field ('text'), so 'w.1' presumably refers to a field that does not
    # exist and has no effect — confirm, and drop it or index a second field.
    result_df = pt.BatchRetrieve(indexref, wmodel="BM25F", properties = {'w.0': 0.5, 'w.1': 1}).search(query)
    return result_df.head(10)  # only the first 10 rows are shown in the interface

def search(query, method):
    """Dispatch a query to the retrieval function selected in the interface.

    Parameters
    ----------
    query : str or None
        Free-text query. ``None``, empty and whitespace-only values are
        treated as "no query".
    method : str or None
        ``"BM25"`` or ``"BM25F"``; any other value (including ``None`` when
        nothing is selected) is reported as invalid.

    Returns
    -------
    pandas.DataFrame
        The selected function's result, or a one-cell frame with the column
        ``message`` describing why no search was run.
    """
    # This function feeds a Gradio dataframe output, which treats a bare
    # string as a path to a CSV file; messages are therefore wrapped in a
    # DataFrame so they are displayed instead of causing a load error.
    if not (query or "").strip():
        return pd.DataFrame({"message": ["Please enter a query."]})

    if method == "BM25":
        return search_bm25(query)
    if method == "BM25F":
        return search_bm25f(query)
    return pd.DataFrame({"message": ["Invalid method selected."]})

# Interface
# Build the web interface: a query box plus a ranking-model selector feeding
# `search`, with the result rendered as a table.
iface = gr.Interface(
    fn=search,
    inputs=[
        "text",  # free-text query box
        # Pre-select BM25 so the first submit does not send method=None.
        gr.Radio(["BM25", "BM25F"], value="BM25", label="Choose Search Method"),
    ],
    outputs="dataframe",  # results table
    title="News Search Engine",
    # The search functions return the first 10 rows, so the description says 10.
    description="Enter a query and choose a search method. Returns the top 10 results."
)

# share=True publishes the app on a temporary public URL; anyone with the link
# can query the index while the notebook is running.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f5209644c4157e8763.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


