In [None]:
!pip install python-terrier
!pip install gradio
import pyterrier as pt
import pandas as pd
pt.init()



Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [None]:
!rm -rf ./index

# Make sure this dataset exist before running
df = pd.read_csv("cleaned_dataset.csv")

# Make sure the dataset contains columns of 'docno' and 'text'
if 'docno' not in df.columns or 'text' not in df.columns:
    raise ValueError("CSV file must include columns of 'docno' 和 'text' ")

# Transfer dataframe to dictionary form
doc_iter = df.to_dict(orient="records")

# Create Indexer
indexer = pt.IterDictIndexer("./index", meta={'docno': 20, 'title': 512, 'text': 4096}, text_attrs=['title', 'text'],fields=True)

# Genrate Index
indexref = indexer.index(doc_iter)

# load index and print it
index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

Number of documents: 3009
Number of terms: 12984
Number of postings: 173368
Number of fields: 2
Number of tokens: 273340
Field names: [title, text]
Positions:   false



In [None]:
import gradio as gr

# The search function
def search_bm25(query):
    """Retrieve with BM25 and keep only the leading rows of the result.

    A blank or whitespace-only query short-circuits with a prompt string.
    Otherwise a fresh retriever is built over ``indexref`` and the first
    10 rows of the frame it returns are handed back.
    """
    trimmed = query.strip()
    if len(trimmed) == 0:
        return "Please enter a query."

    retriever = pt.BatchRetrieve(indexref, wmodel="BM25")
    hits = retriever.search(query)  # the query is searched as typed, untrimmed
    return hits.iloc[:10]  # only the first 10 rows are shown in the interface

def search_bm25f(query):
    """Retrieve with BM25F and keep only the leading rows of the result.

    A blank or whitespace-only query short-circuits with a prompt string.
    Otherwise a fresh BM25F retriever is built over ``indexref`` with the
    'w.0' / 'w.1' properties below, and the first 10 rows of its result
    frame are returned.
    """
    trimmed = query.strip()
    if len(trimmed) == 0:
        return "Please enter a query."

    # NOTE(review): 'w.0' and 'w.1' presumably weight fields 0 and 1 of the
    # index (title and text, per the indexing cell's output) - confirm.
    field_props = {'w.0': 0.5, 'w.1': 1}
    retriever = pt.BatchRetrieve(indexref, wmodel="BM25F", properties=field_props)
    hits = retriever.search(query)  # the query is searched as typed, untrimmed
    return hits.iloc[:10]  # only the first 10 rows are shown in the interface

def search(query, method):
    """Dispatch a query to the selected retrieval model for the Gradio UI.

    Args:
        query: Free-text query typed by the user.
        method: "BM25" or "BM25F". Any other value (including ``None``,
            e.g. when no radio option was chosen) is reported as invalid.

    Returns:
        The DataFrame returned by ``search_bm25`` / ``search_bm25f``, or a
        one-row DataFrame with a single ``message`` column when the request
        cannot be searched.
    """
    def _notice(text):
        # The interface declares a dataframe output, and Gradio interprets a
        # bare string given to a dataframe component as a path to a CSV file.
        # Wrapping the text in a one-cell frame lets the user actually see it.
        return pd.DataFrame({"message": [text]})

    # `not query` also covers a missing (None) query, not just blank text.
    if not query or not query.strip():
        return _notice("Please enter a query.")

    if method == "BM25":
        return search_bm25(query)
    elif method == "BM25F":
        return search_bm25f(query)
    else:
        return _notice("Invalid method selected.")

# Interface
iface = gr.Interface(
    fn=search,
    inputs=[
        "text",  # Free-text query box
        # Pre-select BM25 so a first submission made without touching the
        # radio does not reach `search` with method=None.
        gr.Radio(["BM25", "BM25F"], value="BM25", label="Choose Search Method"),
    ],
    outputs="dataframe",  # Results are rendered as a table
    title="News Search Engine",
    # Both search functions keep only the first 10 rows, so advertise 10.
    description="Enter a query and choose a search method. Returns the top 10 results.",
)

# share=True requests a temporary public URL for the app; anyone holding the
# link can query the index while it is live.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1af62165e33903870f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


