In [None]:
# The draft GUI of Search Engine
import tkinter as tk
from tkinter import ttk

# Create the main application window
root = tk.Tk()
root.title("Search Engine")

# Create the query input box
entry = tk.Entry(root, width=50)
entry.pack(pady=10)

# Create the results heading label. It is packed and then immediately
# un-packed, so it stays hidden until on_search() packs it again.
title_label = tk.Label(root, text="", font=("Arial", 14, "bold"))
title_label.pack(pady=5)
title_label.pack_forget()

# Module-level state shared with the on_search() callback
tree = None  # results Treeview; None until the first search creates it
user_query = "" # most recent query typed by the user

# Mock of the search engine: returns placeholder results
def mock_search_engine(query):
    """Return placeholder search results until a real engine is plugged in.

    Args:
        query: The user's query string. Accepted for interface compatibility
            but not used by this mock.

    Returns:
        dict: Ten entries keyed "related text 1" .. "related text 10", each
        mapped to a similarity score that decreases by 0.1 per entry
        (0.9 for the first, 0.0 for the last).
    """
    results = {}
    for rank in range(1, 11):
        # Round to two decimals so floating-point noise does not show up
        results[f"related text {rank}"] = round(1 - rank * 0.1, 2)
    return results

# Callback for the Search button (also bound to the Return key)
def on_search(event=None):
    """Run a search for the text in the entry box and display the results.

    Serves both as the Search button command (called with no arguments) and
    as the <Return> key binding (called with the Tk event object), which is
    why ``event`` is optional; its value is ignored.

    Side effects:
        Updates the module-level ``user_query`` and ``tree``, shows the title
        label, replaces any previous results table, and prints the query.
        Does nothing when the entry is empty or whitespace-only.
    """
    global tree, user_query
    user_query = entry.get().strip()

    if not user_query:  # ignore empty / whitespace-only input
        return

    # Query the search engine (mock_search_engine stands in for the real one)
    search_results = mock_search_engine(user_query)

    # Update the heading and make it visible. pady is given again because
    # pack_forget() discarded the options of the label's original pack() call.
    title_label.config(text=f"『The result of search {user_query}』")
    title_label.pack(pady=5)

    # Remove the table left over from the previous search, if there is one
    if tree is not None:
        tree.destroy()

    # One row per column: (column id, also used as heading text; width; anchor)
    column_specs = (
        ("Order", 50, "center"),
        ("Text", 200, "w"),
        ("Similarity", 100, "center"),
    )
    column_ids = tuple(name for name, _width, _anchor in column_specs)

    # Build a fresh Treeview showing only the heading row and data rows
    tree = ttk.Treeview(root, columns=column_ids, show="headings")
    for name, width, anchor in column_specs:
        tree.heading(name, text=name)
        tree.column(name, width=width, anchor=anchor)

    # Insert the results in the order returned, numbering them from 1
    for idx, (text, similarity) in enumerate(search_results.items(), start=1):
        tree.insert("", "end", values=(idx, text, similarity))

    tree.pack(pady=10)

    print(f"The user query is: {user_query}")  # debug output

# Create the Search button; clicking it calls on_search()
button = tk.Button(root, text="Search", command=on_search)
button.pack(pady=10)

# Bind the Return key on the root window to the same callback
root.bind("<Return>", on_search)

# Start the Tk event loop to run the GUI
root.mainloop()

In [1]:
# download PyTerrier
!pip install python-terrier




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pyterrier as pt
import pandas as pd

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used by this notebook: the "punkt" package for
# word_tokenize and the "stopwords" corpus
for resource in ("punkt", "stopwords"):
    nltk.download(resource)

# English stopwords as a set, for fast membership tests in preprocess()
stop_words = set(stopwords.words("english"))

# Define Preprocess function
def preprocess(text):
    """Lowercase ``text``, tokenize it and drop stopwords.

    Args:
        text: The raw string to clean.

    Returns:
        str: The remaining tokens joined by single spaces, in their original
        order. Tokens are compared against ``stop_words`` after lowercasing.
    """
    # Lowercase first so the stopword comparison is case-insensitive
    tokens = word_tokenize(text.lower())

    # Keep only the tokens that are not stopwords
    kept = filter(lambda token: token not in stop_words, tokens)

    # Re-assemble the surviving tokens into one space-separated string
    return " ".join(kept)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\郭丁恺\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\郭丁恺\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the original dataset (Reuter_test.csv) with all of its columns and
# preview the first 10 rows
df = pd.read_csv("Reuter_test.csv")
df.head(10)

Unnamed: 0,text,text_type,topics,lewis_split,cgis_split,old_id,new_id,places,people,orgs,exchanges,date,title
0,Mounting trade friction between the\nU.S. And ...,"""NORM""",['trade'],"""TEST""","""TRAINING-SET""","""3809""","""14826""",['hong-kong' 'usa' 'japan' 'taiwan' 'malaysia'...,[],[],[],8-APR-1987 01:03:47.52,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
1,A survey of 19 provinces and seven cities\nsho...,"""NORM""",['grain'],"""TEST""","""TRAINING-SET""","""3811""","""14828""",['china'],[],[],[],8-APR-1987 01:19:17.29,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STOCKS
2,The Ministry of International Trade and\nIndus...,"""NORM""",['crude' 'nat-gas'],"""TEST""","""TRAINING-SET""","""4356""","""14829""",['japan'],[],[],[],8-APR-1987 01:22:17.25,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
3,Thailand's trade deficit widened to 4.5\nbilli...,"""NORM""",['trade' 'grain' 'rice' 'corn' 'sugar' 'tin' '...,"""TEST""","""TRAINING-SET""","""3815""","""14832""",['thailand'],[],[],[],8-APR-1987 01:45:09.09,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER
4,Indonesia expects crude palm oil (CPO)\nprices...,"""NORM""",['veg-oil' 'palm-oil'],"""TEST""","""TRAINING-SET""","""3816""","""14833""",['indonesia' 'malaysia'],[],[],[],8-APR-1987 01:48:20.11,INDONESIA SEES CPO PRICE RISING SHARPLY
5,,"""BRIEF""",[],"""TEST""","""TRAINING-SET""","""3818""","""14835""",[],[],[],[],8-APR-1987 02:11:57.43,Japan four-year note auction average yield rec...
6,"Tug crews in New South Wales (NSW),\nVictoria ...","""NORM""",['ship'],"""TEST""","""TRAINING-SET""","""3822""","""14839""",['australia'],[],[],[],8-APR-1987 02:42:14.86,AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...
7,The Indonesian Commodity Exchange is\nlikely t...,"""NORM""",['veg-oil' 'palm-oil' 'lumber' 'coffee' 'rubber'],"""TEST""","""TRAINING-SET""","""4358""","""14840""",['indonesia' 'south-korea' 'taiwan'],[],[],[],8-APR-1987 02:56:07.22,INDONESIAN COMMODITY EXCHANGE MAY EXPAND
8,Food Department officials said the U.S.\nDepar...,"""NORM""",['grain' 'wheat'],"""TEST""","""TRAINING-SET""","""3824""","""14841""",['sri-lanka' 'usa'],[],[],[],8-APR-1987 03:09:59.77,SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE
9,Western Mining Corp Holdings Ltd\n&lt;WMNG.S> ...,"""NORM""",['gold'],"""TEST""","""TRAINING-SET""","""3825""","""14842""",['australia'],[],[],[],8-APR-1987 03:21:39.74,WESTERN MINING TO OPEN NEW GOLD MINE IN AUSTRALIA


In [5]:
# Build the cleaned dataset
# Load only the columns needed later; old_id is renamed to "docno", the
# document-id column name the indexing cell expects
df = pd.read_csv("Reuter_test.csv", usecols=["title", "text", "old_id"])
df = df.rename(columns={"old_id": "docno"})
new_order = ["docno", "title", "text"]
df = df[new_order]

# Drop rows whose text or title is missing (NaN)
df = df.dropna(subset=["text", "title"])

# Preprocess the text and title fields
df['text'] = df['text'].apply(preprocess)
df['title'] = df['title'].apply(preprocess)

# Save the cleaned dataset. index=False keeps the DataFrame's row index out
# of the file; otherwise it comes back as a spurious "Unnamed: 0" column when
# the CSV is read again.
df.to_csv("cleaned_dataset.csv", index=False)

# Display the first 10 rows
df.head(10)

Unnamed: 0,docno,title,text
0,"""3809""",asian exporters fear damage u.s.-japan rift,mounting trade friction u.s. japan raised fear...
1,"""3811""",china daily says vermin eat 7-12 pct grain stocks,survey 19 provinces seven cities showed vermin...
2,"""4356""",japan revise long-term energy demand downwards,ministry international trade industry ( miti )...
3,"""3815""",thai trade deficit widens first quarter,thailand 's trade deficit widened 4.5 billion ...
4,"""3816""",indonesia sees cpo price rising sharply,indonesia expects crude palm oil ( cpo ) price...
6,"""3822""",australian foreign ship ban ends nsw ports hit,"tug crews new south wales ( nsw ) , victoria w..."
7,"""4358""",indonesian commodity exchange may expand,indonesian commodity exchange likely start tra...
8,"""3824""",sri lanka gets usda approval wheat price,food department officials said u.s. department...
9,"""3825""",western mining open new gold mine australia,western mining corp holdings ltd & lt ; wmng.s...
10,"""3826""",sumitomo bank aims quick recovery merger,sumitomo bank ltd & lt ; sumi.t > certain lose...


In [6]:
# index
# Read every column as text and keep empty cells as "" instead of NaN. With
# the defaults, a document whose text became empty after preprocessing (or a
# value such as "na" / "null") would be read back as a float NaN and handed to
# the indexer in place of a string.
df = pd.read_csv("cleaned_dataset.csv", dtype=str, keep_default_na=False)

# Make sure the CSV contains the 'docno' and 'text' columns
if 'docno' not in df.columns or 'text' not in df.columns:
    raise ValueError("CSV 文件必须包含 'docno' 和 'text' 两列")

# Convert the DataFrame into a list of per-document dicts
doc_iter = df.to_dict(orient="records")

# Create the indexer; docno and text are stored as metadata with the given
# maximum lengths. overwrite=True lets this cell be re-run after ./index has
# already been created by an earlier run.
# NOTE(review): the recorded run of this cell failed with a JVM
# NoClassDefFoundError for org/terrier/python/PTUtils. That looks like a
# Java / PyTerrier installation problem rather than a problem in this code --
# confirm the environment before relying on this cell.
indexer = pt.IterDictIndexer("./index", meta={'docno': 20, 'text': 4096}, overwrite=True)

# Build the index
indexref = indexer.index(doc_iter)

# Load the index and print its collection statistics
index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

JavaException: JVM exception occurred: org/terrier/python/PTUtils java.lang.NoClassDefFoundError