# In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier


class TextClassifierGUI:
    """Tkinter front end that trains a TF-IDF + KNN classifier and queries it.

    Workflow driven by the buttons:

    1. ``select_file`` stores the path of an Excel workbook.
    2. ``train_model`` reads the workbook (columns '标题', '内容', '链接'),
       tokenizes title + content with jieba, derives a numeric class code
       from the link column and fits a ``KNeighborsClassifier`` on TF-IDF
       features.
    3. ``classify_text`` runs the text typed into the entry through the same
       preprocessing and shows the predicted class name.
    """

    # (substring searched for in the link, class code), checked in this
    # order; the first match wins.  Links matching nothing get code 0.
    LINK_LABEL_RULES = (('dq', 1), ('xq', 2), ('rhq', 3), ('my', 4))

    # Class code -> name shown to the user.  Code 0 (no rule matched) has
    # no display name and is shown as the bare code.
    LABEL_NAMES = {1: '东区新闻', 2: '西区新闻', 3: '仁和区新闻', 4: '米易新闻'}

    # Columns the training workbook must provide.
    REQUIRED_COLUMNS = ('标题', '内容', '链接')

    # One stopword per line, UTF-8, resolved against the working directory.
    STOPWORD_PATH = "./stopword.txt"

    def __init__(self, master):
        """Build the widgets inside *master* and reset the model state."""
        self.master = master
        master.title("文本分类GUI")

        # Button that opens the file chooser for the Excel workbook.
        self.file_path = ''
        self.file_button = tk.Button(master, text="选择文件", command=self.select_file)
        self.file_button.grid(row=0, column=0)

        # Button that starts training.
        self.train_button = tk.Button(master, text="开始训练", command=self.train_model)
        self.train_button.grid(row=0, column=1)

        # Entry for the text to classify.
        self.text_label = tk.Label(master, text="输入文本：")
        self.text_label.grid(row=1, column=0)
        self.text_entry = tk.Entry(master)
        self.text_entry.grid(row=1, column=1)

        # Button that classifies the entered text.
        self.classify_button = tk.Button(master, text="分类", command=self.classify_text)
        self.classify_button.grid(row=2, column=1)

        # Output area for the classification result.
        self.result_label = tk.Label(master, text="分类结果：")
        self.result_label.grid(row=3, column=0)
        self.result_text = tk.Text(master, width=50, height=10)
        self.result_text.grid(row=4, column=0, columnspan=2)

        # Model state; populated by train_model().
        self.stopwords = []
        self._stopword_set = set()  # same words as self.stopwords, O(1) lookup
        self.model = None
        self.vectorizer = None
        self.labels = None

    def select_file(self):
        """Ask the user for a file and remember its path ('' if cancelled)."""
        chosen = filedialog.askopenfilename()
        # A cancelled dialog yields an empty value (an empty tuple on some Tk
        # builds); normalize it so train_model() can test it directly.
        self.file_path = chosen or ''

    def train_model(self):
        """Read the selected workbook and fit the vectorizer and KNN model.

        Shows a warning dialog and leaves any previously trained model
        untouched when no file is selected, the file or the stopword list
        cannot be found, required columns are missing, or no usable
        training text remains.
        """
        if not self.file_path:
            messagebox.showwarning("警告", "请先选择要读取的Excel文件！")
            return
        try:
            data = pd.read_excel(self.file_path)
        except FileNotFoundError:
            messagebox.showwarning("警告", "请先选择要读取的Excel文件！")
            return

        missing = [col for col in self.REQUIRED_COLUMNS if col not in data.columns]
        if missing:
            messagebox.showwarning("警告", "Excel文件缺少必要的列：" + "、".join(missing))
            return

        # NOTE(review): the first data row is discarded, exactly as the
        # previous implementation did — presumably a secondary header row;
        # confirm against the real spreadsheet.  The index is reset so later
        # steps never depend on the leftover 1-based labels.
        data = data[1:].reset_index(drop=True)
        if data.empty:
            messagebox.showwarning("警告", "Excel文件中没有可用于训练的数据！")
            return

        # Load the stopword list.
        try:
            with open(self.STOPWORD_PATH, "r", encoding="utf-8") as f:
                self.stopwords = [line.strip() for line in f]
        except FileNotFoundError:
            messagebox.showwarning("警告", "未找到停用词文件 stopword.txt！")
            return
        self._stopword_set = set(self.stopwords)

        # Merge title and body; missing cells become '' so jieba always
        # receives a string.
        combined = (
            data['标题'].fillna('').astype(str)
            + ' '
            + data['内容'].fillna('').astype(str)
        )
        data['text'] = combined.apply(self._preprocess)

        # Derive the class code of every row from its link.
        data['label'] = data['链接'].apply(self._label_from_link)

        # TF-IDF vectorization.  Fit into locals first so a failure cannot
        # leave a new vectorizer paired with an old model.
        vectorizer = TfidfVectorizer(max_features=5000)
        try:
            x_train = vectorizer.fit_transform(data['text'].tolist())
        except ValueError:
            # Raised by scikit-learn when the vocabulary ends up empty.
            messagebox.showwarning("警告", "Excel文件中没有可用于训练的数据！")
            return

        # KNN training.  The default of 5 neighbours is kept, capped at the
        # number of samples so prediction also works on tiny data sets.
        model = KNeighborsClassifier(n_neighbors=min(5, len(data)))
        model.fit(x_train, data['label'])

        self.vectorizer = vectorizer
        self.model = model
        # Mapping from class code to display name used by classify_text().
        self.labels = dict(self.LABEL_NAMES)

        messagebox.showinfo("训练完成", "文本分类模型已训练完毕！")

    def classify_text(self):
        """Predict the class of the entered text and display its name."""
        if self.model is None:
            messagebox.showwarning("警告", "请先训练分类模型！")
            return

        document = self._preprocess(self.text_entry.get())
        if not document:
            messagebox.showwarning("警告", "请输入要分类的文本！")
            return

        # transform() expects an iterable of documents: pass ONE document
        # made of the space-joined tokens, not the token list itself (which
        # would be vectorized as one document per token).
        x_test = self.vectorizer.transform([document])
        code = int(self.model.predict(x_test)[0])

        # Codes without a display name (e.g. 0) are shown as the bare code.
        label = self.labels.get(code, str(code))
        self.result_text.delete("1.0", "end")
        self.result_text.insert("end", label)

    def _preprocess(self, text):
        """Return *text* as space-joined jieba tokens with stopwords removed."""
        return ' '.join(
            token for token in self.tokenize(text)
            if token not in self._stopword_set
        )

    @classmethod
    def _label_from_link(cls, link):
        """Return the class code for *link* per LINK_LABEL_RULES, else 0.

        Non-string values (such as NaN from an empty cell) yield 0.
        """
        if not isinstance(link, str):
            return 0
        for marker, code in cls.LINK_LABEL_RULES:
            if marker in link:
                return code
        return 0

    @staticmethod
    def tokenize(text):
        """Segment *text* with jieba and return the cleaned token list.

        Each token is stripped of surrounding whitespace; empty tokens and
        tokens consisting only of digits are dropped.  Punctuation tokens
        are not removed here.
        """
        tokens = []
        for token in jieba.cut(text):
            token = token.strip()
            if token != '' and not token.isdigit():
                tokens.append(token)
        return tokens


if __name__ == '__main__':
    # Script entry point: create the top-level Tk window, attach the
    # classifier GUI to it and enter the Tk event loop.
    window = tk.Tk()
    app = TextClassifierGUI(window)
    window.mainloop()
