In [35]:
import pandas as pd
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# 忽略 UndefinedMetricWarning 警告
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

# 加载数据
data = pd.read_excel(r"E:\Desktop\自有品分类_二稿2.xlsx")
data = data[["商品名称","一级类目","二级类目"]]

# 去除【】及其内的文字
data['商品名称'] = data['商品名称'].apply(lambda x: re.sub(r'【.*?】', '', x))
data=data[data["二级类目"].notna()]
# 删除重复项
data.drop_duplicates(subset=['商品名称'], inplace=True)
# 分词并去除停用词
stopwords = set(line.strip() for line in open(r'E:\wb\data\stopwords.txt', encoding='utf-8')) # 需要自行准备停用词表


def tokenize_and_remove_stopwords(text):
    words = jieba.lcut(text)
    return [word for word in words if word not in stopwords]

data['tokenized_name'] = data['商品名称'].apply(tokenize_and_remove_stopwords)

# 将分词结果重新组合成字符串
data['processed_name'] = data['tokenized_name'].apply(lambda x: ' '.join(x))

# 使用TF-IDF将文本数据转换为数值型特征
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['processed_name'])
y = data['一级类目']

# 划分训练集和测试集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from joblib import dump

# 定义模型
models = {
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# 训练并评估每个模型
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"Model: {name}")
    print(classification_report(y_test, predictions))
    
    # 保存表现最好的模型（这里假设随机森林效果最好）
    if name == "Random Forest":
        best_model = model
        dump(best_model, 'random_forest_model.joblib')
        dump(vectorizer, 'tfidf_vectorizer.joblib')

Model: Naive Bayes
              precision    recall  f1-score   support

        业务推广       1.00      0.42      0.59        24
        中学教辅       0.85      0.36      0.51        61
        儿童读物       0.93      0.96      0.95       345
        学习用品       0.00      0.00      0.00        11
        学前启蒙       0.93      0.68      0.78        37
        小学教辅       0.81      0.99      0.89       453
         工具书       1.00      0.18      0.31        11
        控笔练字       1.00      0.82      0.90        33
        数学思维       0.97      0.78      0.86        40
        玩具用品       0.00      0.00      0.00         3
        益智游戏       1.00      0.55      0.71        20
        阅读写作       0.89      0.91      0.90       138
        高中教辅       0.00      0.00      0.00         6

    accuracy                           0.87      1182
   macro avg       0.72      0.51      0.57      1182
weighted avg       0.87      0.87      0.86      1182

Model: SVM
              precision    recall  f1-score   sup

In [37]:
from joblib import load

# 加载保存的模型和向量化器
loaded_model = load('random_forest_model.joblib')
loaded_vectorizer = load('tfidf_vectorizer.joblib')

# 准备新数据
new_data = ["2024一本·小学语文寒假阅读4年级"]
new_data_processed = [' '.join(tokenize_and_remove_stopwords(item)) for item in new_data]
new_X = loaded_vectorizer.transform(new_data_processed)

# 进行预测
predictions = loaded_model.predict(new_X)
print(predictions)

['假期衔接']


In [38]:
# import pandas as pd
# import re
# import jieba
# from sklearn.feature_extraction.text import TfidfVectorizer
# from joblib import load

# 加载模型和向量化器
loaded_model = load('random_forest_model.joblib')
loaded_vectorizer = load('tfidf_vectorizer.joblib')

# 重新读取原始数据
data = pd.read_excel(r"E:\Desktop\自有品分类_二稿2.xlsx")

# 去除【】及其内的文字
def remove_brackets(text):
    return re.sub(r'【.*?】', '', text)

# 分词并去除停用词
stopwords = set(line.strip() for line in open(r'E:\wb\data\stopwords.txt', encoding='utf-8')) # 需要自行准备停用词表


def tokenize_and_remove_stopwords(text):
    words = jieba.lcut(text)
    return [word for word in words if word not in stopwords]

# 对商品名称列进行处理
data['processed_name'] = data['商品名称'].apply(remove_brackets).apply(tokenize_and_remove_stopwords)
data['processed_name'] = data['processed_name'].apply(lambda x: ' '.join(x))

# 使用保存的向量化器转换文本数据
X_new = loaded_vectorizer.transform(data['processed_name'])

# 进行预测
predictions = loaded_model.predict(X_new)

# 将预测结果添加到新的列中
data['预测一级类目'] = predictions

# 导出更新后的表格
output_path = r"E:\Desktop\自有品分类_预测结果1.xlsx"
data.to_excel(output_path, index=False)

print(f"预测完成，结果已保存到 {output_path}")

预测完成，结果已保存到 E:\Desktop\自有品分类_预测结果1.xlsx
