In [8]:
import jieba
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter  # 导入Counter类

# Load the stopword list
def get_stopwords(stop_file_name):
    """Read a UTF-8 stopword file and return its lines as a list.

    Args:
        stop_file_name: Path of the stopword file, one entry per line.

    Returns:
        A list with one string per line of the file, each stripped of
        leading and trailing whitespace. Blank lines yield empty strings.
    """
    with open(stop_file_name, "r", encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]

# Character cleaning
def text_cleaning(text):
    """Return `text` with every character outside U+4E00..U+9FA5 removed.

    Args:
        text: The input string.

    Returns:
        A string made of the characters of `text`, in their original order,
        whose code point lies in the inclusive range U+4E00..U+9FA5.
    """
    return ''.join(ch for ch in text if '\u4e00' <= ch <= '\u9fa5')

# Data preprocessing
def co_data(dataset_path, stopwords):
    """Read the dataset file and return segmented texts with their labels.

    Each line is stripped and split on the "_!_" separator. Lines with fewer
    than four fields are skipped. Field index 2 is used as the label and
    field index 3 as the raw text, which is cleaned with `text_cleaning`,
    segmented with `jieba.cut` (cut_all=False), filtered against the
    stopwords and re-joined with single spaces.

    Args:
        dataset_path: Path of the UTF-8 encoded dataset file.
        stopwords: Collection of tokens to drop after segmentation.

    Returns:
        A tuple `(texts, labels)` of two equally long lists: the
        space-joined token strings and the corresponding label strings.
    """
    # The membership test below runs once per token over the whole corpus;
    # a set keeps each lookup O(1) instead of scanning a list every time.
    stopword_set = set(stopwords)

    labels = []
    texts = []

    with open(dataset_path, "r", encoding="utf-8") as file:
        # Read everything up front so tqdm knows the total line count.
        lines = file.readlines()

    for line in tqdm(lines):
        parts = line.strip().split("_!_")
        if len(parts) < 4:  # skip lines that do not match the expected format
            continue
        labels.append(parts[2])
        cleaned_text = text_cleaning(parts[3])
        seg = jieba.cut(cleaned_text, cut_all=False)
        tokens = [token for token in seg if token not in stopword_set]
        # Space-joined so CountVectorizer can tokenize the result.
        texts.append(' '.join(tokens))

    return texts, labels

# Build the label <-> index mappings
def co_labeldict(labels):
    """Assign a consecutive integer id to every distinct label.

    Ids start at 0 and follow the order in which each label first appears
    in `labels`.

    Args:
        labels: Iterable of hashable label values (duplicates allowed).

    Returns:
        A tuple `(id2label, label2id)` of two dicts that are inverse
        mappings of each other.
    """
    unique_labels = list(dict.fromkeys(labels))
    id2label = dict(enumerate(unique_labels))
    label2id = {name: idx for idx, name in id2label.items()}
    return id2label, label2id

# Load the stopword list
# NOTE(review): both paths below are absolute and machine-specific.
stopwords = get_stopwords(r'C:\Users\HP\OneDrive\桌面\智能信息网络实验\chinese_news_classification\dataset\cn_stopwords.txt')

# Preprocess the dataset into space-joined token strings and raw labels
dataset_path = r'C:\Users\HP\OneDrive\桌面\智能信息网络实验\chinese_news_classification\dataset\dataset.txt'
texts, labels = co_data(dataset_path, stopwords)

# Build the label <-> index mappings
id2label, label2id = co_labeldict(labels)

# Split 60% / 20% / 20% into train / validation / test
train_texts, rest_texts, train_labels, rest_labels = train_test_split(
    texts, labels, test_size=0.4, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    rest_texts, rest_labels, test_size=0.5, random_state=42
)

# Vectorize with CountVectorizer (at most 7000 features); the vocabulary is
# fitted on the training split only and then applied to the other splits
vectorizer = CountVectorizer(max_features=7000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)
X_test = vectorizer.transform(test_texts)

# Convert the labels of every split to their integer ids
y_train, y_val, y_test = (
    [label2id[label] for label in split_labels]
    for split_labels in (train_labels, val_labels, test_labels)
)


100%|██████████| 382688/382688 [00:49<00:00, 7661.57it/s]


In [12]:
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator

class MachineLearningEnsemble(BaseEstimator):
    """Hard-voting ensemble of five scikit-learn classifiers.

    The ensemble keeps two views of the same set of models:

    * `models`: a dict of standalone estimators, fitted directly by `fit`
      so that each one can be evaluated on its own.
    * `voting_classifier`: a `VotingClassifier` (voting='hard') built from
      those estimators and used by `predict` and `score`.

    Args:
        random_state: Optional seed forwarded to every sub-estimator when it
            is constructed. The default `None` leaves each estimator with
            its own unseeded behaviour.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        # Standalone models, keyed by the short name used in the voting classifier.
        self.models = {
            'sgd': SGDClassifier(random_state=random_state),
            'pa': PassiveAggressiveClassifier(random_state=random_state),
            'svc': LinearSVC(random_state=random_state),
            'ridge': RidgeClassifier(random_state=random_state),
            'gb': GradientBoostingClassifier(random_state=random_state)
        }
        # Majority-vote classifier over the same estimators.
        self.voting_classifier = VotingClassifier(
            estimators=list(self.models.items()),
            voting='hard'
        )

    def fit(self, X, y):
        """Fit every standalone model and the voting classifier on (X, y).

        Args:
            X: Training feature matrix.
            y: Training targets.

        Returns:
            self, following the scikit-learn estimator convention so calls
            can be chained.
        """
        # Fit the standalone models so callers can score them individually.
        for model in self.models.values():
            model.fit(X, y)

        # NOTE: VotingClassifier.fit fits its own clones of the estimators,
        # so every model is trained a second time here. The standalone
        # fits above are kept because `models` is inspected directly.
        self.voting_classifier.fit(X, y)
        return self

    def predict(self, X):
        """Return the voting classifier's predictions for X."""
        return self.voting_classifier.predict(X)

    def score(self, X, y):
        """Return the accuracy of the voting classifier's predictions on (X, y)."""
        return accuracy_score(y, self.predict(X))


In [13]:
# Create an instance of the ensemble model
ensemble_model = MachineLearningEnsemble()

# Train the ensemble model on the training split
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble's accuracy on the validation split
val_accuracy = ensemble_model.score(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")


Validation Accuracy: 0.7988


In [14]:
# Optionally evaluate each standalone model on the validation split,
# using each estimator's own score() method
for name, model in ensemble_model.models.items():
    model_accuracy = model.score(X_val, y_val)
    print(f"{name} Validation Accuracy: {model_accuracy:.4f}")

sgd Validation Accuracy: 0.7938
pa Validation Accuracy: 0.7535
svc Validation Accuracy: 0.8002
ridge Validation Accuracy: 0.7919
gb Validation Accuracy: 0.6655


In [15]:
# Evaluate the ensemble's accuracy on the test split
test_accuracy = ensemble_model.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")


Test Accuracy: 0.8008


In [16]:
from joblib import dump, load

# Save the trained ensemble to 'ensemble_model.joblib' in the current working directory
# NOTE(review): joblib files are pickle-based; only load them from trusted sources.
dump(ensemble_model, 'ensemble_model.joblib')

['ensemble_model.joblib']