In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Baseline setup: configure the classifier and TF-IDF vectorizer, load the
# pre-cleaned splits, integer-encode the labels and vectorize the text.
import os

lr_clf = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams and bigrams
    max_features=5000,    # cap on vocabulary size
    min_df=5,             # ignore terms seen in fewer than 5 documents
    max_df=0.7,           # ignore terms seen in more than 70% of documents
)

# Locate the cleaned CSV files and fail early if either one is missing.
train_path, val_path = (
    os.path.join('dataset', csv_name)
    for csv_name in ('twitter_training_cleaned.csv', 'twitter_validation_cleaned.csv')
)
if not (os.path.exists(train_path) and os.path.exists(val_path)):
    raise FileNotFoundError(f"Missing data files. Expected: {train_path} and {val_path}")

train = pd.read_csv(train_path)
val = pd.read_csv(val_path)

# Training split: text feature column and label column.
X_train, y_train = train['processed_text'], train['attitude']

# Validation split (same columns as the training split).
X_val, y_val = val['processed_text'], val['attitude']

# Encode the string labels as integers (libraries such as XGBoost require
# integer labels). The encoder is fitted on the training labels and then
# reused, unchanged, for the validation labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# Fit the vectorizer on the training text only, then apply it to the
# validation text so both matrices share one vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

print(f"训练集特征维度: {X_train_tfidf.shape}")
print(f"测试集特征维度: {X_val_tfidf.shape}")

训练集特征维度: (72097, 5000)
测试集特征维度: (1000, 5000)


In [3]:
# Train the logistic-regression baseline, report validation accuracy and
# persist every artifact needed to run inference later.
import os

import joblib

# Directory that receives the persisted baseline artifacts.
output_dir = 'baseline model'
os.makedirs(output_dir, exist_ok=True)

lr_clf.fit(X_train_tfidf, y_train)
lr_val_accuracy = lr_clf.score(X_val_tfidf, y_val)
print(f"验证集准确率: {lr_val_accuracy:.4f}")

# Save the classifier, the fitted label encoder and the fitted vectorizer.
# The classifiers are trained on integer-encoded labels, so the encoder must
# be stored alongside them to map predictions back to the original label
# strings. The vectorizer dump stays last so the cell's displayed output is
# unchanged.
joblib.dump(lr_clf, os.path.join(output_dir, 'LogisticRegressionModel_1.pkl'))
joblib.dump(label_encoder, os.path.join(output_dir, 'LabelEncoder_1.pkl'))
joblib.dump(tfidf, os.path.join(output_dir, 'TfidfVectorizer_1.pkl'))

验证集准确率: 0.8290


['baseline model/TfidfVectorizer_1.pkl']

In [11]:
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC

# Bagging ensemble of ten LinearSVC models. Each member is fitted on a sample
# of 80% of the training rows, and n_jobs=-1 lets the members be fitted in
# parallel on all available cores.
bagging_svm = BaggingClassifier(
    estimator=LinearSVC(C=1, random_state=42),
    n_estimators=10,
    max_samples=0.8,
    n_jobs=-1,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Accuracy on the validation split.
svm_val_accuracy = bagging_svm.score(X_val_tfidf, y_val)
print(f"Bagging SVM验证集准确率: {svm_val_accuracy:.4f}")

# Persist the fitted ensemble.
joblib.dump(bagging_svm, os.path.join(output_dir, 'SVM_Model.pkl'))

Bagging SVM验证集准确率: 0.8250


['baseline model/SVM_Model.pkl']

In [12]:
from sklearn.svm import SVC

# Bagging ensemble of ten RBF-kernel SVC models, fitted in parallel.
# NOTE: this rebinds `bagging_svm`, so any ensemble previously held under
# that name is no longer reachable through it after this cell runs.
bagging_svm = BaggingClassifier(
    estimator=SVC(kernel='rbf', C=1, gamma='scale', random_state=42),
    n_estimators=10,
    # Row sampling: 80% of the training rows per member, drawn with replacement.
    max_samples=0.8,
    bootstrap=True,
    # Feature sampling: 80% of the features per member, drawn without replacement.
    max_features=0.8,
    bootstrap_features=False,
    random_state=42,
    n_jobs=-1,   # use all available CPU cores
    verbose=1,   # print joblib progress while fitting and predicting
)

# Fit the ensemble on the TF-IDF training matrix.
bagging_svm.fit(X_train_tfidf, y_train)

# Accuracy on the validation split.
svm2_val_accuracy = bagging_svm.score(X_val_tfidf, y_val)
print(f"Bagging SVM RBF验证集准确率: {svm2_val_accuracy:.4f}")

# Persist the fitted ensemble.
joblib.dump(bagging_svm, os.path.join(output_dir, 'Bagging_SVM_RBF_Model.pkl'))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed: 10.1min remaining: 30.2min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed: 17.5min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    4.2s remaining:   12.6s


Bagging SVM RBF验证集准确率: 0.9620


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.5s finished


['baseline model/Bagging_SVM_RBF_Model.pkl']

In [13]:
from sklearn.ensemble import (
    RandomForestClassifier, 
    AdaBoostClassifier, 
    GradientBoostingClassifier
)


In [14]:
# Random forest of 100 trees, fitted on all available cores.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Accuracy on the validation split.
rf_val_accuracy = rf.score(X_val_tfidf, y_val)
print(f"随机森林验证集准确率: {rf_val_accuracy:.4f}")

# Persist the fitted model (only the model is written by this cell).
joblib.dump(rf, os.path.join(output_dir, 'RandomForest_Model.pkl'))

随机森林验证集准确率: 0.9590


['baseline model/RandomForest_Model.pkl']

In [18]:


import xgboost as xgb

In [20]:
# Gradient-boosted trees on the TF-IDF features.
# `use_label_encoder` was removed from this call: the installed XGBoost does
# not recognise it and warned that the parameter is "not used" on every fit.
# The labels are already integer-encoded before they reach this cell.
# NOTE(review): 'logloss' is the binary log-loss metric; if `attitude` has
# more than two classes and an eval_set is added later, 'mlogloss' is
# probably what is wanted - confirm before relying on the metric.
xgboost_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=1,
    max_depth=20,
    tree_method='hist',
    eval_metric='logloss',
    random_state=42,
    verbosity=1,
    n_jobs=-1,
)
xgboost_clf.fit(X_train_tfidf, y_train)

# Accuracy on the validation split.
xgboost_val_accuracy = xgboost_clf.score(X_val_tfidf, y_val)
print(f"XGBoost验证集准确率: {xgboost_val_accuracy:.4f}")

# Persist the fitted model (only the model is written by this cell).
joblib.dump(xgboost_clf, os.path.join(output_dir, 'XGBoost_Model.pkl'))

Parameters: { "use_label_encoder" } are not used.



XGBoost验证集准确率: 0.9320


['baseline model/XGBoost_Model.pkl']

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=6; neighbour search runs on all
# available cores.
knn = KNeighborsClassifier(
    n_neighbors=6,
    n_jobs=-1,
).fit(X_train_tfidf, y_train)

# Accuracy on the validation split.
knn_val_accuracy = knn.score(X_val_tfidf, y_val)
print(f"KNN验证集准确率: {knn_val_accuracy:.4f}")

# Persist the fitted model (only the model is written by this cell).
joblib.dump(knn, os.path.join(output_dir, 'KNN_Model.pkl'))

KNN验证集准确率: 0.9360


['baseline model/KNN_Model.pkl']