In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# 读取数据
train_df = pd.read_csv("train_text.csv")
test_df = pd.read_csv("test_text.csv")

In [2]:
# 特征提取（TF-IDF）
vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\d+', 
                             ngram_range=(1,3), 
                             max_features=10000,
                    
                            )#ngram_range=(1,n)
X = vectorizer.fit_transform(train_df['文本'].astype(str))
X_test = vectorizer.transform(test_df['文本'].astype(str))
y = train_df['标签']

In [3]:
# 使用GridSearchCV调参
print("🔍 正在进行参数搜索...")
param_grid = {
    'C': [ 0.2,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29, 0.3,0.31,0.32,0.33,0.34,0.35,0.36,0.37,0.38,0.39, 0.4]
}
scorer = make_scorer(f1_score, average='macro')
grid = GridSearchCV(
    LinearSVC(class_weight='balanced', max_iter=10000),
    param_grid=param_grid,
    scoring=scorer,
    cv=10,#cv
    n_jobs=-1,
    verbose=2
)
grid.fit(X, y)

best_C = grid.best_params_['C']
print(f"✅ 最佳 C: {best_C}")
print(f"✅ 对应 F1: {grid.best_score_:.4f}")

🔍 正在进行参数搜索...
Fitting 10 folds for each of 21 candidates, totalling 210 fits
✅ 最佳 C: 0.29
✅ 对应 F1: 0.7973


In [4]:
cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
bag_model = BaggingClassifier(
    estimator=LinearSVC(C=best_C, class_weight='balanced', max_iter=10000),
    n_estimators=30,#n_estimators
    max_samples=0.8,#max_samples
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)

val_scores = cross_val_score(
    bag_model, X, y,
    cv=cv_splitter,
    scoring=scorer,
    n_jobs=-1
)
print(f"\n🎯 Bagging 平均交叉验证 F1: {val_scores.mean():.4f}")


🎯 Bagging 平均交叉验证 F1: 0.8042


In [5]:
# 用全数据拟合模型
bag_model.fit(X, y)

# 预测测试集
test_pred = bag_model.predict(X_test)

# 保存 submission
submission = pd.DataFrame({
    '新闻ID': test_df['新闻ID'],
    '标签': test_pred
})
submission.to_csv("submission.csv", index=False)

[CV] END .............................................C=0.22; total time=   0.3s
[CV] END .............................................C=0.25; total time=   0.3s
[CV] END .............................................C=0.28; total time=   0.4s
[CV] END .............................................C=0.32; total time=   0.3s
[CV] END .............................................C=0.35; total time=   0.3s
[CV] END .............................................C=0.38; total time=   0.3s
[CV] END ..............................................C=0.2; total time=   0.2s
[CV] END .............................................C=0.23; total time=   0.3s
[CV] END .............................................C=0.25; total time=   0.3s
[CV] END .............................................C=0.29; total time=   0.4s
[CV] END .............................................C=0.32; total time=   0.3s
[CV] END .............................................C=0.35; total time=   0.3s
[CV] END ...................