In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from imblearn.over_sampling import SMOTE
import warnings

# Helper that records the warnings emitted while the XGBoost model is tuned
def catch_xgboost_warnings():
    """Run the XGBoost grid search and report every warning it raises.

    Relies on the module-level ``xgb_model`` and ``xgb_param_grid`` being
    defined before the call. Any exception raised by training is printed
    rather than propagated, so the recorded warnings are still reported.

    Each recorded warning is printed; the message containing
    "A worker stopped while some jobs were given to the executor" is
    reported as a Loky warning, everything else as "Other warning".
    """
    with warnings.catch_warnings(record=True) as caught_warnings:
        # Configure the filter *inside* the context: catch_warnings restores
        # the previous filters on exit, so "always" does not leak into the
        # rest of the process.
        warnings.simplefilter("always")
        try:
            # Train the XGBoost model
            xgb_model.tune_and_train(xgb_param_grid)
        except Exception as e:
            # Best-effort: keep going so captured warnings are still shown
            print(f"Exception occurred: {e}")

    # The recorded list stays valid after the context has been closed
    for warning in caught_warnings:
        if "A worker stopped while some jobs were given to the executor" in str(warning.message):
            print(f"Caught Loky warning: {warning.message}")
        else:
            print(f"Other warning: {warning.message}")


class ModelTrainer:
    """Common data preparation and evaluation logic for the classifiers.

    On construction the frame is reduced to the requested feature columns
    and an integer-cast target, split into train and test partitions, and
    the features are standardized with a scaler fitted on the training
    partition only.
    """

    def __init__(self, df, features, target, test_size=0.2, random_state=42):
        """Store the configuration, then split and scale the data.

        Args:
            df: Frame holding both the feature columns and the target column.
            features: Column label(s) used as model inputs.
            target: Column label of the target; its values are cast to int.
            test_size: Passed through to ``train_test_split``.
            random_state: Seed for the split (and for SMOTE in ``oversample``).
        """
        self.df = df
        self.features = features
        self.target = target
        self.test_size = test_size
        self.random_state = random_state

        # Input features and target variable
        self.X = self.df[features]
        self.y = self.df[target].astype(int)

        # Split into training and test partitions
        partitions = train_test_split(
            self.X,
            self.y,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        self.X_train, self.X_test, self.y_train, self.y_test = partitions

        # Standardize features; the scaler is fitted on the training data only
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

    def train_and_evaluate(self, model, param_grid=None):
        """Fit ``model`` and print its test-set report and accuracy.

        When ``param_grid`` is truthy, a 5-fold accuracy-scored grid search
        is run on the scaled training data and its best estimator is
        evaluated; otherwise ``model`` is fitted directly.
        """
        if not param_grid:
            model.fit(self.X_train_scaled, self.y_train)
            fitted = model
        else:
            search = GridSearchCV(
                estimator=model,
                param_grid=param_grid,
                cv=5,
                scoring='accuracy',
                n_jobs=-1,
            )
            search.fit(self.X_train_scaled, self.y_train)
            print(f"Best parameters: {search.best_params_}")
            fitted = search.best_estimator_

        # Predict on the held-out partition and report
        predictions = fitted.predict(self.X_test_scaled)
        print("Classification Report:")
        print(classification_report(self.y_test, predictions))
        print("Accuracy:", accuracy_score(self.y_test, predictions))

    def oversample(self):
        """Apply SMOTE to the scaled training data.

        The resampled arrays are stored on the instance as
        ``X_train_resampled`` / ``y_train_resampled`` and also returned.
        """
        sampler = SMOTE(random_state=self.random_state)
        resampled = sampler.fit_resample(self.X_train_scaled, self.y_train)
        self.X_train_resampled, self.y_train_resampled = resampled
        return self.X_train_resampled, self.y_train_resampled

class LogisticRegressionModel(ModelTrainer):
    """ModelTrainer specialised for a logistic-regression classifier."""

    def __init__(self, df, features, target, test_size=0.2, random_state=42):
        """Prepare the data via the base class and create the estimator."""
        super().__init__(
            df,
            features,
            target,
            test_size=test_size,
            random_state=random_state,
        )
        self.model = LogisticRegression(solver='lbfgs', max_iter=2000)

    def tune_and_train(self, param_grid):
        """Tune the estimator over ``param_grid`` and print the evaluation."""
        self.train_and_evaluate(self.model, param_grid=param_grid)

class KNNModel(ModelTrainer):
    """ModelTrainer specialised for a k-nearest-neighbours classifier."""

    def __init__(self, df, features, target, test_size=0.2, random_state=42):
        """Prepare the data via the base class and create the estimator."""
        super().__init__(
            df,
            features,
            target,
            test_size=test_size,
            random_state=random_state,
        )
        self.model = KNeighborsClassifier()

    def tune_and_train(self, param_grid):
        """Tune the estimator over ``param_grid`` and print the evaluation."""
        self.train_and_evaluate(self.model, param_grid=param_grid)

class XGBoostModel(ModelTrainer):
    """ModelTrainer specialised for an XGBoost classifier."""

    def __init__(self, df, features, target, test_size=0.2, random_state=42):
        """Prepare the data via the base class and create the estimator."""
        super().__init__(
            df,
            features,
            target,
            test_size=test_size,
            random_state=random_state,
        )
        self.model = xgb.XGBClassifier(eval_metric='mlogloss')

    def tune_and_train(self, param_grid):
        """Tune the estimator over ``param_grid`` and print the evaluation."""
        self.train_and_evaluate(self.model, param_grid=param_grid)

# Driver: load the data, derive the target class and train each model
df = pd.read_excel('data.xlsx')
bins = [0, 50, 100, 150, 200, 300, 500]
labels = ['0', '1', '2', '3', '4', '5']
# include_lowest=True keeps an AQI of exactly 0 in the first bin; without it
# that value falls outside (0, 50] and becomes NaN, which then breaks the
# integer cast of the target inside ModelTrainer.
df['aqi_rank'] = pd.cut(
    df['aqi'], bins=bins, labels=labels, right=True, include_lowest=True
)
# Convert the timestamp column itself (the original wrote the parsed values
# to a misspelled 'pbtime' column and indexed on the unconverted one).
df['pubtime'] = pd.to_datetime(df['pubtime'])
df.set_index('pubtime', inplace=True)

features = ['so2_24h', 'no2_24h', 'co_24h', 'o3_24h', 'pm2_5_24h', 'pm10_24h']
target = 'aqi_rank'


# Logistic regression
log_reg_model = LogisticRegressionModel(df, features, target)
log_reg_param_grid = [
    {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l2'], 'solver': ['liblinear']},
    {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l2'], 'solver': ['lbfgs']},
    {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]}
]
log_reg_model.tune_and_train(log_reg_param_grid)

# KNN
knn_model = KNNModel(df, features, target)
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}
knn_model.tune_and_train(knn_param_grid)



# XGBoost
xgb_model = XGBoostModel(df, features, target)
xgb_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
# catch_xgboost_warnings() already runs xgb_model.tune_and_train(xgb_param_grid);
# calling it a second time directly would repeat the whole grid search.
catch_xgboost_warnings()


Best parameters: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87        30
           1       0.86      0.89      0.87        81
           2       0.87      0.80      0.84        41
           3       0.89      0.89      0.89        27
           4       0.90      0.95      0.93        20
           5       1.00      0.89      0.94         9

    accuracy                           0.88       208
   macro avg       0.90      0.88      0.89       208
weighted avg       0.88      0.88      0.87       208

Accuracy: 0.875
Best parameters: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'}
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.90      0.77        30
           1       0.80      0.83      0.81        81
           2       0.84      0.63      0.72        41
           3       0.69      0.