In [1]:
# 导入必要的库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from imblearn.over_sampling import SMOTE

# 读取处理后的数据
data = pd.read_csv('iBite_table_processed.csv')

# 提取空间数据 (仅使用经纬度)
spatial_features = ['latitude', 'longitude']

# 提取体型数据 (头部和身体的尺寸数据)
morphological_features = ['head.w', 'head.h', 'head.l', 'th.w', 'body.l', 'wing.l', 'head_w_body_l_ratio', 'head_volume', 'wing_body_ratio']

# 提取目标变量
y = data['iBite']

# 标准化特征
scaler = StandardScaler()

# 仅使用空间数据
X_spatial = scaler.fit_transform(data[spatial_features])

# 结合空间数据和体型数据
X_combined = scaler.fit_transform(data[spatial_features + morphological_features])

# 使用 SMOTE 平衡数据
sm = SMOTE(random_state=42)
X_spatial_resampled, y_spatial_resampled = sm.fit_resample(X_spatial, y)
X_combined_resampled, y_combined_resampled = sm.fit_resample(X_combined, y)

# 定义模型
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'LightGBM': lgb.LGBMRegressor(random_state=42),
    'CatBoost': cb.CatBoostRegressor(random_state=42, verbose=0),
    'SVR': SVR(),
    'GPR': GaussianProcessRegressor(kernel=C(1.0) * RBF(length_scale=1.0))
}

# 定义模型参数
params = {
    'RandomForest': {'n_estimators': [100, 200], 'max_depth': [10, 20], 'max_features': ['sqrt', 'log2']},
    'XGBoost': {'n_estimators': [100, 200], 'max_depth': [3, 5], 'learning_rate': [0.1, 0.05], 'subsample': [0.8, 1.0]},
    'LightGBM': {'n_estimators': [100, 200], 'max_depth': [10, 20], 'learning_rate': [0.1, 0.05], 'num_leaves': [31, 50]},
    'CatBoost': {'iterations': [100, 200], 'depth': [6, 8], 'learning_rate': [0.1, 0.05]},
    'SVR': {'kernel': ['rbf'], 'C': [1, 10], 'gamma': ['scale', 'auto']},
    'GPR': {'alpha': [1e-10, 1e-5]}
}

# 函数：模型训练和评估
def train_and_evaluate(X_train, y_train, X_test, y_test, model_name, param_grid):
    model = models[model_name]
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # 最佳模型
    best_model = grid_search.best_estimator_
    print(f"最佳参数 ({model_name}): {grid_search.best_params_}")
    
    # 在测试集上评估
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} 测试集 MSE: {mse:.2f}, R²: {r2:.2f}")
    
    return mse, r2

# 空间数据 vs 体型+空间数据的对比
def compare_models(X_resampled, y_resampled, label):
    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    results = []
    for model_name in models.keys():
        print(f"训练和评估模型 ({label}): {model_name}")
        mse, r2 = train_and_evaluate(X_train, y_train, X_test, y_test, model_name, params[model_name])
        results.append({'Model': model_name, 'MSE': mse, 'R²': r2})
    
    return pd.DataFrame(results)

# 使用仅空间数据进行训练
print("\n--- 仅空间数据 ---")
spatial_results = compare_models(X_spatial_resampled, y_spatial_resampled, "仅空间数据")

# 使用体型+空间数据进行训练
print("\n--- 空间+体型数据 ---")
combined_results = compare_models(X_combined_resampled, y_combined_resampled, "体型+空间数据")

# 比较结果
print("\n--- 结果对比 ---")
print("仅使用空间数据的结果：")
print(spatial_results)
print("结合体型和空间数据的结果：")
print(combined_results)



--- 仅空间数据 ---
训练和评估模型 (仅空间数据): RandomForest
最佳参数 (RandomForest): {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 200}
RandomForest 测试集 MSE: 246427.35, R²: 0.50
训练和评估模型 (仅空间数据): XGBoost
最佳参数 (XGBoost): {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
XGBoost 测试集 MSE: 242952.43, R²: 0.50
训练和评估模型 (仅空间数据): LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 722, number of used features: 2
[LightGBM] [Info] Start training from score 1367.671745
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 722, number of used features: 2
[LightGBM] [Info] Start training from score 1367.671745
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] 

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


最佳参数 (GPR): {'alpha': 1e-05}
GPR 测试集 MSE: 282386.47, R²: 0.42

--- 空间+体型数据 ---
训练和评估模型 (体型+空间数据): RandomForest
最佳参数 (RandomForest): {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 200}
RandomForest 测试集 MSE: 272923.50, R²: 0.44
训练和评估模型 (体型+空间数据): XGBoost
最佳参数 (XGBoost): {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
XGBoost 测试集 MSE: 275447.42, R²: 0.44
训练和评估模型 (体型+空间数据): LightGBM

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000701 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 723, number of used features: 2
[LightGBM] [Info] Start training from score 1364.551867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000867 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 162
[LightGBM] [Info] Number of data points

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)





ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2297
[LightGBM] [Info] Number of data points in the train set: 723, number of used features: 11
[LightGBM] [Info] Start training from score 1364.551867


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)



最佳参数 (GPR): {'alpha': 1e-10}
GPR 测试集 MSE: 825071.84, R²: -0.68

--- 结果对比 ---
仅使用空间数据的结果：
          Model            MSE        R²
0  RandomForest  246427.353917  0.497702
1       XGBoost  242952.432309  0.504785
2      LightGBM  249560.682087  0.491316
3      CatBoost  239630.114150  0.511557
4           SVR  478021.202737  0.025640
5           GPR  282386.474265  0.424406
结合体型和空间数据的结果：
          Model            MSE        R²
0  RandomForest  272923.502188  0.443695
1       XGBoost  275447.420277  0.438550
2      LightGBM  246750.536945  0.497044
3      CatBoost  271040.066846  0.447534
4           SVR  475027.007978  0.031743
5           GPR  825071.843520 -0.681760
