In [2]:
# Colab-specific cell: relies on the google.colab package, so it will not run
# in a plain Jupyter kernel.
from google.colab import files
# Upload the input file into the runtime; the recorded output of this cell
# shows nigeria_data.csv being saved under the same name.
# NOTE(review): `uloaded` looks like a typo for `uploaded`. The name is not
# referenced by any cell shown here, but confirm before renaming.
uloaded = files.upload()


Saving nigeria_data.csv to nigeria_data.csv


In [3]:
import pandas as pd

# Read the uploaded dataset from the working directory into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index; confirm whether it should be
# dropped or read with index_col=0.
DATA_PATH = 'nigeria_data.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first ten rows (rendered as the cell output).
df.head(10)

Unnamed: 0,iu_code,year,total_population,endemicity,latitude,longitude,jan_precip,feb_precip,mar_precip,apr_precip,...,oct_temp,oct_max_temp,nov_temp,nov_max_temp,dec_temp,dec_max_temp,ann_avg_temp,ann_max_temp,previous_five_year_outbreak,outbreak_indicator
0,NGA0377536337,2014,0.310159,"[1, 0]",5.333333,7.316667,0.555084,0.614201,0.557491,0.603998,...,0.440952,0.241636,0.5379,0.355322,0.75222,0.356712,0.485207,0.16129,0.0,100.0
1,NGA0377536337,2015,0.319468,"[1, 0]",5.333333,7.316667,0.200301,0.630722,0.679992,0.52697,...,0.448571,0.257745,0.538813,0.317841,0.545601,0.221719,0.453254,0.152161,1.0,100.0
2,NGA0377536337,2016,0.32585,"[1, 0]",5.333333,7.316667,0.379128,0.392994,0.713262,0.598957,...,0.466667,0.273854,0.578995,0.282609,0.719128,0.358974,0.501775,0.240414,1.0,100.0
3,NGA0377536337,2017,0.332236,"[1, 0]",5.333333,7.316667,0.453054,0.320473,0.675225,0.661074,...,0.46381,0.267038,0.526941,0.298351,0.74092,0.319005,0.493491,0.168594,1.0,28.0
4,NGA0377536337,2018,0.338619,"[1, 0]",5.333333,7.316667,0.200301,0.800813,0.63644,0.556822,...,0.464762,0.26456,0.578995,0.351574,0.719128,0.353695,0.476923,0.202678,1.0,30.0
5,NGA0377536337,2019,0.334447,"[1, 0]",5.333333,7.316667,0.607535,0.535815,0.548895,0.427738,...,0.407619,0.218711,0.569863,0.307346,0.706215,0.351433,0.499408,0.142422,1.0,31.0
6,NGA0377536337,2020,0.340339,"[1, 0]",5.333333,7.316667,0.0,0.247952,0.732311,0.624532,...,0.440952,0.265799,0.565297,0.32009,0.766747,0.352187,0.492308,0.178332,1.0,100.0
7,NGA0377536337,2021,0.346948,"[1, 0]",5.333333,7.316667,0.607535,0.348044,0.602391,0.681648,...,0.470476,0.30855,0.540639,0.266867,0.682002,0.351433,0.494675,0.148509,1.0,30.0
8,NGA0377536337,2022,0.353602,"[1, 0]",5.333333,7.316667,0.803443,0.247952,0.582878,0.665766,...,0.482857,0.293061,0.548858,0.311844,0.686037,0.33635,0.469822,0.269629,1.0,33.0
9,NGA0377536337,2023,0.359985,"[0, 1]",5.333333,7.316667,0.354783,0.552862,0.665183,0.586844,...,0.512381,0.325279,0.568037,0.305097,0.76594,0.424585,0.52071,0.192331,1.0,0.0


In [5]:
# 1. Feature groups: every monthly climate variable uses the same month prefixes.
MONTHS = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
climate_precip = [f"{month}_precip" for month in MONTHS]
climate_temp = [f"{month}_temp" for month in MONTHS]
climate_max_temp = [f"{month}_max_temp" for month in MONTHS]
yearly_climate = ['ann_avg_temp', 'ann_max_temp']
other_features = ['latitude', 'longitude', 'total_population', 'previous_five_year_outbreak']

# 2. Merge all groups into one ordered list of column names.
features = [
    *climate_precip,
    *climate_temp,
    *climate_max_temp,
    *yearly_climate,
    *other_features,
]

# 3. Build X and y: a row is labelled "outbreak" (1) when outbreak_indicator >= 50,
#    otherwise 0.
X = df[features]
y = df['outbreak_indicator'].ge(50).astype(int)  # boolean mask -> 0/1 labels


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed seed and the class
# balance of y preserved in both parts.
# NOTE(review): rows are split individually, and the data preview shows many
# yearly rows for the same iu_code, so one unit can end up in both train and
# test. Confirm whether a group- or time-based split is intended.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [11]:
from xgboost import XGBClassifier

# Weight the positive (outbreak) class by the negative/positive ratio of the
# training labels. The previous hard-coded 1218/330 matches the class supports
# reported for the test split, and would silently go stale whenever the data,
# the label threshold or the split changes.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    # Fail fast instead of passing an infinite weight to the booster.
    raise ValueError("y_train contains no positive (outbreak) samples")

model = XGBClassifier(
    n_estimators=100,
    scale_pos_weight=n_negative / n_positive,
    eval_metric='logloss',  # set explicitly to avoid a warning
)

# Fit on the training split; the fitted estimator is shown as the cell output.
model.fit(X_train, y_train)


In [12]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the baseline model on the held-out rows.
y_pred = model.predict(X_test)

# Print the confusion matrix first, then the per-class precision/recall/F1 table.
baseline_confusion = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print(baseline_confusion)
print(baseline_report)


[[1103  115]
 [  93  237]]
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1218
           1       0.67      0.72      0.70       330

    accuracy                           0.87      1548
   macro avg       0.80      0.81      0.80      1548
weighted avg       0.87      0.87      0.87      1548



In [13]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


In [14]:
# Hyper-parameter grid for the XGBoost search: 2 * 3 * 2 * 3 = 36 combinations.
# NOTE(review): 1218 and 330 match the per-class supports printed in the
# classification reports; confirm they are meant as the negative/positive ratio.
class_ratio = 1218 / 330
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.05, 0.1],
    scale_pos_weight=[class_ratio, 4.0, 6.0],  # try different class weights
)


In [15]:
# Base estimator whose hyper-parameters are filled in by the grid search.
xgb = XGBClassifier(eval_metric='logloss')  # avoids a warning

search_options = dict(
    scoring='f1',  # optimise F1-score because of the outbreak-class imbalance
    cv=5,          # 5-fold cross-validation
    n_jobs=-1,     # run fits in parallel
    verbose=1,
)
grid_search = GridSearchCV(xgb, param_grid, **search_options)

# Run the search on the training split only; the fitted search object is
# shown as the cell output.
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [16]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out rows with the best estimator found by the grid search.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Same two summaries as for the baseline model: confusion matrix, then report.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred_best))


[[1082  136]
 [  73  257]]
              precision    recall  f1-score   support

           0       0.94      0.89      0.91      1218
           1       0.65      0.78      0.71       330

    accuracy                           0.86      1548
   macro avg       0.80      0.83      0.81      1548
weighted avg       0.88      0.86      0.87      1548

