In [21]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


In [2]:
def datatrans(data, T):
    n,p = data.shape
    n_sample = n-T
    newdata = np.zeros((n_sample,T,p))
    target = np.zeros((n_sample,))
    for i in range(n_sample):
        newdata[i] = data.iloc[i:i+T]
        target[i] = data.iloc[i+T,12]
    return newdata, target

In [3]:
def value2class(y):
    y[np.where((y>0)&(y<=1))]=1
    y[np.where((y>1)&(y<=4))]=2
    y[np.where(y>4)]=3
    #y[np.where(y>16)]=4
    return y.astype('int')

In [23]:
def load_data(path, T):
    data = pd.read_csv(path)
    data = data.drop(columns=['Unnamed: 0'])

    N = data.shape[0]
    n_train = round(N * 0.7)
    n_valid = round(N * 0.2)
    n_test = N - n_train - n_valid

    train = data.iloc[:n_train]
    valid = data.iloc[n_train:n_train + n_valid]
    test = data.iloc[n_train + n_valid:]

    X_train, y_train = datatrans(train, T)
    X_valid, y_valid = datatrans(valid, T)
    X_test, y_test = datatrans(test, T)

    # 展开
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_valid = X_valid.reshape(X_valid.shape[0],-1)
    X_test = X_test.reshape(X_test.shape[0], -1)

    # 分类
    y_train = value2class(y_train)
    y_valid = value2class(y_valid)
    y_test = value2class(y_test)

    # normalization
    sr_X = StandardScaler()
    sr_X = sr_X.fit(X_train)
    X_train = sr_X.transform(X_train)
    X_valid = sr_X.transform(X_valid)
    X_test = sr_X.transform(X_test)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [38]:
def xgboost_classifier_train(params, X_train, y_train):
    clf = xgb.XGBClassifier(params = params)
    clf.fit(X_train, y_train)
    return clf

def xgboost_classifier_predict(clf, X, y):
    y_pre = clf.predict(X)
    print("正确率：{}".format(accuracy_score(y, y_pre)))
    print("分类结果报告：\n", classification_report(y,y_pre))

In [40]:
def main_xgboost(path):
    T = 6
    X_train, y_train, X_valid, y_valid, X_test, y_test = load_data(path, T)
    
    params = {'max_depth':5, 
         'learning_rate':0.01,
         'n_estimators':500,
         'booster':'gbtree', 
         'nthread':-1,
         'gamma':0.1,
         'subsample':0.8,
         'colsample_bytree':0.7,
         'colsample_bylevel':1,
         'silent':False, 
         'reg_alpha':0,
         'reg_lambda':1,
         'min_child_weight':1,
         'scale_pos_weight':1,
         'objective':'multi:softmax',
         'num_class':5}
    xgb_clf = xgboost_classifier_train(params, X_train, y_train)
    
    print('\n/*******训练集实验结果**********/')
    xgboost_classifier_predict(xgb_clf, X_train, y_train)
    
    print('\n/*******验证集实验结果**********/')
    xgboost_classifier_predict(xgb_clf, X_valid, y_valid)
    
    print('\n/*******测试集实验结果**********/')
    xgboost_classifier_predict(xgb_clf, X_test, y_test)
   
path = 'hourly-weather-surface/station_385_small.csv'
main_xgboost(path)

Parameters: { params } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



/*******训练集实验结果**********/
正确率：0.9798994974874372
分类结果报告：
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     23020
           1       0.99      0.78      0.88      2335
           2       1.00      0.97      0.99       629
           3       1.00      1.00      1.00       284

    accuracy                           0.98     26268
   macro avg       0.99      0.94      0.96     26268
weighted avg       0.98      0.98      0.98     26268


/*******验证集实验结果**********/
正确率：0.8861485135315291
分类结果报告：
               precision    recall  f1-score   support

           0       0.91      0.99      0.95      6468
           1       0.55      0.31      0.39       749
