In [8]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


In [9]:
def datatrans(data, T):
    n,p = data.shape
    n_sample = n-T
    newdata = np.zeros((n_sample,T,p))
    target = np.zeros((n_sample,))
    for i in range(n_sample):
        newdata[i] = data.iloc[i:i+T]
        target[i] = data.iloc[i+T,12]
    return newdata, target

In [10]:
def value2class(y):
    y[np.where((y>0)&(y<=1))]=1
    y[np.where((y>1)&(y<=4))]=2
    y[np.where(y>4)]=3
    #y[np.where(y>16)]=4
    return y.astype('int')

In [11]:
def load_data(path, T):
    data = pd.read_csv(path)
    data = data.drop(columns=['Unnamed: 0'])

    N = data.shape[0]
    n_train = round(N * 0.7)
    n_valid = round(N * 0.2)
    n_test = N - n_train - n_valid

    train = data.iloc[:n_train]
    valid = data.iloc[n_train:n_train + n_valid]
    test = data.iloc[n_train + n_valid:]

    X_train, y_train = datatrans(train, T)
    X_valid, y_valid = datatrans(valid, T)
    X_test, y_test = datatrans(test, T)

     # 分类
    y_train = value2class(y_train)
    y_valid = value2class(y_valid)
    y_test = value2class(y_test)
    
    return X_train, y_train, X_valid, y_valid, X_test, y_test


def preprocess_xgboost(path, T):
    X_train, y_train, X_valid, y_valid, X_test, y_test = load_data(path, T)
    
    #展开
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_valid = X_valid.reshape(X_valid.shape[0],-1)
    X_test = X_test.reshape(X_test.shape[0], -1)

    # normalization
    sr_X = StandardScaler()
    sr_X = sr_X.fit(X_train)
    X_train = sr_X.transform(X_train)
    X_valid = sr_X.transform(X_valid)
    X_test = sr_X.transform(X_test)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [12]:
def xgboost_classifier_train(params, X_train, y_train):
    clf = xgb.XGBClassifier(params = params)
    clf.fit(X_train, y_train)
    return clf

def xgboost_classifier_predict(clf, X, y):
    y_pre = clf.predict(X)
    print("正确率：{}".format(accuracy_score(y, y_pre)))
    print("分类结果报告：\n", classification_report(y,y_pre))

In [13]:
def main_xgboost(path):
    T = 6
    X_train, y_train, X_valid, y_valid, X_test, y_test = preprocess_xgboost(path, T)
    
    params = {'max_depth':5, 
         'learning_rate':0.01,
         'n_estimators':500,
         'booster':'gbtree', 
         'nthread':-1,
         'gamma':0.1,
         'subsample':0.8,
         'colsample_bytree':0.7,
         'colsample_bylevel':1,
         'silent':False, 
         'reg_alpha':0,
         'reg_lambda':1,
         'min_child_weight':1,
         'scale_pos_weight':1,
         'objective':'multi:softmax',
         'num_class':5}
    xgb_clf = xgboost_classifier_train(params, X_train, y_train)
    
    print('\n/*******训练集实验结果**********/')
    xgboost_classifier_predict(xgb_clf, X_train, y_train)
    
    print('\n/*******验证集实验结果**********/')
    xgboost_classifier_predict(xgb_clf, X_valid, y_valid)
    
    print('\n/*******测试集实验结果**********/')
    xgboost_classifier_predict(xgb_clf, X_test, y_test)
   
path = 'hourly-weather-surface/station_385_small.csv'
main_xgboost(path)

Parameters: { params } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



/*******训练集实验结果**********/
正确率：0.9798994974874372
分类结果报告：
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     23020
           1       0.99      0.78      0.88      2335
           2       1.00      0.97      0.99       629
           3       1.00      1.00      1.00       284

    accuracy                           0.98     26268
   macro avg       0.99      0.94      0.96     26268
weighted avg       0.98      0.98      0.98     26268


/*******验证集实验结果**********/
正确率：0.8861485135315291
分类结果报告：
               precision    recall  f1-score   support

           0       0.91      0.99      0.95      6468
           1       0.55      0.31      0.39       749


### LSTM实现时间序列数据的预测

In [19]:
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
import pandas as pd
import numpy as np



In [74]:
# LSTM网络
class LSTM_Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTM_Classifier, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers)
        self.f = nn.Sequential(nn.Linear(hidden_size, output_size), 
                               nn.Softmax())
        
    def forward(self, x):
        x = torch.tensor(x, dtype=torch.float32)
        output, _ = self.rnn(x)
        seq_len, batch, hidden_size = output.shape
        output = self.f(output[-1,:,:].view(-1, hidden_size))
        return output 

In [105]:
def lstm_train(X_train, y_train, X_valid, y_valid):
    #数据格式处理
    n_train, t, p = X_train.shape
    n_valid = X_valid.shape[0]
    X_train = X_train.reshape(-1,n_train,p)
    X_valid = X_valid.reshape(-1,n_valid,p)
    
    #转为张量
    X_train = torch.from_numpy(X_train).double()
    y_train = torch.from_numpy(y_train) 
   
    X_valid = torch.from_numpy(X_valid).double()
    y_valid = torch.from_numpy(y_valid) 
    
    model = LSTM_Classifier(input_size = p, hidden_size = 20, output_size = 4, num_layers=2)
    print(model)
    Loss = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum = 0.5)

    
    for i in range(100):
        model.zero_grad() #梯度清零
        var_X = Variable(X_train)#.type(torch.FloatTensor)
        var_y = Variable(y_train)#.type(torch.FloatTensor)
        out = model(var_X)
        #print(out.shape, var_y.shape)
        loss = Loss(out, var_y.long().squeeze())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1)%50 ==0:
            print('Epoch: {}, Loss: {:.5f}'.format(i+1, loss.item()))
        '''
            var_X_valid = Variable(X_valid)
            var_y_valid = Variable(y_valid)
            pre_y = model(var_X_valid).data.max()
        '''
    return model
            

In [123]:
def lstm_predict(model, X, y):
    n, t, p = X.shape
    X = X.reshape(-1,n,p)
    X = torch.from_numpy(X).double()
    var_X = Variable(X)
    output = model(var_X)
    y_pre = output.data.max(dim=1)[1]
    y_pre = y_pre.numpy()
    print("正确率：{}".format(accuracy_score(y, y_pre)))
    print("分类结果报告：\n", classification_report(y,y_pre))
    return output
    

In [106]:
T = 6
path = 'hourly-weather-surface/station_385_small.csv'
X_train, y_train, X_valid, y_valid, X_test, y_test = load_data(path, T)
print(X_train.shape)
model = lstm_train(X_train, y_train, X_valid, y_valid)

(26268, 6, 13)
LSTM_Classifier(
  (rnn): LSTM(13, 20, num_layers=2)
  (f): Sequential(
    (0): Linear(in_features=20, out_features=4, bias=True)
    (1): Softmax(dim=None)
  )
)


  # Remove the CWD from sys.path while we load stuff.


Epoch: 50, Loss: 0.89964
Epoch: 100, Loss: 0.87700


In [124]:
pre_y = lstm_predict(model, X_valid, y_valid)

  # Remove the CWD from sys.path while we load stuff.


正确率：0.862285028662845
分类结果报告：
               precision    recall  f1-score   support

           0       0.86      1.00      0.93      6468
           1       0.00      0.00      0.00       749
           2       0.00      0.00      0.00       195
           3       0.00      0.00      0.00        89

    accuracy                           0.86      7501
   macro avg       0.22      0.25      0.23      7501
weighted avg       0.74      0.86      0.80      7501



In [109]:
pre_y.shape

torch.Size([7501, 4])

In [115]:
pre_y.data.max(dim=1)

torch.return_types.max(
values=tensor([0.9804, 0.9804, 0.9804,  ..., 0.9804, 0.9804, 0.9811]),
indices=tensor([0, 0, 0,  ..., 0, 0, 0]))

In [116]:
pre_y = pre_y.data.max(dim=1)[1]
pre_y

tensor([0, 0, 0,  ..., 0, 0, 0])

In [118]:
sum(pre_y.numpy() == y_valid)/len(y_valid)

0.862285028662845

In [122]:
print(classification_report(y_valid ,pre_y.numpy()))

              precision    recall  f1-score   support

           0       0.86      1.00      0.93      6468
           1       0.00      0.00      0.00       749
           2       0.00      0.00      0.00       195
           3       0.00      0.00      0.00        89

    accuracy                           0.86      7501
   macro avg       0.22      0.25      0.23      7501
weighted avg       0.74      0.86      0.80      7501

