In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
def datatrans(data, T):
    n, p = data.shape
    n_sample = n - T
    newdata = np.zeros((n_sample, T, p))
    target = np.zeros((n_sample,))
    for i in range(n_sample):
        newdata[i] = data.iloc[i:i + T]
        target[i] = data.iloc[i + T, 12]
    return newdata, target

In [3]:
def value2class(y):
    y[np.where((y > 0) & (y <= 1))] = 1
    y[np.where((y > 1) & (y <= 4))] = 2
    y[np.where(y > 4)] = 3
    #y[np.where((y > 4) & (y <= 16))] = 3
    return y.astype('int')

In [4]:
def load_data(path):
    data = pd.read_csv(path)
    # data = data.drop('Unnamed: 0')

    N = data.shape[0]
    n_train = round(N * 0.7)
    n_valid = round(N * 0.2)
    n_test = N - n_train - n_valid

    train = data.iloc[:n_train]
    valid = data.iloc[n_train:n_train + n_valid]
    test = data.iloc[n_train + n_valid:]

    X_train, y_train = datatrans(train, T=6)
    X_valid, y_valid = datatrans(valid, T=6)
    X_test, y_test = datatrans(test, T=6)

    # 展开
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_valid = X_valid.reshape(X_valid.shape[0],-1)
    X_test = X_test.reshape(X_test.shape[0], -1)

    # 分类
    y_train = value2class(y_train)
    y_valid = value2class(y_valid)
    y_test = value2class(y_test)

    # normalization
    sr_X = StandardScaler()
    X_train = sr_X.fit_transform(X_train)
    X_valid = sr_X.fit_transform(X_valid)
    X_test = sr_X.fit_transform(X_test)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [5]:
X_train, y_train, X_valid, y_valid, X_test, y_test = load_data('/Users/melody618/Desktop/大作业/降水量预测/code/station_385_small_a.csv')


### Logistic Classifier 

In [6]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_result = lr.predict(X_test)
    
print("The train score:", lr.score(X_train, y_train))
print("The valid score:", lr.score(X_valid, y_valid))
print("The test score:", lr.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.885107354957
The valid score: 0.868550859885
The test score: 0.922892209178
             precision    recall  f1-score   support

          0       0.94      0.99      0.96      3435
          1       0.51      0.16      0.24       243
          2       0.42      0.10      0.16        51
          3       0.11      0.05      0.07        19

avg / total       0.90      0.92      0.90      3748



In [8]:
lr = LogisticRegression(max_iter=1000, class_weight='balanced') # classweight 按照样本数比例生成权重，解决样本不均衡问题
lr.fit(X_train, y_train)
y_result = lr.predict(X_test)
    
print("The train score:", lr.score(X_train, y_train))
print("The valid score:", lr.score(X_valid, y_valid))
print("The test score:", lr.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.867100654789
The valid score: 0.854552726303
The test score: 0.882337246531
             precision    recall  f1-score   support

          0       0.96      0.93      0.95      3435
          1       0.36      0.32      0.34       243
          2       0.19      0.37      0.25        51
          3       0.06      0.26      0.10        19

avg / total       0.90      0.88      0.89      3748



### SGD

In [9]:
sgdc = SGDClassifier()
sgdc.fit(X_train, y_train)
y_result = sgdc.predict(X_test)
    
print("The train score:", sgdc.score(X_train, y_train))
print("The valid score:", sgdc.score(X_valid, y_valid))
print("The test score:", sgdc.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.876351454241
The valid score: 0.855885881882
The test score: 0.887673425827
             precision    recall  f1-score   support

          0       0.94      0.95      0.95      3435
          1       0.23      0.13      0.17       243
          2       0.13      0.20      0.16        51
          3       0.11      0.26      0.16        19

avg / total       0.88      0.89      0.88      3748





In [10]:
sgdc = SGDClassifier(class_weight='balanced')
sgdc.fit(X_train, y_train)
y_result = sgdc.predict(X_test)
    
print("The train score:", sgdc.score(X_train, y_train))
print("The valid score:", sgdc.score(X_valid, y_valid))
print("The test score:", sgdc.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.719430485762
The valid score: 0.714171443807
The test score: 0.714247598719
             precision    recall  f1-score   support

          0       0.95      0.76      0.84      3435
          1       0.07      0.19      0.10       243
          2       0.10      0.39      0.15        51
          3       0.10      0.53      0.17        19

avg / total       0.88      0.71      0.78      3748





### SVM

In [11]:
svc = SVC()
svc.fit(X_train, y_train)
y_result = svc.predict(X_test)
    
print("The train score:", svc.score(X_train, y_train))
print("The valid score:", svc.score(X_valid, y_valid))
print("The test score:", svc.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.901477082382
The valid score: 0.871617117718
The test score: 0.926627534685
             precision    recall  f1-score   support

          0       0.94      0.99      0.96      3435
          1       0.57      0.22      0.32       243
          2       0.58      0.14      0.22        51
          3       0.29      0.11      0.15        19

avg / total       0.91      0.93      0.91      3748



In [18]:
svc = SVC(class_weight='balanced')
svc.fit(X_train, y_train)
y_result = svc.predict(X_test)
    
print("The train score:", svc.score(X_train, y_train))
print("The valid score:", svc.score(X_valid, y_valid))
print("The test score:", svc.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.800137048881
The valid score: 0.760165311292
The test score: 0.716115261473
             precision    recall  f1-score   support

          0       0.98      0.73      0.84      3435
          1       0.17      0.63      0.27       243
          2       0.09      0.35      0.15        51
          3       0.03      0.11      0.04        19

avg / total       0.91      0.72      0.79      3748



In [13]:
svc = SVC(C=10)
svc.fit(X_train, y_train)
y_result = svc.predict(X_test)
    
print("The train score:", svc.score(X_train, y_train))
print("The valid score:", svc.score(X_valid, y_valid))
print("The test score:", svc.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.938480280189
The valid score: 0.873483535529
The test score: 0.919423692636
             precision    recall  f1-score   support

          0       0.94      0.98      0.96      3435
          1       0.45      0.24      0.32       243
          2       0.34      0.22      0.27        51
          3       0.25      0.05      0.09        19

avg / total       0.90      0.92      0.91      3748



In [14]:
svc = SVC(C=10,class_weight='balanced')
svc.fit(X_train, y_train)
y_result = svc.predict(X_test)
    
print("The train score:", svc.score(X_train, y_train))
print("The valid score:", svc.score(X_valid, y_valid))
print("The test score:", svc.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.88750571037
The valid score: 0.797626983069
The test score: 0.783351120598
             precision    recall  f1-score   support

          0       0.96      0.81      0.88      3435
          1       0.19      0.58      0.29       243
          2       0.12      0.22      0.15        51
          3       0.00      0.00      0.00        19

avg / total       0.90      0.78      0.83      3748



### KNN

In [15]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_result = knn.predict(X_test)

print("The train score:", knn.score(X_train, y_train))
print("The valid score:", knn.score(X_valid, y_valid))
print("The test score:", knn.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.906654484544
The valid score: 0.856419144114
The test score: 0.905016008538
             precision    recall  f1-score   support

          0       0.93      0.97      0.95      3435
          1       0.28      0.15      0.20       243
          2       0.27      0.12      0.16        51
          3       0.00      0.00      0.00        19

avg / total       0.88      0.91      0.89      3748



In [16]:
knn = KNeighborsClassifier(n_neighbors=10) # default = 5
knn.fit(X_train, y_train)
y_result = knn.predict(X_test)

print("The train score:", knn.score(X_train, y_train))
print("The valid score:", knn.score(X_valid, y_valid))
print("The test score:", knn.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.894320085275
The valid score: 0.865217970937
The test score: 0.913020277481
             precision    recall  f1-score   support

          0       0.93      0.99      0.96      3435
          1       0.32      0.10      0.16       243
          2       0.17      0.04      0.06        51
          3       0.17      0.05      0.08        19

avg / total       0.88      0.91      0.89      3748



### DT

In [19]:
dtc = DecisionTreeClassifier(max_depth=10)
dtc.fit(X_train, y_train)
y_result = dtc.predict(X_test)
    
print("The train score:", dtc.score(X_train, y_train))
print("The valid score:", dtc.score(X_valid, y_valid))
print("The test score:", dtc.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.927820922796
The valid score: 0.876416477803
The test score: 0.900480256137
             precision    recall  f1-score   support

          0       0.95      0.96      0.95      3435
          1       0.33      0.32      0.32       243
          2       0.36      0.31      0.33        51
          3       0.00      0.00      0.00        19

avg / total       0.90      0.90      0.90      3748



In [20]:
dtc = DecisionTreeClassifier(max_depth=10, class_weight='balanced')
dtc.fit(X_train, y_train)
y_result = dtc.predict(X_test)
    
print("The train score:", dtc.score(X_train, y_train))
print("The valid score:", dtc.score(X_valid, y_valid))
print("The test score:", dtc.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.750380691335
The valid score: 0.704306092521
The test score: 0.726520811099
             precision    recall  f1-score   support

          0       0.97      0.75      0.85      3435
          1       0.18      0.53      0.27       243
          2       0.06      0.35      0.10        51
          3       0.04      0.21      0.06        19

avg / total       0.91      0.73      0.80      3748



In [21]:
dtc = DecisionTreeClassifier(class_weight='balanced')
dtc.fit(X_train, y_train)
y_result = dtc.predict(X_test)
    
print("The train score:", dtc.score(X_train, y_train))
print("The valid score:", dtc.score(X_valid, y_valid))
print("The test score:", dtc.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 1.0
The valid score: 0.826023196907
The test score: 0.853521878335
             precision    recall  f1-score   support

          0       0.95      0.91      0.93      3435
          1       0.20      0.27      0.23       243
          2       0.11      0.20      0.14        51
          3       0.03      0.05      0.03        19

avg / total       0.89      0.85      0.87      3748



### MLP

In [22]:
mlp = MLPClassifier(hidden_layer_sizes=(10,5), max_iter=500)
mlp.fit(X_train, y_train)
y_result = mlp.predict(X_test)
    
print("The train score:", mlp.score(X_train, y_train))
print("The valid score:", mlp.score(X_valid, y_valid))
print("The test score:", mlp.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.897099132024
The valid score: 0.885215304626
The test score: 0.921824973319
             precision    recall  f1-score   support

          0       0.95      0.98      0.97      3435
          1       0.49      0.26      0.34       243
          2       0.29      0.35      0.32        51
          3       0.15      0.11      0.12        19

avg / total       0.91      0.92      0.91      3748



In [23]:
mlp = MLPClassifier(hidden_layer_sizes=(10,20,5), max_iter=500)
mlp.fit(X_train, y_train)
y_result = mlp.predict(X_test)
    
print("The train score:", mlp.score(X_train, y_train))
print("The valid score:", mlp.score(X_valid, y_valid))
print("The test score:", mlp.score(X_test, y_test))
print(classification_report(y_test, y_result))

The train score: 0.898050860362
The valid score: 0.882948940141
The test score: 0.918623265742
             precision    recall  f1-score   support

          0       0.95      0.98      0.96      3435
          1       0.43      0.28      0.34       243
          2       0.31      0.37      0.34        51
          3       0.00      0.00      0.00        19

avg / total       0.90      0.92      0.91      3748



  'precision', 'predicted', average, warn_for)
