In [1]:
import numpy as np
import keras
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer

# Run configuration. max_words caps the vocabulary passed to the loader and to
# the tokenizer; batch_size and epochs are not read anywhere in this cell.
max_words = 1000
batch_size = 32
epochs = 5

# Fetch the Reuters dataset, keeping 20% of it as the test split.
print('Loading data...')
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    test_split=0.2,
    num_words=max_words,
)
print('{} train sequences'.format(len(x_train)))
print('{} test sequences'.format(len(x_test)))
print('check data')
print(x_train[:2])
print(y_train[:2])

# Class count is derived from the largest label value seen in training.
num_classes = y_train.max() + 1
print('{} classes'.format(num_classes))

# Turn each index sequence into a fixed-width binary matrix row.
print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=max_words)
x_train, x_test = (
    tokenizer.sequences_to_matrix(split, mode='binary')
    for split in (x_train, x_test)
)
print('x_train shape: {}'.format(x_train.shape))
print('x_test shape: {}'.format(x_test.shape))
print('check data')
print(x_train[:2])

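# One-hot encoding of the labels is not needed here: the classifiers fitted
# later in this notebook are given the integer class ids in y_train directly.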
# print('Convert class vector to binary class matrix '
#       '(for use with categorical_crossentropy)')
# y_train = keras.utils.to_categorical(y_train, num_classes)
# y_test = keras.utils.to_categorical(y_test, num_classes)
# print('y_train shape:', y_train.shape)
# print('y_test shape:', y_test.shape)
# print(y_train[:2])

Using TensorFlow backend.


Loading data...
8982 train sequences
2246 test sequences
check data
[ [1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 2, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 2, 2, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 2, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]
 [1, 2, 699, 2, 2, 56, 2, 2, 9, 56, 2, 2, 81, 5, 2, 57, 366, 737, 132, 20, 2, 7, 2, 49, 2, 2, 2, 2, 699, 2, 8, 7, 10, 241, 16, 855, 129, 231, 783, 5, 4, 587, 2, 2, 2, 775, 7, 48, 34, 191, 44, 35, 2, 505, 17, 12]]
[3 4]
46 classes
Vectorizing sequence data...
x_train shape: (8982, 1000)
x_test shape: (2246, 1000)
check data
[[ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]]


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Baseline: a 30-tree random forest fitted on the vectorized training matrix
# and the integer class ids in y_train.
# max_features='sqrt' is the setting 'auto' stood for on classifiers; 'auto'
# was deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards, while
# 'sqrt' is accepted by old and new releases alike.
# random_state is pinned so the reports printed by later cells are
# reproducible across kernel restarts.
model = RandomForestClassifier(
    max_features='sqrt',
    n_estimators=30,
    random_state=42,
)
model.fit(x_train, y_train)
print('fit done')

fit done


In [3]:
# Per-class precision/recall/F1 of the current `model` on the training split.
print('train')
pred = model.predict(x_train)
print(metrics.classification_report(y_true=y_train, y_pred=pred))

train
             precision    recall  f1-score   support

          0       0.90      0.96      0.93        55
          1       0.97      0.97      0.97       432
          2       0.99      0.99      0.99        74
          3       0.97      0.98      0.98      3159
          4       0.97      0.97      0.97      1949
          5       1.00      0.94      0.97        17
          6       1.00      1.00      1.00        48
          7       0.94      1.00      0.97        16
          8       0.93      0.93      0.93       139
          9       0.95      0.97      0.96       101
         10       0.97      0.94      0.95       124
         11       0.97      0.96      0.97       390
         12       0.98      0.96      0.97        49
         13       0.97      0.97      0.97       172
         14       0.96      0.88      0.92        26
         15       1.00      1.00      1.00        20
         16       0.96      0.95      0.96       444
         17       1.00      0.97      0

In [4]:
# Per-class precision/recall/F1 of the current `model` on the held-out split.
print('test')
pred = model.predict(x_test)
print(metrics.classification_report(y_true=y_test, y_pred=pred))

test
             precision    recall  f1-score   support

          0       0.64      0.75      0.69        12
          1       0.57      0.80      0.67       105
          2       0.75      0.30      0.43        20
          3       0.90      0.92      0.91       813
          4       0.73      0.89      0.80       474
          5       0.00      0.00      0.00         5
          6       1.00      0.71      0.83        14
          7       1.00      0.33      0.50         3
          8       0.69      0.76      0.72        38
          9       0.77      0.68      0.72        25
         10       0.87      0.90      0.89        30
         11       0.57      0.77      0.65        83
         12       0.60      0.23      0.33        13
         13       0.51      0.51      0.51        37
         14       0.00      0.00      0.00         2
         15       1.00      0.11      0.20         9
         16       0.64      0.73      0.68        99
         17       0.00      0.00      0.

  'precision', 'predicted', average, warn_for)


In [8]:
# Try gradient-boosted trees (XGBoost) on the same features.
from xgboost import XGBClassifier

# 30 boosting rounds, each one fitted on an 80% row subsample.
# NOTE: this rebinds `model`, replacing whatever estimator it held before.
model = XGBClassifier(
    subsample=0.8,
    n_estimators=30,
)
model.fit(x_train, y_train)
print('fit done')

fit done


In [9]:
# Per-class precision/recall/F1 of the current `model` on the training split.
print('train')
pred = model.predict(x_train)
print(metrics.classification_report(y_true=y_train, y_pred=pred))

train
             precision    recall  f1-score   support

          0       0.92      0.89      0.91        55
          1       0.81      0.84      0.82       432
          2       0.76      0.59      0.67        74
          3       0.93      0.92      0.92      3159
          4       0.78      0.92      0.84      1949
          5       1.00      0.18      0.30        17
          6       0.96      0.94      0.95        48
          7       0.92      0.75      0.83        16
          8       0.90      0.61      0.73       139
          9       0.90      0.97      0.93       101
         10       0.88      0.92      0.90       124
         11       0.76      0.85      0.80       390
         12       0.84      0.76      0.80        49
         13       0.77      0.63      0.69       172
         14       0.78      0.96      0.86        26
         15       1.00      0.60      0.75        20
         16       0.76      0.80      0.78       444
         17       0.93      0.67      0

In [10]:
# Per-class precision/recall/F1 of the current `model` on the held-out split.
print('test')
pred = model.predict(x_test)
print(metrics.classification_report(y_true=y_test, y_pred=pred))

test
             precision    recall  f1-score   support

          0       0.90      0.75      0.82        12
          1       0.69      0.74      0.72       105
          2       0.69      0.55      0.61        20
          3       0.92      0.91      0.92       813
          4       0.71      0.88      0.78       474
          5       0.00      0.00      0.00         5
          6       0.93      0.93      0.93        14
          7       0.67      0.67      0.67         3
          8       0.69      0.66      0.68        38
          9       0.77      0.92      0.84        25
         10       0.91      0.97      0.94        30
         11       0.61      0.65      0.63        83
         12       1.00      0.62      0.76        13
         13       0.54      0.54      0.54        37
         14       0.25      0.50      0.33         2
         15       0.67      0.44      0.53         9
         16       0.65      0.82      0.72        99
         17       0.62      0.42      0.

  'precision', 'predicted', average, warn_for)
