In [1]:
from sklearn.datasets import fetch_20newsgroups
from keras.layers import Dropout, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def TFIDF(X_train, X_test,MAX_NB_WORDS=75000):
    """
    Turn training and test documents into dense TF-IDF feature matrices.

    The vectorizer is fitted on X_train only and then applied to X_test,
    so both returned matrices share the same feature columns.

    X_train, X_test : collections of documents handed to TfidfVectorizer
    MAX_NB_WORDS    : passed to TfidfVectorizer as max_features

    Returns the tuple (X_train, X_test) holding the .toarray() results.
    """
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    # Read the column count straight from the matrix: wrapping it in
    # np.array() first would duplicate the whole dense array just to
    # look at its shape.
    print("tf-idf with",str(X_train.shape[1]),"features")
    print(X_train, X_test)
    return (X_train,X_test)

In [3]:
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5, node=512, nLayers=4):
    """
    Build_Model_DNN_Text(shape, nClasses, dropout, node, nLayers)
    Build a fully connected deep neural network for text classification.

    shape    : size of the input feature space (input_dim of the first layer)
    nClasses : number of classes (units of the softmax output layer)
    dropout  : rate passed to every Dropout layer
    node     : number of units in every hidden Dense layer
    nLayers  : number of hidden Dense+Dropout blocks stacked after the
               first one, i.e. the network has nLayers + 1 hidden blocks

    Returns the compiled Sequential model (adam optimizer, accuracy metric,
    sparse_categorical_crossentropy loss).
    """
    model = Sequential()

    # Only the first layer is told the input size; every later layer is
    # fed by the layer added before it.
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))

    for _ in range(nLayers):
        model.add(Dense(node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

### Load text dataset (20newsgroups)

In [4]:
# Fetch the 'train' and 'test' subsets of 20 Newsgroups, in that order.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# .data is what gets vectorised by TFIDF below; .target holds the labels
# used for fitting and for the classification report.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

### Run the DNN and see the results

In [5]:
# Vectorise both splits; TFIDF fits its vocabulary on X_train only and
# reuses it for X_test, and prints the feature count plus both matrices.
X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)

tf-idf with 75000 features
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
# Size the softmax layer from the dataset's own label list rather than a
# hard-coded 20, so the cell stays correct if the label set changes.
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1],
                                 len(newsgroups_train.target_names))
# NOTE(review): the test split is passed as validation data, so the val_*
# figures logged per epoch are computed on the test set.
model_DNN.fit(X_train_tfidf, y_train,
                              validation_data=(X_test_tfidf, y_test),
                              epochs=10,
                              batch_size=128,
                              verbose=2)

Train on 11314 samples, validate on 7532 samples
Epoch 1/10
 - 9s - loss: 2.8053 - acc: 0.0871 - val_loss: 2.3260 - val_acc: 0.2041
Epoch 2/10
 - 8s - loss: 1.7715 - acc: 0.3522 - val_loss: 1.1753 - val_acc: 0.6032
Epoch 3/10
 - 8s - loss: 0.8005 - acc: 0.7057 - val_loss: 0.8167 - val_acc: 0.7681
Epoch 4/10
 - 8s - loss: 0.3260 - acc: 0.8925 - val_loss: 0.8408 - val_acc: 0.7801
Epoch 5/10
 - 8s - loss: 0.1737 - acc: 0.9470 - val_loss: 0.8553 - val_acc: 0.8028
Epoch 6/10
 - 8s - loss: 0.0984 - acc: 0.9721 - val_loss: 0.9327 - val_acc: 0.8031
Epoch 7/10
 - 8s - loss: 0.0801 - acc: 0.9775 - val_loss: 0.9272 - val_acc: 0.8014
Epoch 8/10
 - 8s - loss: 0.0705 - acc: 0.9814 - val_loss: 0.9457 - val_acc: 0.8039
Epoch 9/10
 - 8s - loss: 0.0530 - acc: 0.9848 - val_loss: 0.9696 - val_acc: 0.8059
Epoch 10/10
 - 8s - loss: 0.0413 - acc: 0.9899 - val_loss: 0.9762 - val_acc: 0.8056


<keras.callbacks.History at 0x7fca83660400>

In [7]:
# predict() returns one row of softmax scores per document; the predicted
# label is the index of the largest score. This replaces
# Sequential.predict_classes, which newer Keras/TensorFlow releases no
# longer provide.
predicted = np.argmax(model_DNN.predict(X_test_tfidf), axis=-1)
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.84      0.71      0.77       319
          1       0.69      0.72      0.71       389
          2       0.81      0.60      0.69       394
          3       0.64      0.74      0.69       392
          4       0.64      0.81      0.72       385
          5       0.76      0.77      0.77       395
          6       0.79      0.88      0.83       390
          7       0.88      0.86      0.87       396
          8       0.91      0.95      0.93       398
          9       0.92      0.95      0.93       397
         10       0.97      0.94      0.96       399
         11       0.96      0.89      0.93       396
         12       0.70      0.69      0.69       393
         13       0.93      0.75      0.83       396
         14       0.87      0.89      0.88       394
         15       0.88      0.87      0.88       398
         16       0.73      0.91      0.81       364
         17       0.97      0.80      0.88   

### Recurrent Neural Networks (RNN)
### Gated Recurrent Unit (GRU)
### Long Short-Term Memory (LSTM)

### Convert text to word embeddings (using GloVe):

### Build an RNN Model for Text:

### Convolutional Neural Networks (CNN)