In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers.recurrent import GRU
from keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


[nltk_data] Downloading package punkt to /home/anna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def prepare_data(filename, n_samples=10000, classes=None, random_state=None):
    r"""Load a tab-separated corpus and multi-hot encode its subject labels.

    The file must have a header row containing at least a ``text`` column and
    a ``subj`` column; each ``subj`` value holds one or more subject codes
    joined by a backslash (``\``).

    Parameters
    ----------
    filename : str or path-like
        Path of the tab-separated file to read.
    n_samples : int or None, default 10000
        Number of rows to draw at random (without replacement). The value is
        capped at the number of rows available, so short files no longer
        raise. ``None`` keeps every row in file order.
    classes : sequence of str or None, default None
        Fixed label vocabulary and column order for the encoded output. Pass
        the classes returned for the training file when preparing held-out
        data so both frames get identical label columns. With ``None`` the
        vocabulary is learned from the sampled rows.
    random_state : int, numpy RandomState or None, default None
        Seed forwarded to ``DataFrame.sample`` for a repeatable draw.

    Returns
    -------
    data : pandas.DataFrame
        The ``text`` column followed by one 0/1 indicator column per class.
    classes : numpy.ndarray
        The label names, in the same order as the indicator columns.
    """
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the parsed file (SettingWithCopyWarning).
    data = pd.read_csv(filename, sep="\t")[['text', 'subj']].copy()
    if n_samples is not None:
        # Sample first so only the retained rows have their labels split.
        data = data.sample(n=min(n_samples, len(data)), random_state=random_state)
    data['subj'] = data['subj'].apply(lambda subj: subj.split('\\'))
    mlb = MultiLabelBinarizer(classes=classes)
    encoded_subjects = pd.DataFrame(
        mlb.fit_transform(data.pop('subj')),
        columns=mlb.classes_,
        index=data.index,
    )
    return data.join(encoded_subjects), mlb.classes_

In [3]:
train, categories = prepare_data('learn.txt')
test, _ = prepare_data('test.txt')
# Each prepare_data call fits its own label binarizer on its own random
# sample, so the test frame's label columns may differ from the training
# ones. Re-align the test frame to the training labels: a label missing from
# the test sample becomes an all-zero column, and a label never seen in
# training is dropped because the model has no output for it.
test = test.reindex(columns=['text'] + list(categories), fill_value=0)
print('Categoreis: {}'.format(categories))

Categoreis: ['00' 'e1' 'e2' 'e3' 'e4' 'e5' 'e7' 'e8' 'e9' 'f1' 'f2' 'f3' 'f4' 'f5'
 'f7' 'f8' 'f9']


In [4]:
# Subject-code lookup table. The file has no header row, so the three column
# names are supplied here; only `code` and `description` are kept.
subjects = pd.read_csv(
    'subjects.txt',
    sep="\t",
    header=None,
    names=['code', 'desc_rus', 'description'],
).loc[:, ['code', 'description']]
subjects.head()

Unnamed: 0,code,description
0,e1,COMPUTERS; ELECTRONICS
1,e2,ASTRONOMY
2,e3,BIOLOGY; MEDICAL SCIENCES
3,e4,GEOGRAPHY; GEOPHYSICS
4,e5,GEOLOGY; EARTH SCIENCES; MINES AND MINING INDU...


In [5]:
# Inputs are the raw text column; targets are the per-label indicator columns.
X_train, X_test = train['text'], test['text']
Y_train, Y_test = train[categories], test[categories]

In [6]:
# Token count of every training text, using NLTK's word tokenizer.
# NOTE(review): the sequences fed to the model are produced later by the Keras
# Tokenizer, which may split text differently - confirm these lengths are a
# good proxy for the padded sequence length.
xLengths = list(map(lambda text: len(word_tokenize(text)), X_train))
h = sorted(xLengths)  # ascending lengths; the next cell indexes into this list
maxLength = h[-1]
print("max input length is: ",maxLength)

max input length is:  612


In [7]:
# Replace the absolute maximum with the length found 70% of the way through
# the sorted list, so padding is not dictated by a few very long texts.
cutoff_index = int(len(h) * 0.70)
maxLength = h[cutoff_index]
print("70% cover input sequence length up to",maxLength)

70% cover input sequence length up to 145


In [8]:
# Upper bound on the number of distinct word indices the tokenizer may emit.
max_vocab_size = 200000
input_tokenizer = Tokenizer(num_words=max_vocab_size)
input_tokenizer.fit_on_texts(X_train)
# word_index lists every word seen during fitting, but texts_to_sequences only
# emits indices below num_words. The embedding therefore never needs more than
# max_vocab_size rows; the +1 accounts for index 0, which is never assigned to
# a word and is used as the padding value.
input_vocab_size = min(max_vocab_size, len(input_tokenizer.word_index) + 1)
print("input_vocab_size:",input_vocab_size)
# pad_sequences already returns a NumPy array, so no extra np.array copy.
totalX = pad_sequences(input_tokenizer.texts_to_sequences(X_train), maxlen=maxLength)

input_vocab_size: 51644


In [9]:
embedding_dim = 256
num_categories = len(categories)

def create_model(units=256, dropout=0.9):
    """Build and compile the two-layer GRU multi-label classifier.

    The network is Embedding -> GRU (full sequence) -> GRU (last state) ->
    Dense with one sigmoid output per category. It is compiled with binary
    cross-entropy, so each label is scored as an independent yes/no decision.
    It reads the notebook globals ``input_vocab_size``, ``embedding_dim``,
    ``maxLength`` and ``num_categories``.

    Parameters
    ----------
    units : int, default 256
        Width of each GRU layer.
    dropout : float, default 0.9
        Value passed as ``dropout`` to both GRU layers.
        NOTE(review): Keras treats this as the fraction of input units to
        DROP, so 0.9 discards 90% of them. If a keep-probability of 0.9 was
        intended, the value should be 0.1 - confirm.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model = Sequential()
    model.add(Embedding(input_vocab_size, embedding_dim, input_length=maxLength))
    model.add(GRU(units, dropout=dropout, return_sequences=True))
    model.add(GRU(units, dropout=dropout))
    model.add(Dense(num_categories, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()
    return model

In [10]:
# Seed NumPy's global random generator so NumPy-driven randomness in the cells
# below is repeatable from run to run.
seed = 7
np.random.seed(seed=seed)

In [11]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [12]:
# Wrap the model factory in the scikit-learn estimator interface so it can be
# handed to GridSearchCV; verbose=0 silences Keras' per-epoch progress output.
model = KerasClassifier(
    build_fn=create_model,
    verbose=0,
)

In [None]:
# Hyper-parameter grid: every batch_size x epochs combination is fitted.
batch_size = [ 80, 100]
epochs = [10, 50]
param_grid = {'batch_size': batch_size, 'epochs': epochs}
# n_jobs=1 runs the candidate fits one at a time in this process.
grid = GridSearchCV(model, param_grid, n_jobs=1)
grid_result = grid.fit(totalX, Y_train)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 145, 256)          13220864  
_________________________________________________________________
gru_1 (GRU)                  (None, 145, 256)          393984    
_________________________________________________________________
gru_2 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dense_1 (Dense)              (None, 17)                4369      
Total params: 14,013,201
Trainable params: 14,013,201
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Report the winning configuration, then one line per candidate with its mean
# cross-validated score, the score's standard deviation and its parameters.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [11]:
# Build a fresh plain Keras model for the final fit. On a clean top-to-bottom
# run `model` would otherwise still be the KerasClassifier wrapper created for
# the grid search, which has no evaluate() method and whose predict() returns
# class labels rather than the per-label probabilities thresholded below.
model = create_model()
model.fit(totalX, Y_train, validation_split=0.1, batch_size=128, epochs=10)

Train on 9000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb0d98ebd68>

In [12]:
# Encode the test texts with the already-fitted tokenizer, then pad/truncate
# them to the same length used for the training matrix.
test_sequences = input_tokenizer.texts_to_sequences(X_test)
totalX_test = np.array(pad_sequences(test_sequences, maxlen=maxLength))

In [None]:
# Score the fitted model on the held-out test matrix without progress output.
scores = model.evaluate(x=totalX_test, y=Y_test, verbose=0)

In [None]:
# Display the evaluation result computed in the previous cell.
# NOTE(review): presumably [loss, accuracy], matching the loss and metric
# passed to model.compile - confirm.
scores

In [13]:
# Run the fitted model over the padded test matrix.
predicted = model.predict(x=totalX_test)

In [14]:
# Inspect the model output for the first test text: one value per category.
predicted[0]

array([0.00145262, 0.05488734, 0.01588299, 0.06202881, 0.01076351,
       0.07601661, 0.00561291, 0.01605309, 0.1513212 , 0.12173117,
       0.12067848, 0.03418665, 0.00514579, 0.11736965, 0.03171026,
       0.00121652, 0.27438316], dtype=float32)

In [15]:
# Turn per-label probabilities into hard 0/1 decisions: a label is assigned
# when its probability reaches the threshold. This is one vectorised
# comparison instead of a Python loop over every element; the original dtype
# is kept so the result is still a float array of 0.0 / 1.0.
threshold = 0.2
predicted = (predicted >= threshold).astype(predicted.dtype)

In [16]:
# Display the thresholded 0/1 prediction matrix.
predicted

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 1.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [17]:
from sklearn.metrics import classification_report

# Per-label precision / recall / F1 of the thresholded predictions against the
# true indicator matrix, with rows named after the categories.
y_true = np.array(Y_test)
print(classification_report(y_true, predicted, target_names=categories))

              precision    recall  f1-score   support

          00       0.00      0.00      0.00        79
          e1       0.43      0.69      0.53       940
          e2       0.14      0.07      0.09       178
          e3       0.69      0.85      0.76      1403
          e4       0.29      0.52      0.37       269
          e5       0.60      0.05      0.09       321
          e7       0.00      0.00      0.00         9
          e8       0.21      0.73      0.32       275
          e9       0.03      0.00      0.01       201
          f1       0.23      0.21      0.22       355
          f2       0.00      0.00      0.00       173
          f3       0.23      0.32      0.27       654
          f4       0.00      0.00      0.00       112
          f5       0.57      0.87      0.68      2175
          f7       0.76      0.91      0.82      3971
          f8       0.00      0.00      0.00        24
          f9       0.24      0.17      0.20       561

   micro avg       0.55   

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [18]:
from sklearn.metrics import precision_recall_fscore_support

# Single summary over all labels, each label weighted by its support.
precision_recall_fscore_support(
    np.array(Y_test),
    predicted,
    average='weighted',
)

(0.5406510742857371, 0.6894871794871795, 0.5916607989219517, None)