In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.layers.core import Dense, Dropout, Activation

[nltk_data] Downloading package punkt to /home/anna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def prepare_data(filename):
    data = pd.read_csv(filename, sep="\t")
    data = data[['text', 'subj']]
    data['subj'] = data['subj'].apply(lambda subj: subj.split('\\'))
    data = data.sample(n=10000)
    mlb = MultiLabelBinarizer()
    encoded_subjects = pd.DataFrame(mlb.fit_transform(data.pop('subj')), columns=mlb.classes_, index=data.index)
    data = data.join(encoded_subjects)
    return data, mlb.classes_

In [3]:
train, categories = prepare_data('learn.txt')
test, _ = prepare_data('test.txt')

In [4]:
subjects = pd.read_csv('subjects.txt', sep="\t", header=None, names=['code', 'desc_rus', 'description'])[['code', 'description']]

In [5]:
X_train = train.text
X_test = test.text
Y_train = train[categories]
Y_test = test[categories]

In [6]:
xLengths = [len(word_tokenize(x)) for x in X_train]
h = sorted(xLengths)  #sorted lengths
maxLength = h[int(len(h) * 0.70)]

In [7]:
max_vocab_size = 200000
input_tokenizer = Tokenizer(max_vocab_size)
input_tokenizer.fit_on_texts(X_train)
input_vocab_size = len(input_tokenizer.word_index) + 1
print("input_vocab_size:",input_vocab_size)
X_train = np.array(pad_sequences(input_tokenizer.texts_to_sequences(X_train), maxlen=maxLength))
X_test = np.array(pad_sequences(input_tokenizer.texts_to_sequences(X_test), maxlen=maxLength))

input_vocab_size: 51192


In [8]:
embedding_dim = 256
num_categories = len(categories)
pool_length = 4
lstm_output_size = 100
batch_size = 200
nb_epoch = 10

In [12]:
model = Sequential()
model.add(Embedding(input_vocab_size, embedding_dim,input_length = maxLength))
    
model.add(Dropout(0.8713141896816126))
model.add(Conv1D(filters=64,kernel_size=8,
                            padding='valid',
                            activation='relu',
                            strides=1))
model.add(MaxPooling1D(pool_size=pool_length))
model.add(LSTM(lstm_output_size))
model.add(Dense(num_categories))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 144, 256)          13105152  
_________________________________________________________________
dropout_1 (Dropout)          (None, 144, 256)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 137, 64)           131136    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 34, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_1 (Dense)              (None, 17)                1717      
_________________________________________________________________
activation_1 (Activation)    (None, 17)                0         
Total para

In [43]:
model.fit(
            X_train, 
            Y_train, 
            batch_size=batch_size, 
            epochs=nb_epoch,
            verbose=2,
            validation_split=0.1
         )

NameError: name 'train_instances' is not defined

In [14]:
scores = model.evaluate(X_test, Y_test, verbose=0)

In [15]:
scores

[0.1479432078361511, 0.9536529140472412]

In [16]:
predicted = model.predict(X_test)

In [17]:
predicted[0]

array([0.00211349, 0.08069116, 0.03202873, 0.01202524, 0.01524477,
       0.05160558, 0.00356596, 0.0455012 , 0.02886715, 0.07177214,
       0.04146872, 0.03223361, 0.011503  , 0.26273057, 0.01791821,
       0.00116026, 0.11917093], dtype=float32)

In [46]:
for i, row in enumerate(predicted):
    for j, prob in enumerate(row):
        if prob >= 0.5:
            predicted[i][j]=1
        else:
            predicted[i][j]=0

In [47]:
predicted

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [48]:
from sklearn.metrics import classification_report
print(classification_report(np.array(Y_test), predicted, target_names=categories))

              precision    recall  f1-score   support

          00       0.00      0.00      0.00        78
          e1       0.36      0.77      0.49       986
          e2       0.00      0.00      0.00       181
          e3       0.70      0.87      0.77      1465
          e4       0.14      0.45      0.21       308
          e5       0.00      0.00      0.00       343
          e7       0.00      0.00      0.00         5
          e8       0.13      0.81      0.23       302
          e9       0.00      0.00      0.00       212
          f1       0.00      0.00      0.00       374
          f2       0.00      0.00      0.00       199
          f3       0.24      0.16      0.19       616
          f4       0.00      0.00      0.00       116
          f5       0.56      0.79      0.66      2052
          f7       0.86      0.79      0.83      3866
          f8       0.00      0.00      0.00        40
          f9       0.00      0.00      0.00       573

   micro avg       0.53   

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [49]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(np.array(Y_test), predicted, average='weighted')

(0.5206369450960237, 0.6149709798566063, 0.5473430430050416, None)

In [51]:
from sklearn.metrics import f1_score
print('sklearn Macro-F1-Score:', f1_score(np.array(Y_train), np.array(Y_test), average='weighted'))

sklearn Macro-F1-Score: 0.20928447141731724
