In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
import gensim
from sklearn.decomposition import FactorAnalysis
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score,precision_recall_curve, roc_auc_score, f1_score, fbeta_score, auc

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np

data = pd.read_excel('clas_data.xlsx').drop('Unnamed: 0', axis='columns')

In [3]:
%%time

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data['clas'])
data['clas'] = le.transform(data['clas'])

CPU times: user 5.21 ms, sys: 114 µs, total: 5.33 ms
Wall time: 5.25 ms


In [4]:
%%time

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['summ_lemma'], data['clas'], test_size=0.12, stratify=data['clas'])

CPU times: user 5.07 ms, sys: 102 µs, total: 5.18 ms
Wall time: 5.02 ms


In [22]:
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 200
VOCAB_SIZE = 80000

tokenizer = Tokenizer(num_words=VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(data['summ_lemma'].tolist())
training_sequences = tokenizer.texts_to_sequences(X_train.tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)
from navec import Navec

path = 'navec_news_v1_1B_250K_300d_100q.tar'
word2vec = Navec.load(path)
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

test_sequences = tokenizer.texts_to_sequences(X_test.tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels_train = to_categorical(np.asarray(y_train), num_classes=len(le.classes_), dtype='float64')
labels_test = to_categorical(np.asarray(y_test), num_classes=len(le.classes_), dtype='float64')

Found 23954 unique tokens.
(23955, 300)


In [23]:
from tensorflow.keras.layers import Dense, Input, Flatten, Dropout, concatenate
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=trainable)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    #https://arxiv.org/abs/1408.5882
    convs = []
    filter_sizes = [3,4,5]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(pool_size=3)(l_conv)
        convs.append(l_pool)

    l_merge = concatenate(convs,axis=1)

    conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
    pool = MaxPooling1D(pool_size=3)(conv)

    if extra_conv==True:
        x = Dropout(0.5)(l_merge)  
    else:
        # Original
        x = Dropout(0.5)(pool)
    x = Flatten()(x)
    x = Dense(128, activation='relu',kernel_regularizer='l1_l2')(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[keras.metrics.CategoricalAccuracy(),keras.metrics.Precision(),keras.metrics.Recall()])
    model.summary()
    return model

In [24]:
del model

In [25]:
from tensorflow import keras
import tensorflow as tf



model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH,
                len(train_word_index)+1, EMBEDDING_DIM, len(list(le.classes_)), False)

early_stopping = EarlyStopping(monitor='val_categorical_accuracy', min_delta=0.02, patience=4, verbose=1,mode = 'max')
callbacks_list = [early_stopping]

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 300)     7186500     input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 198, 128)     115328      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 197, 128)     153728      embedding_2[0][0]                
____________________________________________________________________________________________

In [26]:
hist = model.fit(train_cnn_data, labels_train,
                 callbacks=callbacks_list, 
                 validation_data=(test_cnn_data, labels_test),
                 epochs=25, 
                 batch_size=256)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 00007: early stopping


In [27]:
model_w2v_pred = model.predict(test_cnn_data)
print(model_w2v_pred)

[[1.1470386e-05 9.9797523e-01 2.0718575e-04 ... 3.6019087e-04
  3.7425756e-04 7.6007843e-04]
 [7.8430772e-04 9.9759412e-01 3.5792589e-04 ... 6.1092719e-06
  3.9671664e-05 4.1635036e-03]
 [1.1403396e-05 1.9850924e-05 6.8085404e-09 ... 9.9995244e-01
  1.1116117e-06 2.5986486e-07]
 ...
 [4.2509735e-03 1.2557188e-05 2.6159923e-05 ... 9.4246244e-01
  3.2940507e-04 1.1045306e-04]
 [7.2537121e-05 6.4400128e-06 9.9994600e-01 ... 4.0625791e-06
  1.8525124e-04 6.4400039e-05]
 [6.1796945e-06 1.1671710e-04 2.2435084e-05 ... 9.9895710e-01
  8.2483748e-05 1.7181039e-04]]


In [20]:
cnn_nav_test = []
for i in model_w2v_pred:
    cnn_nav_test.append(np.argmax(i))
print('Accuracy:', accuracy_score(y_test, cnn_nav_test))
print('Precision:', precision_score(y_test, cnn_nav_test,average='macro'))           
print('Recall:', recall_score(y_test, cnn_nav_test,average='macro'))
print('F1:', f1_score(y_test, cnn_nav_test,average='macro'))
print('F0.5:', fbeta_score(y_test, cnn_nav_test, beta = 0.5,average='macro'))
print('F2:', fbeta_score(y_test, cnn_nav_test, beta = 2,average='macro'))

Accuracy: 0.8363780778395552
Precision: 0.8334531286613687
Recall: 0.8384656603902109
F1: 0.8345254344732432
F0.5: 0.8335448137770236
F2: 0.8365361903084823


In [28]:
cnn_nav_test = []
for i in model_w2v_pred:
    cnn_nav_test.append(np.argmax(i))
print('Accuracy:', accuracy_score(y_test, cnn_nav_test))
print('Precision:', precision_score(y_test, cnn_nav_test,average='macro'))           
print('Recall:', recall_score(y_test, cnn_nav_test,average='macro'))
print('F1:', f1_score(y_test, cnn_nav_test,average='macro'))
print('F0.5:', fbeta_score(y_test, cnn_nav_test, beta = 0.5,average='macro'))
print('F2:', fbeta_score(y_test, cnn_nav_test, beta = 2,average='macro'))

Accuracy: 0.8260524225575854
Precision: 0.8398643066774344
Recall: 0.8180115897250012
F1: 0.8256922956764837
F0.5: 0.8334146951156821
F2: 0.8202928800857534


In [10]:
cnn_nav_test = []
for i in model_w2v_pred:
    cnn_nav_test.append(np.argmax(i))
print('Accuracy:', accuracy_score(y_test, cnn_nav_test))
print('Precision:', precision_score(y_test, cnn_nav_test,average='macro'))           
print('Recall:', recall_score(y_test, cnn_nav_test,average='macro'))
print('F1:', f1_score(y_test, cnn_nav_test,average='macro'))
print('F0.5:', fbeta_score(y_test, cnn_nav_test, beta = 0.5,average='macro'))
print('F2:', fbeta_score(y_test, cnn_nav_test, beta = 2,average='macro'))

Accuracy: 0.8196981731532963
Precision: 0.8212842556445457
Recall: 0.8118389313759845
F1: 0.8158182983415764
F0.5: 0.8189132235869094
F2: 0.8132578400970272


In [21]:
model.save('cnn_lex_150.h5') #108

In [20]:
model.save('cnn_lex_150.h5') #108

In [13]:
model.save('cnn_nn_250.h5') #96

In [12]:
new_model = tf.keras.models.load_model('cnn_pgn_150_new.h5')

In [None]:
new