In [1]:
import keras
from keras import regularizers
import numpy as np
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import RMSprop
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential, Input
from keras.layers import Dense, Dropout, Embedding, SpatialDropout1D
from keras.layers import LSTM, Bidirectional, Activation
from keras.callbacks import ModelCheckpoint
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from sklearn.metrics import roc_auc_score 
import matplotlib.pyplot as plt 
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Hyperparameters and paths shared by the cells below.
DATA_DIR = "../data/"
output_dir = '../data/lstm/'

# Data split
test_size = 0.1
random_state = 12345

# Training
epochs = 4
batch_size = 128

# Vector-space embedding
# NOTE(review): the embedding and LSTM settings below are not referenced by
# the visible model-building cell, which hard-codes its own sizes -- confirm
# whether they should be wired in or removed.
n_dim = 64
n_unique_words = 5000
max_review_length = 100
pad_type = trunc_type = 'pre'
drop_embed = 0.25

# LSTM architecture
n_lstm = 256
drop_lstm = 0.2

# Optimizer: RMSprop is constructed where the model is compiled.
# (A stray bare `rms` expression used to sit here; it raised NameError on a
# fresh kernel and has been removed.)

In [3]:
# Load the pickled (text, label) pairs, one pair per file.
# os.listdir returns entries in arbitrary order, so the listing is sorted:
# otherwise the row order -- and with it the seeded train/test split -- can
# differ from run to run or machine to machine.
# NOTE(review): pickle.load can execute arbitrary code; only point this at
# files produced by a trusted pipeline.
pickled_dir = os.path.join(DATA_DIR, 'pickled')
data, labels = [], []
for file in tqdm(sorted(os.listdir(pickled_dir))):
    with open(os.path.join(pickled_dir, file), 'rb') as f:
        text, label = pickle.load(f)
    data.append(text)
    labels.append(label)

100%|██████████| 51915/51915 [02:19<00:00, 371.38it/s]


In [4]:
# Sanity check: number of (text, label) records loaded (one per pickled file).
len(data)

51915

In [5]:
# Encode each record's label collection as a fixed-width indicator row.
# The fitted binarizer is kept in `one_hot` for later use.
one_hot = MultiLabelBinarizer()
labels = one_hot.fit_transform(labels)

# Width of the indicator matrix, i.e. the number of distinct labels seen.
labels.shape[1]

62

In [6]:
# Convert both containers to NumPy arrays so they can be split together
# by scikit-learn in the next step.
data, labels = np.array(data), np.array(labels)


In [7]:
# Hold out `test_size` of the records; `random_state` seeds the shuffle.
train_x, test_x, train_y, test_y = train_test_split(
    data,
    labels,
    test_size=test_size,
    random_state=random_state,
)

In [8]:
# Sizes of the four splits: inputs and targets must match pairwise.
tuple(map(len, (train_x, train_y, test_x, test_y)))

(46723, 46723, 5192, 5192)

In [9]:
# Turn each (samples, labels) matrix into a list with one 1-D target vector
# per label column, which is the form a multi-output model expects.
# NOTE: this rebinds train_y/test_y to lists, so the cell cannot be re-run
# without first re-running the split above.
train_y = list(train_y.T)
test_y = list(test_y.T)

In [10]:
# Create the checkpoint directory (and any missing parents). exist_ok=True
# makes this a no-op when the directory is already there and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

In [11]:
# Vocabulary cap and fixed sequence length used by this model.
MAX_WORDS = 25000
MAX_LEN = 100
train_with_cat = []

# Fit the vocabulary on the training texts only, then map every training
# text to its sequence of integer word indices.
tok_1 = Tokenizer(num_words=MAX_WORDS)
tok_1.fit_on_texts(train_x)
sequences = tok_1.texts_to_sequences(train_x)

# Bring every sequence to exactly MAX_LEN entries so they stack into a matrix.
sequences_matrix = pad_sequences(sequences, maxlen=MAX_LEN)
def RNN_1(n_outputs=None):
    """Build the multi-output bidirectional-LSTM classifier.

    The network embeds a padded sequence of MAX_LEN word indices, runs it
    through one bidirectional LSTM, and then attaches an independent head
    (Dense(64) -> Dense(1) -> sigmoid) for every output.

    Args:
        n_outputs: Number of sigmoid output heads to create. Defaults to
            ``len(train_y)`` (the module-level list of per-label targets),
            which is what the original zero-argument call relied on.

    Returns:
        An uncompiled ``keras.models.Model`` with one input named
        ``'inputs'`` and ``n_outputs`` sigmoid outputs.
    """
    if n_outputs is None:
        # Preserve the original behaviour: one head per target vector.
        n_outputs = len(train_y)

    inputs = Input(name='inputs', shape=[MAX_LEN])
    layer = Embedding(MAX_WORDS, 64, input_length=MAX_LEN)(inputs)
    layer = Bidirectional(LSTM(64,
                               recurrent_dropout=0.1,
                               kernel_regularizer=regularizers.l2(0.000014),
                               activity_regularizer=regularizers.l1(0.00012)))(layer)

    sigmoid_lst = []
    for i in range(n_outputs):
        # Layer names are kept exactly as before so that weights saved by
        # name remain loadable.
        # NOTE(review): FC_1* has no activation, so FC_1* -> FC_2* is two
        # stacked linear maps; confirm whether a non-linearity was intended.
        layer_temp = Dense(64, name='FC_1' + str(i),
                           kernel_regularizer=regularizers.l2(0.000013))(layer)
        temp = Dense(1, name='FC_2' + str(i),
                     kernel_regularizer=regularizers.l2(0.000013))(layer_temp)
        sigmoid_lst.append(Activation('sigmoid')(temp))

    model = Model(inputs=inputs, outputs=sigmoid_lst)
    return model
# Instantiate the network and print its layer-by-layer summary.
model = RNN_1()
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 64)      1600000     inputs[0][0]                     
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 128)          66048       embedding_1[0][0]                
__________________________________________________________________________________________________
FC_10 (Dense)        

In [13]:
# Every output head is an independent binary decision, so binary
# cross-entropy is applied to each of them.
model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(lr=0.0007),
              metrics=['accuracy'])

# Write a checkpoint per epoch, keeping only those that improve on the best
# so far. os.path.join avoids the doubled slash that plain concatenation
# produced, because output_dir already ends in '/'.
modelcheckpoint = ModelCheckpoint(
    filepath=os.path.join(output_dir, "weights.{epoch:02d}.hdf5"),
    save_best_only=True)

# validation_split holds out 10% of the training matrix for validation.
model.fit(sequences_matrix,
          train_y,
          batch_size=batch_size,
          epochs=epochs,
          verbose=2,
          validation_split=0.1,
          callbacks=[modelcheckpoint])

# NOTE(review): the final model is written to the current working directory,
# not output_dir -- confirm that is intended.
model.save('LSTM2_main.h5')

Train on 42050 samples, validate on 4673 samples
Epoch 1/20
Epoch 2/20


Epoch 3/20


  128/42050 [..............................] - ETA: 51s - loss: 4.2961 - activation_1_loss: 1.5018e-04 - activation_2_loss: 0.0647 - activation_3_loss: 0.1545 - activation_4_loss: 0.0335 - activation_5_loss: 2.2392e-04 - activation_6_loss: 7.1360e-05 - activation_7_loss: 0.0456 - activation_8_loss: 0.1067 - activation_9_loss: 2.0719e-04 - activation_10_loss: 0.0965 - activation_11_loss: 0.2129 - activation_12_loss: 1.3776e-04 - activation_13_loss: 0.3368 - activation_14_loss: 0.0554 - activation_15_loss: 0.1020 - activation_16_loss: 0.2062 - activation_17_loss: 0.1615 - activation_18_loss: 0.0601 - activation_19_loss: 0.0546 - activation_20_loss: 4.9872e-06 - activation_21_loss: 0.0720 - activation_22_loss: 0.0614 - activation_23_loss: 0.0967 - activation_24_loss: 0.1120 - activation_25_loss: 6.5136e-04 - activation_26_loss: 0.0469 - activation_27_loss: 0.0359 - activation_28_loss: 0.0013 - activation_29_loss: 0.0012 - activation_30_loss: 0.0473 - activation_31_loss: 0.0320 - activatio



KeyboardInterrupt: 