In [1]:
# LSTM and CNN for sequence classification on the codeforces dataset
from __future__ import print_function
import sys
sys.path.append('datasets')

import numpy
import codeforces
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# Fix the NumPy random seed for reproducibility.
# NOTE(review): this seeds NumPy only; the TensorFlow backend keeps its own
# RNG state, so runs may still differ -- confirm if exact repeatability matters.
numpy.random.seed(7)

# Hyperparameters.
top_words = 500   # vocabulary size: num_words for the loader and input dim of the Embedding
max_len = 500     # every sequence is padded/truncated to this length
embed_dim = 32    # size of each word embedding vector
lstm_size = 100   # units in every LSTM layer
epochs = 20
batch_size = 64
dropout = 0.2     # shared rate for Dropout layers and LSTM (recurrent) dropout
n_conv_lstm_blocks = 3  # number of stacked Conv1D -> MaxPooling1D -> LSTM blocks

# Load the dataset, keeping only the `top_words` most frequent words.
# NOTE(review): presumably `codeforces.load_data` mirrors the Keras dataset
# loaders (indices outside the vocabulary get replaced) -- verify in datasets/.
print('Loading data...')
(x_train, y_train), (x_test, y_test) = codeforces.load_data(num_words=top_words)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Bring every sequence to the same fixed length so they can be batched.
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Create the model: embedding, then stacked Conv1D/MaxPooling1D/LSTM blocks,
# then a single sigmoid unit for binary classification.
model = Sequential()
model.add(Embedding(top_words, embed_dim, input_length=max_len))
model.add(Dropout(dropout))
for block_idx in range(n_conv_lstm_blocks):
    model.add(Conv1D(padding="same", kernel_size=3, activation='relu', filters=64))
    model.add(MaxPooling1D(pool_size=2))
    # Every LSTM but the last must emit the full sequence so the next Conv1D
    # still receives a 3D (batch, steps, features) input.
    is_last_block = block_idx == n_conv_lstm_blocks - 1
    model.add(LSTM(lstm_size, dropout=dropout, recurrent_dropout=dropout,
                   return_sequences=not is_last_block))
model.add(Dropout(dropout))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

# Keep the History object so per-epoch loss/accuracy can be inspected later
# (assigning it also keeps the bare History repr out of the cell output).
# NOTE(review): the test split doubles as validation data here, so the val_*
# metrics are not an independent estimate if used for model selection.
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(x_test, y_test))


Using TensorFlow backend.


Loading data...
11586 train sequences
11586 test sequences
x_train shape: (11586, 500)
x_test shape: (11586, 500)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           16000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 64)           6208      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 250, 100)          66000     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 250, 64)           19264     
____________________________

<keras.callbacks.History at 0x7f8e546e2e50>

In [None]:
# Evaluate how good the model is on the test set.
#   precision = TP / (TP + FP)
#   recall    = TP / (TP + FN)
#   accuracy  = (TP + TN) / (TP + FP + TN + FN)
#   F1 score  = 2*P*R / (P + R), where P and R are precision and recall
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Round the model's outputs to hard 0/1 labels before scoring.
test_scores = model.predict(x_test)
y_preds = np.round(test_scores)

# With average='binary' the metrics are reported for the positive class only.
result = precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_preds,
    average='binary',
)
print(result)

In [None]:
# ROC curves and AUC for the test and train splits.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Per-split results, keyed by 'test' / 'train'.
fpr={}
tpr={}
roc_auc={}

# Raw model outputs (not rounded): roc_curve sweeps the threshold itself.
y_score1 = model.predict(x_test)
y_score2 = model.predict(x_train)

fpr['test'], tpr['test'], _ = roc_curve(y_test, y_score1)
roc_auc['test'] = auc(fpr['test'], tpr['test'])
fpr['train'], tpr['train'], _ = roc_curve(y_train, y_score2)
roc_auc['train'] = auc(fpr['train'], tpr['train'])

plt.figure()
lw = 2
plt.plot(fpr['test'], tpr['test'], color='aqua',
         lw=lw, label='Test ROC curve (area = %0.2f)' % roc_auc['test'])
plt.plot(fpr['train'], tpr['train'], color='darkorange',
         lw=lw, label='Train ROC curve (area = %0.2f)' % roc_auc['train'])
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

# Tick marks every 1/tick_nb, with a text label only on multiples of 0.1.
tick_nb = 20
tick_ids = range(1, tick_nb)
# float() forces true division: this file enables only print_function from
# __future__, so under Python 2 a bare int/int would collapse every tick to 0.
ticks = [i / float(tick_nb) for i in tick_ids]
# Choose the labelled ticks with exact integer arithmetic (10*i divisible by
# tick_nb) rather than testing a float remainder for equality with zero.
labels = ['%.1f' % (i / float(tick_nb)) if (i * 10) % tick_nb == 0 else ''
          for i in tick_ids]
plt.xticks(ticks, labels)
plt.yticks(ticks, labels)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC for codeforces classification')
plt.legend(loc="lower right")
plt.grid()
plt.show()