-
Notifications
You must be signed in to change notification settings - Fork 0
/
CNN.py
98 lines (89 loc) · 4.22 KB
/
CNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import numpy as np
import os
import json
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
import random as rn
import keras
import tensorflow as tf
from keras import backend as K
# Silence TensorFlow's C++ info/warning logs (2 = errors only).
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
#All this for reproducibility:
# seed every RNG the stack uses (NumPy, Python's random, TensorFlow) and
# force single-threaded TF execution, since multi-threaded op scheduling
# can make results non-deterministic run to run.
# NOTE(review): this is TF1-era API (set_random_seed/ConfigProto/Session);
# it will not run under TF2 without tf.compat.v1.
np.random.seed(1)
rn.seed(1)
tf.set_random_seed(1)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
keras.backend.set_session(sess)
# Build the corpus and sequences.
#
# The vocabulary is a sorted, de-duplicated word list read from a
# comma-separated file.  Three classes of fixed-length word sequences are
# generated from it, matching labelToName:
#   0 'ordered'   : consecutive windows over the sorted word list (wrapping)
#   1 'unordered' : words drawn at random from the vocabulary
#   2 'reversed'  : each ordered window, reversed
#
# Raw string so the Windows-path backslashes are not treated as escapes.
with open(r'E:\Xplordat\Article 1\Custom Data\words.txt', 'r') as f:
    words = sorted(set(f.read().lower().strip().split(',')))
X, labels = [], []
labelToName = { 0 : 'ordered', 1 : 'unordered', 2 : 'reversed' }
namesInLabelOrder = ['ordered', 'unordered', 'reversed']
nWords = len(words)
sequenceLength = 15
# Class 0 'ordered': every length-sequenceLength window of the sorted list...
for i in range(0, nWords - sequenceLength):
    X.append(words[i:i + sequenceLength])
    labels.append(0)
# ...plus the windows that wrap past the end back to the start, so every
# word starts exactly one ordered window.
for i in range(nWords - sequenceLength, nWords):
    X.append(words[i:nWords] + words[0:sequenceLength + i - nWords])
    labels.append(0)
nSegments = len(X)
# Class 2 'reversed': each ordered segment, reversed.
# BUG FIX: the original appended label 1 here and label 2 below, which is the
# opposite of labelToName/namesInLabelOrder — the 'unordered' and 'reversed'
# rows of every confusion matrix and classification report were swapped.
for i in range(nSegments):
    X.append(X[i][::-1])
    labels.append(2)
# Class 1 'unordered': sequences of words sampled (with replacement) from the
# vocabulary.  Loop variable renamed so it no longer shadows the index used
# inside the comprehension (the original reused `i` for both).
for _ in range(nSegments):
    randIndices = np.random.randint(0, size=sequenceLength, high=nWords)
    X.append([words[j] for j in randIndices])
    labels.append(1)
# Encode the word sequences as padded-free integer id sequences.
kTokenizer = keras.preprocessing.text.Tokenizer()
kTokenizer.fit_on_texts(X)
Xencoded = np.array([np.array(xi) for xi in kTokenizer.texts_to_sequences(X)])
labels = np.array(labels)
# Implementing the CNN Model
def getModel():
    """Build and compile the sequence classifier.

    Architecture: trainable word embedding -> Conv1D (16 filters, width 3,
    ReLU) -> max-pool (window 3) -> flatten -> softmax over the three
    sequence classes.  Compiled with Adam + categorical cross-entropy.
    """
    embedDim, denseDim = int(nWords / 4), int(nWords / 8)  # denseDim unused; kept for parity with original sizing
    stack = [
        keras.layers.embeddings.Embedding(
            input_dim=len(kTokenizer.word_index) + 1,
            output_dim=embedDim,
            input_length=sequenceLength,
            trainable=True),
        keras.layers.Conv1D(16, 3, activation='relu', padding='valid'),
        keras.layers.MaxPooling1D(3, padding='valid'),
        keras.layers.Flatten(),
        keras.layers.Dense(len(labelToName), activation='softmax'),
    ]
    model = keras.models.Sequential(stack)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    return model
#Test & Train Split: hold out a stratified 20% of the data as the fixed test set.
holdout = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
train_indices, test_indices = next(holdout.split(Xencoded, labels))
train_x, test_x = Xencoded[train_indices], Xencoded[test_indices]
# One-hot encode the integer class labels for categorical_crossentropy.
train_labels = keras.utils.to_categorical(labels[train_indices], len(labelToName))
test_labels = keras.utils.to_categorical(labels[test_indices], len(labelToName))
# Train & test over multiple train/valid sets: 10 stratified 80/20 splits of
# the training data, a fresh model per split, evaluated on the fixed test set.
# Per-split histories and metrics are accumulated in `results` and written to
# JSON at the end.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=2, mode='auto', restore_best_weights=False)
# NOTE(review): train_labels is one-hot (2-D); StratifiedShuffleSplit
# stratifies on the unique rows of a 2-D y, which here is equivalent to
# stratifying on the class index — confirm against the installed sklearn.
sss2 = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=1).split(train_x, train_labels)
results = {}
for i in range(10):
    result = {}
    train_indices_2, val_indices = next(sss2)
    model = getModel()  # fresh, untrained model for every split
    history = model.fit(x=train_x[train_indices_2], y=train_labels[train_indices_2],
                        epochs=20, batch_size=32, shuffle=True,
                        validation_data=(train_x[val_indices], train_labels[val_indices]),
                        verbose=2, callbacks=[early_stop])
    result['history'] = history.history
    test_loss, test_accuracy = model.evaluate(test_x, test_labels, verbose=2)
    result['test_loss'], result['test_accuracy'] = test_loss, test_accuracy
    print (test_loss, test_accuracy)
    predicted = model.predict(test_x, verbose=2)
    predicted_labels = predicted.argmax(axis=1)
    print("Model Summary is:", model.summary())
    # Compute the confusion matrix once and reuse it (the original called
    # confusion_matrix twice per iteration — once to print, once to store).
    cm = confusion_matrix(labels[test_indices], predicted_labels)
    print ('Confusion Matrix')
    print (cm)
    print ('Classification Report')
    print (classification_report(labels[test_indices], predicted_labels, digits=4, target_names=namesInLabelOrder))
    result['confusion_matrix'] = cm.tolist()
    result['classification_report'] = classification_report(labels[test_indices], predicted_labels, digits=4, target_names=namesInLabelOrder, output_dict=True)
    results[i] = result
# Raw string for the Windows path; the context manager guarantees the file is
# closed even if the dump raises (the original used bare open/close).
with open(r'E:\Custom Data\Results\CNN.json', 'w') as f:
    f.write(json.dumps(results, ensure_ascii=True))