In [35]:
import os
from random import shuffle
import gensim
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
import re
from os.path import join
from gensim.models.doc2vec import Doc2Vec
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from gensim.utils import simple_preprocess
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument

import logging
import multiprocessing
import tensorflow as tf
# import tensorflow.keras as keras


from csv import reader
import pandas as pd
from io import StringIO

from nltk.corpus import reuters
import nltk
nltk.download('reuters')
nltk.download('punkt')

from sklearn.preprocessing import MultiLabelBinarizer

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\eogha\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eogha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [37]:
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam

In [39]:
class D2V:
    """Thin convenience wrapper around a gensim ``Doc2Vec`` model.

    The wrapped model is created untrained in ``__init__`` and can then be
    trained, saved, reloaded and queried through the methods below.
    """

    def __init__(self):
        """Create an untrained 300-dimensional Doc2Vec model (``dm=1``)."""
        cores = multiprocessing.cpu_count()
        # Leave one core free, but never ask for fewer than one worker:
        # on a single-core machine ``cores - 1`` would be zero.
        self.__model = Doc2Vec(dm=1,
                               vector_size=300,
                               min_count=2,
                               epochs=70,
                               workers=max(1, cores - 1))

    def _doc_vectors(self):
        """Return the model's document-vector store.

        gensim 4 exposes the store as ``dv``; older releases call it
        ``docvecs``. Prefer the new attribute and fall back to the old one so
        the wrapper works with either.
        """
        model = self.__model
        vectors = getattr(model, "dv", None)
        if vectors is None:
            vectors = model.docvecs
        return vectors

    def train(self, train_corpus):
        """Build the vocabulary from ``train_corpus`` and train the model on it.

        :param train_corpus: the tagged documents passed to ``build_vocab``
            and ``train`` (it is consumed twice, so it must be re-iterable).
        :return: always ``1``; callers use it as a "finished" flag.
        """
        self.__model.build_vocab(train_corpus)
        self.__model.train(train_corpus,
                           total_examples=self.__model.corpus_count,
                           epochs=self.__model.epochs)
        return 1

    def save(self, folder_path, filename):
        """Save the model to ``join(folder_path, filename)``."""
        self.__model.save(join(folder_path, filename))

    def load(self, folder_path, filename):
        """Replace the wrapped model with the one stored at ``join(folder_path, filename)``."""
        self.__model = Doc2Vec.load(join(folder_path, filename))

    def infer_doc(self, doc):
        """Infer a vector for ``doc`` via the model's ``infer_vector``."""
        return self.__model.infer_vector(doc)

    def get_vector(self, id):
        """Return the stored vector of the training document tagged ``id``."""
        return self._doc_vectors()[id]

    def get_similar(self, doc):
        """Return the stored documents most similar to ``doc``.

        ``doc`` is passed to ``most_similar`` as the single positive example.
        """
        return self._doc_vectors().most_similar([doc])

    def get_labels(self):
        """
        Returns the labels of all documents within the Doc2Vec model
        """
        vectors = self._doc_vectors()
        # gensim 4 removed ``doctags``; the tags are listed in ``index_to_key``.
        if hasattr(vectors, "index_to_key"):
            return list(vectors.index_to_key)
        return list(vectors.doctags.keys())

    def get_doc_vec(self, identifier: str):
        """Return the stored vector for ``identifier`` (same lookup as ``get_vector``)."""
        return self.get_vector(identifier)

In [40]:
# File name under which the trained Doc2Vec model is saved and later reloaded.
model_name = "Reuters_test.d2v"

In [41]:
# Directory in which trained Doc2Vec models are stored.
# Set the DOC2VEC_MODEL_DIR environment variable to use a different location on
# another machine; the default below is the path the notebook was written with.
# (Alternative location used previously: "G:/FYP_Work/doc2vec_models")
doc2vec_model_location = os.environ.get(
    "DOC2VEC_MODEL_DIR",
    "C:/Users/eogha/Documents/Workspace/doc2vec_models",
)

In [42]:
# Build one TaggedDocument per Reuters file (every id returned by
# reuters.fileids()), tagged with its position in that listing.
taggedDocuments = []
for i, fileId in enumerate(reuters.fileids()):
    tokens = gensim.utils.simple_preprocess(reuters.raw(fileId))
    taggedDocuments.append(TaggedDocument(words=tokens, tags=[i]))

In [43]:
# Spot-check one tagged document (token list plus tag) to confirm the preprocessing output.
print(taggedDocuments[2])

TaggedDocument(['japan', 'to', 'revise', 'long', 'term', 'energy', 'demand', 'downwards', 'the', 'ministry', 'of', 'international', 'trade', 'and', 'industry', 'miti', 'will', 'revise', 'its', 'long', 'term', 'energy', 'supply', 'demand', 'outlook', 'by', 'august', 'to', 'meet', 'forecast', 'downtrend', 'in', 'japanese', 'energy', 'demand', 'ministry', 'officials', 'said', 'miti', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for', 'primary', 'energy', 'supplies', 'in', 'the', 'year', 'to', 'mln', 'kilolitres', 'kl', 'from', 'mln', 'they', 'said', 'the', 'decision', 'follows', 'the', 'emergence', 'of', 'structural', 'changes', 'in', 'japanese', 'industry', 'following', 'the', 'rise', 'in', 'the', 'value', 'of', 'the', 'yen', 'and', 'decline', 'in', 'domestic', 'electric', 'power', 'demand', 'miti', 'is', 'planning', 'to', 'work', 'out', 'revised', 'energy', 'supply', 'demand', 'outlook', 'through', 'deliberations', 'of', 'committee', 'meetings', 'of', 'the', 'agency', 'of', 'n

In [44]:
# Instantiate the wrapper; its constructor creates an untrained Doc2Vec model.
d2v = D2V()

In [45]:
# Train on the whole tagged corpus, then persist the model so later cells can reload it.
result = d2v.train(taggedDocuments)
if result == 1:
    # Make sure the target directory exists before saving; otherwise a fresh
    # machine loses the trained model to a missing-directory error.
    os.makedirs(doc2vec_model_location, exist_ok=True)
    d2v.save(doc2vec_model_location, model_name)
    print("Training Complete!")

In [46]:
# print(result)

In [47]:
def _collect_articles(prefix):
    """Return raw text and categories for every Reuters file id starting with ``prefix``."""
    articles = []
    for file_id in reuters.fileids():
        if file_id.startswith(prefix):
            articles.append({'raw': reuters.raw(file_id),
                             'categories': reuters.categories(file_id)})
    return articles


# The corpus encodes its split in the file id prefix.
train_articles = _collect_articles('training/')
test_articles = _collect_articles('test/')

In [48]:
# Reload the Doc2Vec model from the file written by the training cell.
d2v.load(doc2vec_model_location, model_name)

In [49]:
# Fit the binarizer on the category lists of every document (train and test)
# so the encoded label columns cover all categories in the corpus.
all_category_lists = [reuters.categories(file_id) for file_id in reuters.fileids()]
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit(all_category_lists)

MultiLabelBinarizer()

In [50]:
def _infer_vectors(articles):
    """Infer one Doc2Vec vector per article from its preprocessed raw text."""
    vectors = []
    for article in articles:
        tokens = gensim.utils.simple_preprocess(article['raw'])
        vectors.append(d2v.infer_doc(tokens))
    return vectors


def _binarize_categories(articles):
    """Encode each article's category list as a multi-hot row."""
    return labelBinarizer.transform([article['categories'] for article in articles])


train_data = _infer_vectors(train_articles)
print("train_data Complete")
test_data = _infer_vectors(test_articles)
print("test_data Complete")
train_labels = _binarize_categories(train_articles)
print("train_labels Complete")
test_labels = _binarize_categories(test_articles)
print("test_labels Complete")

# Convert everything to NumPy arrays for the reshape and training cells.
train_data = np.asarray(train_data)
test_data = np.asarray(test_data)
train_labels = np.asarray(train_labels)
test_labels = np.asarray(test_labels)

train_data Complete
test_data Complete
train_labels Complete
test_labels Complete


In [51]:
# Length of each document vector; matches vector_size=300 in D2V.__init__.
doc2vec_dimensions = 300

In [52]:
# Add a trailing channel axis so each vector is a (doc2vec_dimensions, 1)
# sequence, the input shape the Conv1D model expects. Uses the named constant
# instead of repeating the literal 300.
train_data = np.reshape(train_data, (len(train_data), doc2vec_dimensions, 1))
print(train_data.shape)

(7769, 300, 1)


In [53]:
# Derive the label width from the fitted binarizer rather than hard-coding 90,
# so the cell stays correct if the category set changes.
train_labels = np.reshape(train_labels, (len(train_labels), len(labelBinarizer.classes_), 1))
print(train_labels.shape)

(7769, 90, 1)


In [54]:
# Same reshapes as for the training arrays, using the named vector size and the
# binarizer's class count instead of the literals 300 and 90.
test_data = np.reshape(test_data, (len(test_data), doc2vec_dimensions, 1))
test_labels = np.reshape(test_labels, (len(test_labels), len(labelBinarizer.classes_), 1))

In [55]:
# Sanity check: expect (n_test_docs, 300, 1) after the reshape.
test_data.shape

(3019, 300, 1)

In [56]:
# Drop the trailing singleton axis added by the label reshape cells so labels
# are (n_docs, n_categories) again.
# Note: tf.squeeze returns TensorFlow tensors, not NumPy arrays, and this cell
# is not re-runnable as-is: a second run finds no size-1 last axis to squeeze.
train_labels = tf.squeeze(train_labels, axis=-1)
test_labels = tf.squeeze(test_labels, axis=-1)

In [57]:
from tensorflow.keras.layers import Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical

In [58]:
# 1D CNN that treats each document vector as a length-`doc2vec_dimensions`
# sequence with a single channel.
model_training = Sequential()
model_training.add(InputLayer(input_shape=(doc2vec_dimensions, 1)))

# Three same-padded convolutions, each followed by dropout.
model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
model_training.add(Dropout(0.5))
model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
model_training.add(Dropout(0.2))
model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
model_training.add(Dropout(0.2))

# Downsample the sequence axis by 3 before flattening.
model_training.add(MaxPooling1D(pool_size=3))
model_training.add(Dropout(0.2))
model_training.add(Flatten())

# One sigmoid unit per category: multi-label output trained with binary cross-entropy.
model_training.add(Dense(train_labels.shape[1]))
model_training.add(Activation('sigmoid'))

# NOTE(review): with a multi-unit output, the 'accuracy' string may not resolve
# to per-label binary accuracy in every Keras version - confirm which metric is
# actually reported before quoting the number.
model_training.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model_training.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 300, 32)           128       
_________________________________________________________________
dropout_4 (Dropout)          (None, 300, 32)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 300, 32)           3104      
_________________________________________________________________
dropout_5 (Dropout)          (None, 300, 32)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 300, 32)           3104      
_________________________________________________________________
dropout_6 (Dropout)          (None, 300, 32)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 100, 32)          

In [59]:
# model = Sequential()
# model.add(Dense(input_dim=doc2vec_dimensions,units = 90, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(1200, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(400, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(600, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(units = train_labels.shape[1], activation='sigmoid'))
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [60]:
# model.fit(train_data, train_labels, validation_data=(test_data, test_labels), batch_size=32, epochs=15)

In [61]:
# print(train_labels[0])

In [62]:
# Train for 50 epochs with mini-batches of 32.
# NOTE(review): the test split is passed as validation data, so the validation
# metrics shown during training are not an independent hold-out from the final
# test evaluation.
model_training.fit(train_data, train_labels, validation_data=(test_data, test_labels), batch_size=32, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1ad19016b48>

In [63]:
# Rebuild the test split and infer a fresh vector for every test article.
test_articles = []
for file_id in reuters.fileids():
    if not file_id.startswith('test/'):
        continue
    test_articles.append({'raw': reuters.raw(file_id),
                          'categories': reuters.categories(file_id)})

test_data = []
for entry in test_articles:
    tokens = gensim.utils.simple_preprocess(entry['raw'])
    test_data.append(d2v.infer_doc(tokens))

In [67]:
# Add the channel axis expected by the Conv1D model, using the named vector
# size instead of the literal 300.
test_data = np.reshape(test_data, (len(test_data), doc2vec_dimensions, 1))

In [68]:
# Per-category sigmoid scores for every test document.
predictions = model_training.predict(np.asarray(test_data))

In [69]:
# Binarise the scores in place at a 0.5 threshold. Both masks are taken from
# the untouched scores before any value is overwritten.
is_negative = predictions < 0.5
is_positive = predictions >= 0.5
predictions[is_negative] = 0
predictions[is_positive] = 1

In [70]:
# Re-fit a binarizer on every document's categories, then map the 0/1
# prediction rows back to tuples of category names.
corpus_categories = [reuters.categories(file_id) for file_id in reuters.fileids()]
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit(corpus_categories)
predicted_labels = labelBinarizer.inverse_transform(predictions)

In [71]:
import collections

print(len(predicted_labels))

# Count test articles whose predicted categories match the true categories
# exactly, ignoring order.
count = sum(
    1
    for predicted_label, test_article in zip(predicted_labels, test_articles)
    if collections.Counter(list(predicted_label)) == collections.Counter(test_article['categories'])
)
print(count)

3019
1425


In [72]:
# Fraction of test documents whose predicted label set exactly matches the true set.
print(count/len(predicted_labels))

0.4720105995362703


In [64]:
# Re-infer the test vectors and re-encode the test labels for evaluation.
test_data = []
for entry in test_articles:
    tokens = gensim.utils.simple_preprocess(entry['raw'])
    test_data.append(d2v.infer_doc(tokens))

test_category_lists = [entry['categories'] for entry in test_articles]
test_labels = labelBinarizer.transform(test_category_lists)

In [65]:
# Shape the evaluation arrays with the named vector size and the binarizer's
# class count instead of the literals 300 and 90.
test_data = np.reshape(test_data, (len(test_data), doc2vec_dimensions, 1))
test_labels = np.reshape(test_labels, (len(test_labels), len(labelBinarizer.classes_), 1))

In [66]:
# Evaluate on the test arrays; evaluate() returns the loss followed by the
# compiled metric.
loss, acc = model_training.evaluate(
    test_data,
    test_labels,
    batch_size=128,
)

print('Score: %1.4f' % loss)
print('Accuracy: %1.4f' % acc)

Score: 0.0438
Accuracy: 0.7175
