In [12]:
# from nltk.corpus import reuters
# def collection_stats():
#     # List of documents
#     documents = reuters.fileids()
#     print(str(len(documents)) + " documents");
 
#     train_docs = list(filter(lambda doc: doc.startswith("train"),
#                         documents));
#     print(str(len(train_docs)) + " total train documents");
 
#     test_docs = list(filter(lambda doc: doc.startswith("test"),
#                        documents));
#     print(str(len(test_docs)) + " total test documents");
 
#     # List of categories
#     categories = reuters.categories();
#     print(str(len(categories)) + " categories");
    
#     print(categories)

#     # Documents in a category
#     category_docs = reuters.fileids("acq");
 
#     # Words for a document
#     document_id = category_docs[0]
#     document_words = reuters.words(category_docs[0]);
#     # print(document_words);  
#     print(document_id)
#     # Raw document
#     # print(reuters.raw(document_id));

In [13]:
import os
import gensim
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
import re
from os.path import join
from gensim.models.doc2vec import Doc2Vec
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from gensim.utils import simple_preprocess
import logging
import multiprocessing

In [14]:
class FileReader:
    def __init__(self):
        # self.__testing_path = "C:/Users/eogha/Documents/Workspace/FYP_Datasets/Large_Movie_dataset/aclImdb/test"
        # self.__training_path = "C:/Users/eogha/Documents/Workspace/FYP_Datasets/Large_Movie_dataset/aclImdb/train"
        # self.__models_paths = "C:/Users/eogha/Documents/Workspace/doc2vec_models"
        self.__models_paths = "G:/FYP_Work/doc2vec_models"
        self.__training_path = "G:/FYP_Work/FYP_Datasets/Large_Movie_dataset/aclImdb/train"
        self.__testing_path = "G:/FYP_Work/FYP_Datasets/Large_Movie_dataset/aclImdb/test"

    def read_corpus_train(self):
        # path_M = r'G:/FYP_Work/FYP_Datasets/Large_Movie_dataset/aclImdb/train/pos//'
        # positive_M_files = glob.glob(path_M + "*.txt")

        folders = dirs = os.listdir(self.__training_path)

        #Go through each folder in the training dataset.
        for folder in dirs:
            print("Current folder: {}".format(folder))
            curr_path = os.path.join(self.__training_path, folder)
            docs = os.listdir(curr_path)
            for i, document in enumerate(docs):
                curr_doc_write = os.path.join(curr_path, document)
                with open(curr_doc_write, mode="r", encoding="utf-8") as file:
                    content = file.read()
                    doc_id = folder + "__" + str(i)
                    yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(content), [doc_id])

    def get_models_path(self): return self.__models_paths
    def get_training_path(self): return self.__training_path
    def get_testing_path(self): return self.__testing_path

In [15]:
class D2V:

    def __init__(self):
        cores = multiprocessing.cpu_count()
        self.__model = Doc2Vec(dm=0,
                               vector_size=300,
                               min_count=2,
                               epochs=70,
                               workers=cores - 1)

    
    def train(self, train_corpus):
        self.__model.build_vocab(train_corpus)
        self.__model.train(train_corpus, total_examples=self.__model.corpus_count, epochs=self.__model.epochs)
        return 1

    def save(self, folder_path, filename):
        self.__model.save(join(folder_path, filename))

    def load(self, folder_path, filename):
        self.__model = Doc2Vec.load(join(folder_path, filename))

    def infer_doc(self, doc):
        return self.__model.infer_vector(doc)

    def get_vector(self, id):
        return self.__model.docvecs[id]

    def get_similar(self, doc):
        return self.__model.docvecs.most_similar([doc])

    def get_labels(self):
        """
        Returns the labels of all documents within the Doc2Vec model
        """
        return list(self.__model.docvecs.doctags.keys())

    def get_doc_vec(self, identifier: str):
        return self.__model.docvecs[identifier]

In [16]:
class NN:

    def __init__(self):
        self.__topics = list()
        self.clf = MLPClassifier(activation='tanh', 
                                 learning_rate="adaptive", 
                                 learning_rate_init=0.001,
                                 solver="adam",
                                 max_iter=500)

    def train(self, x: np.array, y: np.array) -> None:
        result = self.clf.fit(x, y)
        print("Training error: {}".format(result.score(x, y)))

    def predict_probability(self, x: np.array) -> np.array:
        val = self.clf.predict_proba(x)
    
        return val

    def predict(self, x: np.array) -> np.array:
        val = self.clf.predict(x)
    
        return val
    
    def get_topics(self): return self.__top

    def add_topic(self, t: str):
        if t not in self.__topics:
            self.__topics.append(t)

    def get_topic_vector(self, t: str):
        topic_vec = list()
        for topic in self.__topics:
            if t == topic:
                topic_vec.append(1)
            else:
                topic_vec.append(0)

        return topic_vec

In [17]:
train_topics = list()
train_docs = list()
test_topics = list()
test_docs = list()

In [18]:
def load_testset():
    test_topics.clear()
    test_docs.clear()

    print("Loading test dataset")
    topics = classifier.get_topics()
    print("TOPICS: %s" % topics)

    for topic in topics:
        print("Current topic: %s" % topic)
        file_location = os.path.join(reader.get_testing_path(), topic)
        files = os.listdir(file_location)

        for file in files:
            with open(os.path.join(file_location, file), mode="r", encoding="utf-8") as file:
                content = file.read()
            cleaned_doc = simple_preprocess(content)

            test_topics.append(topic)
            test_docs.append(model.infer_doc(cleaned_doc))

    for i in range(len(test_topics)):
        test_topics[i] = classifier.get_topic_vector(test_topics[i])

    print("Finished loading test set")

In [19]:
def load_trainset():
    """
    Use this function when a trained Doc2Vec model exists. This function assumes
    that a Doc2Vec model is already loaded into the program. The NN classifier
    is used to store the necessary topics into the program.
    """

    print("Loading Training Set")
    doc_labels = model.get_labels()
    print(doc_labels)
    train_topics.clear()
    train_docs.clear()

    for label in doc_labels:
        # print(label)
        train_docs.append(model.get_doc_vec(label))
        # print(train_docs)
        split_string = label.split("__")

        train_topics.append(split_string[0])
        classifier.add_topic(split_string[0])

    for i in range(len(train_topics)):
        train_topics[i] = classifier.get_topic_vector(train_topics[i])
        # print(train_topics[i])

In [20]:
def train_d2v():
    train_corpus = list(reader.read_corpus_train())
    result = model.train(train_corpus)
    if result == 1:
        model.save(reader.get_models_path(), model_name)
        print("Training Complete!")

In [21]:
reader = FileReader()
model = D2V()
model_name = "Test_1.d2v"
classifier = NN()
# train_d2v()

In [22]:
model.load(reader.get_models_path(), model_name)
load_trainset()
classifier.train(np.array(train_docs, ndmin=2), np.array(train_topics, ndmin=2))
print("Training Complete")

Loading Training Set
['neg__0', 'neg__1', 'neg__2', 'neg__3', 'neg__4', 'neg__5', 'neg__6', 'neg__7', 'neg__8', 'neg__9', 'neg__10', 'neg__11', 'neg__12', 'neg__13', 'neg__14', 'neg__15', 'neg__16', 'neg__17', 'neg__18', 'neg__19', 'neg__20', 'neg__21', 'neg__22', 'neg__23', 'neg__24', 'neg__25', 'neg__26', 'neg__27', 'neg__28', 'neg__29', 'neg__30', 'neg__31', 'neg__32', 'neg__33', 'neg__34', 'neg__35', 'neg__36', 'neg__37', 'neg__38', 'neg__39', 'neg__40', 'neg__41', 'neg__42', 'neg__43', 'neg__44', 'neg__45', 'neg__46', 'neg__47', 'neg__48', 'neg__49', 'neg__50', 'neg__51', 'neg__52', 'neg__53', 'neg__54', 'neg__55', 'neg__56', 'neg__57', 'neg__58', 'neg__59', 'neg__60', 'neg__61', 'neg__62', 'neg__63', 'neg__64', 'neg__65', 'neg__66', 'neg__67', 'neg__68', 'neg__69', 'neg__70', 'neg__71', 'neg__72', 'neg__73', 'neg__74', 'neg__75', 'neg__76', 'neg__77', 'neg__78', 'neg__79', 'neg__80', 'neg__81', 'neg__82', 'neg__83', 'neg__84', 'neg__85', 'neg__86', 'neg__87', 'neg__88', 'neg__89'

In [23]:
load_testset()
test_guesses = classifier.predict(np.array(test_docs, ndmin=2))
print(metrics.accuracy_score(np.array(test_topics, ndmin=2),test_guesses))
print("Testing Complete")

Loading test dataset


AttributeError: 'NN' object has no attribute '_NN__top'

In [36]:
train_topics = list()
train_docs = list()
test_topics = list()
test_docs = list()

In [37]:
load_trainset()
load_testset()

Loading Training Set
Loading test dataset
TOPICS: ['neg', 'pos']
Current topic: neg
Current topic: pos
Finished loading test set


In [38]:
import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras.layers import Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import LSTM
from keras.utils import to_categorical

Using TensorFlow backend.


In [39]:
print(type(train_topics)) 
print(type(train_docs)) 
print(type(test_topics)) 
print(type(test_docs)) 

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


In [40]:
X_train = np.array(train_docs)
Y_train = np.array(test_docs)
X_test = np.array(train_topics)
Y_test = np.array(test_topics)

In [41]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(25000, 300)
(25000, 2)
(25000, 300)
(25000, 2)


In [42]:
X_train = np.reshape(X_train, (len(X_train),300,1))
X_test = np.reshape(X_test, (len(X_test),2,1))
Y_train = np.reshape(Y_train, (len(Y_train),300,1))
Y_test = np.reshape(Y_test, (len(Y_test),2,1))
# X_train.shape


In [43]:
X_test = tf.squeeze(X_test, axis=-1)
Y_test = tf.squeeze(Y_test, axis=-1)

In [44]:
# model_training = Sequential()
# # input_layer = keras.Input(shape=(300,1))
# model_training.add(InputLayer(input_shape=(300,1)))
# model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
# model_training.add(Dropout(0.2))
# model_training.add(MaxPooling1D(pool_size=3))
# model_training.add(Dropout(0.2))
# model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
# model_training.add(Dropout(0.2))
# model_training.add(MaxPooling1D(pool_size=3))
# # model_training.add(Dropout(0.2))
# # model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
# # model_training.add(Dropout(0.2))
# # model_training.add(MaxPooling1D(pool_size=3))
# # model_training.add(Dropout(0.2))
# # model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
# # model_training.add(Dropout(0.2))
# # model_training.add(MaxPooling1D(pool_size=3))
# # model_training.add(Dropout(0.2))
# #model.add(Dropout(0.2))
# model_training.add(Flatten())
# model_training.add(Dense(2))
# model_training.add(Activation('sigmoid'))
# model_training.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model_training.summary())

In [45]:
model_training = Sequential()
# input_layer = keras.Input(shape=(300,1))
model_training.add(InputLayer(input_shape=(300,1)))
model_training.add(Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
model_training.add(Dropout(0.5))
model_training.add(Dense(512))
model_training.add(Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'))
model_training.add(Dropout(0.25))
model_training.add(Dense(256))
model_training.add(Dense(128))
# model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model_training.add(Dropout(0.2))
# model_training.add(Dense(128)) 
model_training.add(MaxPooling1D(pool_size=3))
# model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model_training.add(Dropout(0.2))
# model_training.add(MaxPooling1D(pool_size=3))
# model_training.add(Dropout(0.2))
# model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
# model_training.add(Dropout(0.2))
# model_training.add(MaxPooling1D(pool_size=3))
# model_training.add(Dropout(0.2))
# model_training.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
# model_training.add(Dropout(0.2))
# model_training.add(MaxPooling1D(pool_size=3))
# model_training.add(Dropout(0.2))
#model.add(Dropout(0.2))
model_training.add(Flatten())
model_training.add(Dense(2))
model_training.add(Activation('sigmoid'))
model_training.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_training.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 300, 32)           288       
_________________________________________________________________
dropout (Dropout)            (None, 300, 32)           0         
_________________________________________________________________
dense (Dense)                (None, 300, 512)          16896     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 300, 32)           65568     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 32)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 300, 256)          8448      
_________________________________________________________________
dense_2 (Dense)              (None, 300, 128)          3

In [46]:
model_training.fit(X_train, X_test, validation_data=(Y_train, Y_test), batch_size=128, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50

KeyboardInterrupt: 

In [41]:
# Final evaluation of the model
scores = model_training.evaluate(Y_train, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 82.40%


In [42]:
scores

[0.39201077818870544, 0.82396000623703]

In [24]:
train_path = "G:/FYP_Work/FYP_Datasets/Reuters_Dataset/reuters/reuters/reuters/Train"
# train_path =  "C:/Users/eogha/Documents/Workspace/FYP_Datasets/Reuters_Dataset/reuters/reuters/reuters/Train"

In [25]:
label_path = "G:/FYP_Work/FYP_Datasets/Reuters_Dataset/reuters/reuters/reuters/Labels"
# label_path = "C:/Users/eogha/Documents/Workspace/FYP_Datasets/Reuters_Dataset/reuters/reuters/reuters/Labels"

In [26]:
from csv import reader
import pandas as pd
from io import StringIO

In [27]:
folders = dirs = os.listdir(label_path)

li = []
training_temp = []
test_temp = []
substring = "training"
#Go through each folder in the training dataset.
for folder in dirs:
    print("Current folder: {}".format(folder))
    curr_path = os.path.join(label_path, folder)
    docs = os.listdir(curr_path)
    for i, document in enumerate(docs):
        curr_doc_write = os.path.join(curr_path, document)
        with open(curr_doc_write, mode="r", encoding="utf-8") as file:
            for line in file:
                if(substring in line):
                    training_temp.append(line.split(" "))
                else:
                    test_temp.append(line.split(" "))
            doc_id = folder + "__" + str(i)
            print(doc_id)
            # yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(content), [doc_id])

Current folder: label
label__0


In [28]:
labels = np.array(training_temp)
print(labels)

[list(['training/1', 'cocoa\n'])
 list(['training/5', 'sorghum', 'oat', 'barley', 'corn', 'wheat', 'grain\n'])
 list(['training/6', 'wheat', 'sorghum', 'grain', 'sunseed', 'corn', 'oilseed', 'soybean', 'sun-oil', 'soy-oil', 'lin-oil', 'veg-oil\n'])
 ... list(['training/14785', 'money-fx\n'])
 list(['training/14805', 'copper\n']) list(['training/14818', 'ship\n'])]


In [29]:
#remove last 2 characters of the last element in the list. as it added \n to it. 
for index, array in enumerate(labels):
    x = (array[0].split("/"))
    temp_len = len(array)
    labels[index][0] = x[1]
    tmp = array[temp_len-1]
    labels[index][temp_len-1] = tmp[:-1]

In [30]:
for each in labels:
    print(each)

['1', 'cocoa']
['5', 'sorghum', 'oat', 'barley', 'corn', 'wheat', 'grain']
['6', 'wheat', 'sorghum', 'grain', 'sunseed', 'corn', 'oilseed', 'soybean', 'sun-oil', 'soy-oil', 'lin-oil', 'veg-oil']
['9', 'earn']
['10', 'acq']
['11', 'earn']
['12', 'acq', 'earn']
['13', 'earn']
['14', 'earn']
['18', 'earn']
['19', 'grain', 'wheat']
['22', 'copper']
['23', 'earn']
['24', 'earn']
['27', 'earn']
['29', 'housing']
['30', 'money-supply']
['36', 'earn']
['37', 'earn']
['38', 'earn']
['40', 'earn']
['41', 'earn']
['42', 'coffee']
['44', 'ship', 'acq']
['45', 'acq']
['46', 'sugar']
['47', 'trade']
['48', 'reserves']
['49', 'ship']
['50', 'earn']
['53', 'earn']
['56', 'earn']
['57', 'corn', 'grain']
['58', 'money-supply']
['59', 'ship']
['64', 'earn']
['65', 'earn']
['66', 'earn']
['68', 'acq']
['69', 'soy-meal', 'meal-feed', 'oilseed', 'soybean', 'veg-oil']
['71', 'earn']
['74', 'earn']
['75', 'coffee']
['76', 'money-supply']
['77', 'money-supply']
['78', 'money-supply']
['80', 'money-supply']
['8

In [31]:
from nltk.corpus import reuters
import nltk
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Eoghan\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [32]:

def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents");
 
    train_docs = list(filter(lambda doc: doc.startswith("train"),
                        documents));
    print(str(len(train_docs)) + " total train documents");
 
    test_docs = list(filter(lambda doc: doc.startswith("test"),
                       documents));
    print(str(len(test_docs)) + " total test documents");
 
    # List of categories
    categories = reuters.categories();
    print(str(len(categories)) + " categories");
    
    print(categories)

    # Documents in a category
    category_docs = reuters.fileids("acq");
 
    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0]);
    # print(document_words);  
    print(document_id)
    # Raw document
    # print(reuters.raw(document_id));

In [33]:
# create counter to summarize
categories = []
file_count = []

# count each tag's number of documents
for i in reuters.categories():
    # print("$ There are {} documents included in topic \"{}\""
    #       .format(len(reuters.fileids(i)), i))
    file_count.append(len(reuters.fileids(i)))
    categories.append(i)

# create a dataframe out of the counts
df = pd.DataFrame(
    {'categories': categories, "file_count": file_count}) \
    .sort_values('file_count', ascending=False)

In [34]:
print(df)

       categories  file_count
21           earn        3964
0             acq        2369
46       money-fx         717
26          grain         582
17          crude         578
..            ...         ...
79       sun-meal           2
28  groundnut-oil           2
70            rye           2
42        lin-oil           2
5      castor-oil           2

[90 rows x 2 columns]


In [35]:
category_filter = df.iloc[0:90, 0].values.tolist()

In [36]:
(f">>> The following categories are selected for the analysis: \
      {category_filter}")

">>> The following categories are selected for the analysis:       ['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade', 'interest', 'ship', 'wheat', 'corn', 'dlr', 'money-supply', 'oilseed', 'sugar', 'coffee', 'gnp', 'veg-oil', 'gold', 'soybean', 'nat-gas', 'bop', 'livestock', 'cpi', 'reserves', 'cocoa', 'carcass', 'jobs', 'copper', 'rice', 'yen', 'cotton', 'alum', 'gas', 'iron-steel', 'ipi', 'barley', 'rubber', 'meal-feed', 'palm-oil', 'zinc', 'sorghum', 'pet-chem', 'tin', 'lead', 'silver', 'wpi', 'orange', 'strategic-metal', 'rapeseed', 'soy-meal', 'soy-oil', 'retail', 'fuel', 'hog', 'housing', 'heat', 'income', 'lumber', 'sunseed', 'lei', 'oat', 'dmk', 'tea', 'platinum', 'groundnut', 'nickel', 'rape-oil', 'l-cattle', 'sun-oil', 'coconut-oil', 'propane', 'coconut', 'potato', 'instal-debt', 'naphtha', 'jet', 'cpu', 'nzdlr', 'rand', 'nkr', 'palladium', 'palmkernel', 'copra-cake', 'dfl', 'cotton-oil', 'sun-meal', 'groundnut-oil', 'rye', 'lin-oil', 'castor-oil']"

In [37]:
doc_list = np.array(reuters.fileids(category_filter))
doc_list = doc_list[doc_list != 'training/3267']

In [38]:
test_doc = doc_list[['test' in x for x in doc_list]]
print(">>> test_doc is created with following document names: {} ...".format(test_doc[0:5]))
train_doc = doc_list[['training' in x for x in doc_list]]
print(">>> train_doc is created with following document names: {} ...".format(train_doc[0:5]))

test_corpus = [" ".join([t for t in reuters.words(test_doc[t])])
               for t in range(len(test_doc))]
print(">>> test_corpus is created, the first line is: {} ...".format(test_corpus[0][:100]))
train_corpus = [" ".join([t for t in reuters.words(train_doc[t])])
                for t in range(len(train_doc))]
print(">>> train_corpus is created, the first line is: {} ...".format(train_corpus[0][:100]))

>>> test_doc is created with following document names: ['test/14826' 'test/14828' 'test/14829' 'test/14832' 'test/14833'] ...
>>> train_doc is created with following document names: ['training/1' 'training/10' 'training/100' 'training/1000'
 'training/10000'] ...
>>> test_corpus is created, the first line is: ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPAN RIFT Mounting trade friction between the U . S . And ...
>>> train_corpus is created, the first line is: BAHIA COCOA REVIEW Showers continued throughout the week in the Bahia cocoa zone , alleviating the d ...


In [39]:
def train_token():
    for i, document in enumerate(train_corpus):
        doc_id = "train" + "__" + str(i)
        print(doc_id)
        yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(document), [doc_id])

In [40]:
def test_token():
    for i, document in enumerate(test_corpus):
        doc_id = "test" + "__" + str(i)
        print(doc_id)
        yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(document), [doc_id])

In [41]:
train_corpus_token = list()
test_corpus_token = list()

train_corpus_token = list(train_token())
test_corpus_token = list(test_token())

train__0
train__1
train__2
train__3
train__4
train__5
train__6
train__7
train__8
train__9
train__10
train__11
train__12
train__13
train__14
train__15
train__16
train__17
train__18
train__19
train__20
train__21
train__22
train__23
train__24
train__25
train__26
train__27
train__28
train__29
train__30
train__31
train__32
train__33
train__34
train__35
train__36
train__37
train__38
train__39
train__40
train__41
train__42
train__43
train__44
train__45
train__46
train__47
train__48
train__49
train__50
train__51
train__52
train__53
train__54
train__55
train__56
train__57
train__58
train__59
train__60
train__61
train__62
train__63
train__64
train__65
train__66
train__67
train__68
train__69
train__70
train__71
train__72
train__73
train__74
train__75
train__76
train__77
train__78
train__79
train__80
train__81
train__82
train__83
train__84
train__85
train__86
train__87
train__88
train__89
train__90
train__91
train__92
train__93
train__94
train__95
train__96
train__97
train__98
train__99
train__100

In [42]:
model_name = "Reuters_Doc2Vec"
result = model.train(train_corpus_token)
if result == 1:
    model.save(reader.get_models_path(), model_name)
    print("Training Complete!")

RuntimeError: cannot sort vocabulary after model weights already initialized.

In [10]:
from sklearn.preprocessing import MultiLabelBinarizer

In [14]:
documents = reuters.fileids()

In [15]:
train_docs_id = list(filter(lambda doc: doc.startswith("train"),
                            documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"),
                           documents))

In [16]:
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

In [17]:
print(train_docs[0])

BAHIA COCOA REVIEW
  Showers continued throughout the week in
  the Bahia cocoa zone, alleviating the drought since early
  January and improving prospects for the coming temporao,
  although normal humidity levels have not been restored,
  Comissaria Smith said in its weekly review.
      The dry period means the temporao will be late this year.
      Arrivals for the week ended February 22 were 155,221 bags
  of 60 kilos making a cumulative total for the season of 5.93
  mln against 5.81 at the same stage last year. Again it seems
  that cocoa delivered earlier on consignment was included in the
  arrivals figures.
      Comissaria Smith said there is still some doubt as to how
  much old crop cocoa is still available as harvesting has
  practically come to an end. With total Bahia crop estimates
  around 6.4 mln bags and sales standing at almost 6.2 mln there
  are a few hundred thousand bags still in the hands of farmers,
  middlemen, exporters and processors.
      There are doubt

In [18]:
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id)
                                  for doc_id in train_docs_id])
test_labels = mlb.transform([reuters.categories(doc_id)
                             for doc_id in test_docs_id])

In [19]:
print(train_labels)

[[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
print(test_labels)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [21]:
def train_token():
    for i, document in enumerate(train_docs):
        doc_id = "train" + "__" + str(i)
        yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(document), [doc_id])

In [22]:
train_docs_tokens = list()
train_docs_tokens = list(train_token())

In [37]:
model_name = "Reuters_Doc2Vec"
result = model.train(train_docs_tokens)
if result == 1:
    model.save(reader.get_models_path(), model_name)
    print("Training Complete!")

Training Complete!


In [38]:
model.load(reader.get_models_path(), model_name)

In [39]:
# for document in train_docs_tokens:

In [40]:
train_topics = list()
train_docs = list()

In [41]:
def load_testing():
    """
    Use this function when a trained Doc2Vec model exists. This function assumes
    that a Doc2Vec model is already loaded into the program. The NN classifier
    is used to store the necessary topics into the program.
    """

    print("Loading Training Set")
    doc_labels = model.get_labels()
    train_topics.clear()
    train_docs.clear()

    for label in doc_labels:
        # print(label)
        train_docs.append(model.get_doc_vec(label))
        split_string = label.split("__")
        # print(split_string)

        train_topics.append(split_string[0])
        classifier.add_topic(split_string[0])
        # print(split_string[0])

    for i in range(len(train_topics)):
        train_topics[i] = classifier.get_topic_vector(train_topics[i])
        print(train_topics[i])

In [42]:
load_testing()

Loading Training Set
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]

In [43]:
classifier.train(np.array(train_docs, ndmin=2), np.array(train_topics, ndmin=2))

Training error: 1.0
