In [162]:
from gensim.models.doc2vec import Doc2Vec
import logging
from os.path import join
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
import os 
import gensim
from gensim.utils import simple_preprocess
from tkinter import filedialog
from tkinter import *
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class D2V:
    """Small convenience wrapper around a single gensim ``Doc2Vec`` model.

    The wrapped model is private to the instance; every method below simply
    forwards to it.
    """

    def __init__(self):
        # Hyper-parameters of the wrapped model, gathered in one place so they
        # are easy to compare against the saved model's file name.
        hyperparameters = dict(dm=1,
                               vector_size=40,
                               min_count=5,
                               epochs=40,
                               workers=8)
        self.__model = Doc2Vec(**hyperparameters)

    def train(self, train_corpus):
        """Build the vocabulary from ``train_corpus`` and train the model on it.

        ``train_corpus`` is iterated more than once (vocabulary pass, then
        training), so it must be re-iterable, e.g. a list.

        Returns:
            Always ``1``; callers use it as a "training finished" flag.
        """
        d2v = self.__model
        d2v.build_vocab(train_corpus)
        # corpus_count is read only after build_vocab() has run.
        d2v.train(train_corpus,
                  total_examples=d2v.corpus_count,
                  epochs=d2v.epochs)
        return 1

    def save(self, folder_path, filename):
        """Persist the model to ``folder_path/filename``."""
        target = join(folder_path, filename)
        self.__model.save(target)

    def load(self, folder_path, filename):
        """Replace the wrapped model with the one stored at ``folder_path/filename``."""
        source = join(folder_path, filename)
        self.__model = Doc2Vec.load(source)

    def infer_doc(self, doc):
        """Return the model's ``infer_vector`` result for ``doc``.

        ``doc`` is presumably a list of preprocessed tokens, as produced by
        ``simple_preprocess`` elsewhere in this notebook.
        """
        return self.__model.infer_vector(doc)

    def get_vector(self, id):
        """Look up the stored document vector for tag ``id``."""
        doc_vectors = self.__model.docvecs
        return doc_vectors[id]

    def get_similar(self, doc):
        """Forward ``doc`` (wrapped in a one-element list) to ``docvecs.most_similar``."""
        doc_vectors = self.__model.docvecs
        return doc_vectors.most_similar([doc])

    def get_labels(self):
        """Return the tags of all documents stored in the Doc2Vec model, as a list."""
        return [tag for tag in self.__model.docvecs.doctags.keys()]

    def get_doc_vec(self, identifier: str):
        """Look up the stored document vector for the tag ``identifier``."""
        doc_vectors = self.__model.docvecs
        return doc_vectors[identifier]


In [163]:


class FileReader:
    """Locates the dataset folders and turns text files into token lists.

    The training folder is expected to contain one sub-folder per topic, each
    holding the UTF-8 text documents of that topic.
    """

    # Locations used when the constructor is called without overrides.
    _DEFAULT_MODELS_PATH = "D:\\workspace\\Text-Classification-Using-Neural-Networks\\src\\test\\models\\d2v_models"
    _DEFAULT_TRAINING_PATH = "D:\\workspace\\Text-Classification-Using-Neural-Networks\\src\\test\\training"
    _DEFAULT_TESTING_PATH = "D:\\workspace\\Text-Classification-Using-Neural-Networks\\src\\test\\testing"

    def __init__(self, models_path=None, training_path=None, testing_path=None):
        """Create a reader.

        Args:
            models_path: Folder holding saved Doc2Vec models.
            training_path: Root folder of the training set (one folder per topic).
            testing_path: Root folder of the test set.

        Any argument left as ``None`` falls back to the original hard-coded
        location, so ``FileReader()`` behaves exactly as before.
        """
        self.__models_paths = self._DEFAULT_MODELS_PATH if models_path is None else models_path
        self.__training_path = self._DEFAULT_TRAINING_PATH if training_path is None else training_path
        self.__testing_path = self._DEFAULT_TESTING_PATH if testing_path is None else testing_path

    def read_corpus_train(self):
        """Yield one ``TaggedDocument`` per training file.

        Each document is tagged ``"<topic>__<index>"``, where ``<topic>`` is the
        name of its folder and ``<index>`` its position within that folder.
        """
        print("READING CORPUS")

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # topic order and the per-topic document indices stable between runs.
        # Go through each folder in the dataset; folder name = topic of its documents.
        for topic in sorted(os.listdir(self.__training_path)):
            curr_path = os.path.join(self.__training_path, topic)
            if not os.path.isdir(curr_path):
                # Stray files next to the topic folders are not topics.
                continue
            print("Current topic: {}".format(topic))

            # Create a TaggedDocument object for each document in that folder.
            for i, document in enumerate(sorted(os.listdir(curr_path))):
                with open(os.path.join(curr_path, document), mode="r", encoding="utf-8") as file:
                    content = file.read()

                doc_id = topic + "__" + str(i)
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(content), [doc_id])

    def get_models_path(self):
        """Return the folder that holds saved Doc2Vec models."""
        return self.__models_paths

    def get_training_path(self):
        """Return the root folder of the training set."""
        return self.__training_path

    def get_testing_path(self):
        """Return the root folder of the test set."""
        return self.__testing_path

    def process_new_doc(self, filename):
        """Read ``filename`` from the testing folder and return its preprocessed tokens."""
        with open(os.path.join(self.__testing_path, filename), mode="r", encoding="utf-8") as file:
            content = file.read()
        return gensim.utils.simple_preprocess(content)


In [1]:
class NN:
    """Multi-layer perceptron topic classifier plus the list of known topics.

    Topics are kept in insertion order; that order defines the positions of
    the one-hot vectors produced by ``get_topic_vector``.
    """

    def __init__(self):
        self.__topics = list()
        # NOTE(review): scikit-learn documents `learning_rate` as only used
        # with solver="sgd", so "adaptive" presumably has no effect with adam.
        self.clf = MLPClassifier(activation='logistic',
                                 learning_rate="adaptive",
                                 learning_rate_init=0.0001,
                                 solver="adam",
                                 max_iter=500)

    def train(self, x: np.ndarray, y: np.ndarray) -> None:
        """Fit the classifier on ``x``/``y`` and print its score on that same data.

        The printed number is what ``score`` returns (an accuracy, higher is
        better), not an error rate.
        """
        fitted = self.clf.fit(x, y)
        print("Training accuracy: {}".format(fitted.score(x, y)))

    def predict_probability(self, x: np.ndarray) -> np.ndarray:
        """Return the classifier's ``predict_proba`` output for ``x``."""
        return self.clf.predict_proba(x)

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Return the classifier's ``predict`` output for ``x``."""
        return self.clf.predict(x)

    def get_topics(self):
        """Return the list of registered topics (the internal list, not a copy)."""
        return self.__topics

    def add_topic(self, t: str):
        """Register topic ``t`` unless it is already known."""
        if t not in self.__topics:
            self.__topics.append(t)

    def get_topic_vector(self, t: str):
        """Return a one-hot list for ``t`` over the registered topics.

        The result has one entry per registered topic, in registration order;
        it is all zeros when ``t`` has not been registered.
        """
        return [1 if t == topic else 0 for topic in self.__topics]

NameError: name 'np' is not defined

In [228]:
# Shared collaborators used by the cells below: dataset reader, Doc2Vec
# wrapper and neural-network classifier.
reader, model, classifier = FileReader(), D2V(), NN()

# File name under which the Doc2Vec model is saved to / loaded from the models folder.
model_name = "model3_epochs40_vecsize40.d2v"

# Module-level buffers filled in place by load_trainset() / load_testset().
# Everything stored in them should be vectorized (document vectors and
# one-hot topic vectors).
train_topics, train_docs = [], []
test_topics, test_docs = [], []

In [166]:
def load_trainset():
    """Fill ``train_docs`` / ``train_topics`` from the loaded Doc2Vec model.

    Use this function when a trained Doc2Vec model exists. It assumes that a
    Doc2Vec model is already loaded into ``model``. Every distinct topic found
    in the document tags is registered with ``classifier``.

    Both module-level lists are cleared and refilled in place: ``train_docs``
    receives one stored document vector per tag, ``train_topics`` the matching
    one-hot topic vector.
    """
    doc_labels = model.get_labels()
    train_topics.clear()
    train_docs.clear()

    topic_names = []
    for label in doc_labels:
        train_docs.append(model.get_doc_vec(label))

        # Tags are built as "<topic>__<index>"; split on the LAST separator so
        # a topic name that itself contains "__" is not truncated.
        topic = label.rsplit("__", 1)[0]
        topic_names.append(topic)
        classifier.add_topic(topic)

    # One-hot encode only after every topic is registered, so that all
    # vectors have the same length.
    train_topics.extend([classifier.get_topic_vector(topic) for topic in topic_names])

In [167]:
def load_testset():
    """Fill ``test_docs`` / ``test_topics`` from the files in the testing folder.

    For every topic registered with ``classifier``, each file in the
    same-named sub-folder of ``reader.get_testing_path()`` is read,
    preprocessed and turned into a vector with ``model.infer_doc``. Both
    module-level lists are cleared and refilled in place; ``test_topics`` ends
    up holding one one-hot topic vector per document.
    """
    test_topics.clear()
    test_docs.clear()

    print("Loading test dataset")

    doc_topics = []  # topic name of each document, in test_docs order
    for topic in classifier.get_topics():
        print("Current topic: %s" % topic)
        topic_dir = os.path.join(reader.get_testing_path(), topic)

        for filename in os.listdir(topic_dir):
            with open(os.path.join(topic_dir, filename), mode="r", encoding="utf-8") as handle:
                text = handle.read()

            doc_topics.append(topic)
            test_docs.append(model.infer_doc(simple_preprocess(text)))

    # Replace topic names with their one-hot vectors once all files are read.
    test_topics.extend([classifier.get_topic_vector(name) for name in doc_topics])

    print("Finished loading test set")

In [168]:
def train_d2v():
    """Train the Doc2Vec model on the training corpus and save it.

    The corpus produced by ``reader.read_corpus_train()`` is materialised into
    a list before being handed to ``model.train``. The model is saved under
    ``model_name`` in the reader's models folder only when ``model.train``
    returns ``1``.
    """
    tagged_documents = [doc for doc in reader.read_corpus_train()]

    status = model.train(tagged_documents)
    if status != 1:
        return

    model.save(reader.get_models_path(), model_name)
    print("Doc2Vec training complete & saved!")

In [229]:
# Training Doc2Vec takes several minutes (see the log below), so reuse the
# saved model when one exists. Set FORCE_RETRAIN = True to rebuild it from the
# training corpus, e.g. after changing the data or the hyper-parameters.
FORCE_RETRAIN = False

saved_model_file = os.path.join(reader.get_models_path(), model_name)
if FORCE_RETRAIN or not os.path.isfile(saved_model_file):
    train_d2v()
else:
    model.load(reader.get_models_path(), model_name)

# Pull document vectors and one-hot topic labels out of the (trained or loaded) model.
load_trainset()

READING CORPUS
Current topic: BUSINESS
Current topic: POLITICS
Current topic: SCIENCE
Current topic: SPORTS
Current topic: TECH
Current topic: TRAVEL


2019-02-23 13:31:10,498 : INFO : collecting all words and their counts
2019-02-23 13:31:10,499 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-02-23 13:31:11,491 : INFO : PROGRESS: at example #10000, processed 6011654 words (6064244/s), 72197 word types, 10000 tags
2019-02-23 13:31:12,379 : INFO : PROGRESS: at example #20000, processed 11062429 words (5688594/s), 114910 word types, 20000 tags
2019-02-23 13:31:12,634 : INFO : collected 123821 word types and 22693 unique tags from a corpus of 22693 examples and 12453145 words
2019-02-23 13:31:12,634 : INFO : Loading a fresh vocabulary
2019-02-23 13:31:12,738 : INFO : effective_min_count=5 retains 48473 unique words (39% of original 123821, drops 75348)
2019-02-23 13:31:12,739 : INFO : effective_min_count=5 leaves 12326879 word corpus (98% of original 12453145, drops 126266)
2019-02-23 13:31:12,870 : INFO : deleting the raw counts dictionary of 123821 items
2019-02-23 13:31:12,873 : INFO : sample=0.001

2019-02-23 13:31:45,658 : INFO : EPOCH 5 - PROGRESS: at 78.54% examples, 1560181 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:31:46,659 : INFO : EPOCH 5 - PROGRESS: at 94.27% examples, 1556881 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:31:46,988 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-23 13:31:46,989 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-23 13:31:46,991 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-23 13:31:46,993 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-23 13:31:46,995 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-23 13:31:46,999 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-23 13:31:47,006 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-23 13:31:47,014 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-23 13:31:47,015 : INFO : EPOCH - 5

2019-02-23 13:32:18,353 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-23 13:32:18,354 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-23 13:32:18,357 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-23 13:32:18,359 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-23 13:32:18,367 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-23 13:32:18,373 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-23 13:32:18,374 : INFO : EPOCH - 10 : training on 12453145 raw words (9927035 effective words) took 6.2s, 1590085 effective words/s
2019-02-23 13:32:19,380 : INFO : EPOCH 11 - PROGRESS: at 14.61% examples, 1691674 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:32:20,385 : INFO : EPOCH 11 - PROGRESS: at 28.63% examples, 1713033 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:32:21,388 : INFO : EPOCH 11 - PROGRESS: at 45.52% examples, 1631320 

2019-02-23 13:32:49,893 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-23 13:32:49,894 : INFO : EPOCH - 15 : training on 12453145 raw words (9926794 effective words) took 6.2s, 1589830 effective words/s
2019-02-23 13:32:50,901 : INFO : EPOCH 16 - PROGRESS: at 14.11% examples, 1641633 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:32:51,903 : INFO : EPOCH 16 - PROGRESS: at 28.18% examples, 1681117 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:32:52,903 : INFO : EPOCH 16 - PROGRESS: at 43.82% examples, 1594523 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:32:53,908 : INFO : EPOCH 16 - PROGRESS: at 63.21% examples, 1563736 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:32:54,909 : INFO : EPOCH 16 - PROGRESS: at 78.24% examples, 1559151 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:32:55,911 : INFO : EPOCH 16 - PROGRESS: at 93.75% examples, 1553835 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:32:56,257 : INFO : worker thread finished; awaiting finish 

2019-02-23 13:33:24,427 : INFO : EPOCH 21 - PROGRESS: at 42.07% examples, 1555189 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:33:25,430 : INFO : EPOCH 21 - PROGRESS: at 61.90% examples, 1534650 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:33:26,436 : INFO : EPOCH 21 - PROGRESS: at 77.14% examples, 1536908 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:33:27,440 : INFO : EPOCH 21 - PROGRESS: at 92.53% examples, 1533674 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:33:27,861 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-23 13:33:27,865 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-23 13:33:27,870 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-23 13:33:27,873 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-23 13:33:27,876 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-23 13:33:27,879 : INFO : worker thread finished; awaiting finish of 2 mor

2019-02-23 13:33:59,478 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-23 13:33:59,485 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-23 13:33:59,486 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-23 13:33:59,487 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-23 13:33:59,490 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-23 13:33:59,495 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-23 13:33:59,502 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-23 13:33:59,508 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-23 13:33:59,509 : INFO : EPOCH - 26 : training on 12453145 raw words (9927550 effective words) took 6.3s, 1573036 effective words/s
2019-02-23 13:34:00,513 : INFO : EPOCH 27 - PROGRESS: at 14.47% examples, 1682643 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:34

2019-02-23 13:34:30,788 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-23 13:34:30,795 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-23 13:34:30,803 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-23 13:34:30,803 : INFO : EPOCH - 31 : training on 12453145 raw words (9927125 effective words) took 6.2s, 1588933 effective words/s
2019-02-23 13:34:31,811 : INFO : EPOCH 32 - PROGRESS: at 14.72% examples, 1701001 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:34:32,813 : INFO : EPOCH 32 - PROGRESS: at 28.18% examples, 1680616 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:34:33,820 : INFO : EPOCH 32 - PROGRESS: at 45.97% examples, 1639789 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:34:34,822 : INFO : EPOCH 32 - PROGRESS: at 64.74% examples, 1604423 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:34:35,823 : INFO : EPOCH 32 - PROGRESS: at 80.11% examples, 1594792 words/s, in_qsize 15, out_qsize 0
2019-02

2019-02-23 13:35:03,290 : INFO : EPOCH 37 - PROGRESS: at 14.33% examples, 1663022 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:35:04,299 : INFO : EPOCH 37 - PROGRESS: at 28.44% examples, 1689528 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:35:05,303 : INFO : EPOCH 37 - PROGRESS: at 46.34% examples, 1644330 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:35:06,314 : INFO : EPOCH 37 - PROGRESS: at 65.29% examples, 1616380 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:35:07,316 : INFO : EPOCH 37 - PROGRESS: at 80.69% examples, 1605773 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:35:08,329 : INFO : EPOCH 37 - PROGRESS: at 96.80% examples, 1590294 words/s, in_qsize 15, out_qsize 0
2019-02-23 13:35:08,497 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-23 13:35:08,498 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-23 13:35:08,499 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-23 13:35:08,501 : INFO 

Doc2Vec training complete & saved!


In [230]:
# Read every file of the test set and infer its document vector.
# Relies on the topics that load_trainset() registered with `classifier`,
# so that cell must have run first.
load_testset()

Loading test dataset
Current topic: BUSINESS
Current topic: POLITICS
Current topic: SCIENCE
Current topic: SPORTS
Current topic: TECH
Current topic: TRAVEL
Finished loading test set


# Get accuracy of Classifier

In [231]:
# Fit the existing `classifier` instead of rebinding the name to a new NN():
# a fresh instance starts with an empty topic list, so a later load_testset()
# call would find no topics and silently load nothing.
# NOTE(review): scikit-learn's fit() starts from scratch unless warm_start is
# enabled, so re-running this cell should not continue a previous fit — confirm.
classifier.train(np.array(train_docs, ndmin=2), np.array(train_topics, ndmin=2))
print("Training complete")

Training error: 0.611774556030494
Training complete




In [232]:
# Predict a label row for every inferred test-document vector.
test_guesses = classifier.predict(np.array(test_docs, ndmin=2))

In [173]:
# Inspect the expected labels: one one-hot row per test document.
np.array(test_topics, ndmin=2)

array([[1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1]])

In [233]:
# Compare predictions with the one-hot ground truth.
# NOTE(review): with 2-D indicator targets accuracy_score presumably reports
# subset accuracy (a row counts only if every column matches) — confirm.
metrics.accuracy_score(np.array(test_topics, ndmin=2), test_guesses)

0.7279973430753902

# Predicting unseen document

In [252]:
# Read and tokenise "unseen_doc.txt" from the testing folder ...
unseen_document = reader.process_new_doc("unseen_doc.txt")
# ... then replace the token list with its inferred Doc2Vec vector.
unseen_document = model.infer_doc(unseen_document)

In [253]:
# Topic names for the report.
# NOTE(review): assumed to be in the same order as the columns of the one-hot
# training targets — verify against the topic order used during training.
ts = ["BUSINESS", "POLITICS", "SCIENCE", "SPORTS", "TECH", "TRAVEL"]

# Score the single unseen document and flatten the 1 x n_topics result.
results = classifier.predict_probability(np.array(unseen_document, ndmin=2)).flatten()

# NOTE(review): the recorded percentages do not sum to 100, so these look like
# independent per-topic scores rather than one probability distribution — confirm.
for i, t in enumerate(ts):
    print("{}: {}".format(t, results[i] * 100))

BUSINESS: 22.259700268344893
POLITICS: 1.0606703990225224
SCIENCE: 26.300804590187067
SPORTS: 0.90285743159752
TECH: 10.879578648829149
TRAVEL: 13.441591196610672
