- Data: R8 train/test files from [deepnlp_classification_data.zip](https://lazyprogrammer.me/course_files/deepnlp_classification_data.zip)

In [1]:
import numpy as np
from gensim.models import KeyedVectors
import os
from sklearn.metrics.pairwise import pairwise_distances
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB

In [20]:
class EmbeddingClassifier :
    '''
    Document classifier that represents each document as the mean of the
    pre-trained word vectors of its tokens and feeds that vector to a
    scikit-learn classifier.
    -------------------------------
    params :

    embedding_type : 'word2vec' loads the GoogleNews binary vectors through
                     gensim; any other value loads the 50-d GloVe text file
                     into a plain dict {token -> np.float32 vector}
    '''
    def __init__(self, embedding_type = 'word2vec') :
        self.embedding = None
        self.type = embedding_type
        self.model = None
        self.word_net_lemmatizer = WordNetLemmatizer()

        # Read the extra stop words through a context manager so the handle is
        # closed, and strip every line: raw lines end with '\n' and would
        # otherwise never match a token.
        with open(os.path.join('..', 'NLP', 'data', 'stopwords.txt'), encoding='utf-8') as f :
            extra_stop_words = [word for word in (line.strip() for line in f) if word]
        self.stop_words = set(stopwords.words('english') + extra_stop_words)

        if embedding_type == 'word2vec' :
            self.embedding = KeyedVectors.load_word2vec_format(
                os.path.join('data', 'GoogleNews-vectors-negative300.bin'),
                binary=True
            )
        else :
            self.embedding = {}
            with open(os.path.join('data', 'glove.6B', 'glove.6B.50d.txt'), encoding='utf-8') as f:
                for line in f :
                    # Each line is "<token> <v1> <v2> ... <vN>"
                    split = line.split()
                    self.embedding[split[0]] = np.fromiter(split[1:], dtype=np.float32)

    def my_tokenize(self, s) :
        '''
        Splits a string into tokens on whitespace.
        (Lower-casing is done by the caller; no stop-word removal or
        lemmatization is applied.)
        '''
        tokens = s.split()
        return tokens

    def _embedding_dim(self) :
        '''
        Returns the dimensionality of the loaded word vectors.
        '''
        # gensim KeyedVectors expose the dimensionality as `vector_size`;
        # the GloVe branch stores a plain dict, so inspect one of its vectors.
        size = getattr(self.embedding, 'vector_size', None)
        if size is None :
            first_vector = next(iter(self.embedding.values()), None)
            if first_vector is None :
                raise ValueError('Embedding is empty, cannot infer the vector size')
            size = len(first_vector)
        return size

    def _featurize(self, documents) :
        '''
        Converts documents into a 2D array with one mean word vector per row.
        -------------------------------
        params :

        documents : List of documents(strings)

        returns : np.ndarray of shape (len(documents), embedding dimension)
        '''
        fv_means = []
        for document in documents :
            tokens = self.my_tokenize(document.lower())
            vectors = [self.embedding[token] for token in tokens if token in self.embedding]
            if vectors :
                fv_means.append(np.mean(vectors, axis=0))
            else :
                # No token is in the vocabulary: use a zero vector instead of
                # dividing by zero.
                fv_means.append(np.zeros(self._embedding_dim(), dtype=np.float32))
        return np.array(fv_means)

    def fit(self, X, Y, classifier = None, random_state = None) :
        '''
        Trains on list of document strings
        -------------------------------
        params :

        X : List of documents(strings)
        Y : List of corresponding outputs
        classifier : 'randomforest', 'decisiontree' or 'logisticregression';
                     any other value (including None) uses Extra Trees
        random_state : Optional seed forwarded to the scikit-learn estimator
                       so that a run can be reproduced
        '''
        print('Training...')
        # Assign consecutive integer ids to labels in order of first appearance.
        self.Y2idx = {}
        for y in Y :
            if y not in self.Y2idx :
                self.Y2idx[y] = len(self.Y2idx)
        Y_vectorized = [self.Y2idx[y] for y in Y]

        fv_means = self._featurize(X)
        print(f'Post processed input shape : {fv_means.shape}')

        if classifier == 'randomforest' :
            print('Using Random Forest Classifier')
            self.model = RandomForestClassifier(random_state=random_state)
        elif classifier == 'decisiontree' :
            print('Using Decision Tree Classifier')
            self.model = DecisionTreeClassifier(random_state=random_state)
        elif classifier == 'logisticregression' :
            print('Using Logistic Regression Classifier')
            self.model = LogisticRegression(max_iter=5000, random_state=random_state)
        else :
            print('Using Extra Trees Classifier')
            self.model = ExtraTreesClassifier(random_state=random_state)

        self.model.fit(fv_means, Y_vectorized)

    def score(self, X_test, Y_test) :
        '''
        Returns the score of the fitted model on the given documents
        -------------------------------
        params :

        X_test : List of documents(strings)
        Y_test : List of corresponding true outputs
        '''
        # A label that was never seen during fit() has no id. Map it to -1,
        # which the model can never predict, so the sample counts as a miss.
        # The label mapping itself is left untouched.
        Y_vectorized = [self.Y2idx.get(y, -1) for y in Y_test]
        fv_means = self._featurize(X_test)
        return self.model.score(fv_means, Y_vectorized)

    def predict(self, X_test) :
        '''
        Predicts the output label of every document
        -------------------------------
        params :

        X_test : List of documents(strings)

        returns : List of predicted labels, in the same form as the Y passed to fit()
        '''
        fv_means = self._featurize(X_test)
        predictions = self.model.predict(fv_means)
        idx2Y = {idx: label for label, idx in self.Y2idx.items()}
        return [idx2Y[prediction] for prediction in predictions]
            

In [3]:
# Load the training split: every line holds "<label>\t<document text>".
train_path = os.path.join('data', 'deepnlp_classification_data', 'r8-train-all-terms.txt')
X, Y = [], []
with open(train_path, encoding = 'utf-8') as train_file :
    for record in train_file :
        label, text = record.split('\t')
        Y.append(label)
        X.append(text)
print(f'Found {len(Y)} training samples')

Found 5485 training samples


In [4]:
# Load the test split: every line holds "<label>\t<document text>".
test_path = os.path.join('data', 'deepnlp_classification_data', 'r8-test-all-terms.txt')
X_test, Y_test = [], []
with open(test_path, encoding = 'utf-8') as test_file :
    for record in test_file :
        label, text = record.split('\t')
        Y_test.append(label)
        X_test.append(text)
print(f'Found {len(Y_test)} testing samples')

Found 2189 testing samples


In [21]:
# Build the classifier on GloVe vectors: any embedding_type other than 'word2vec' takes the GloVe branch of __init__.
model = EmbeddingClassifier(embedding_type='glove')

In [27]:
# Fit on the training documents. 'extratrees' matches none of the explicitly named classifiers in fit(), so its default Extra Trees branch runs.
model.fit(X, Y, classifier='extratrees')

Training...
Post processed input shape : (5485, 50)
Using Extra Trees Classifier


In [28]:
# Score on the same documents the model was fit on (not a measure of generalization).
model.score(X, Y)

0.9992707383773929

In [29]:
# Score on the documents loaded from the separate test file.
model.score(X_test, Y_test)

0.9337597076290544

In [30]:
# Interactively spot-check predictions on randomly drawn test documents.
# Answer 'y' at the prompt to draw another sample; anything else ends the loop.
while True :
    random_x_idx = np.random.choice(len(X_test))
    sample_text = X_test[random_x_idx]
    predicted_Y = model.predict([sample_text])[0]
    true_Y = Y_test[random_x_idx]
    print(f'True : {true_Y} Predicted : {predicted_Y}')
    print(f'Text : {sample_text}')
    # input() only drops the trailing newline, so trim other whitespace:
    # answers such as ' y' or 'y ' should still count as yes.
    continue_ = input('Continue again?').strip()
    if continue_.lower() != 'y' :
        break

True : acq Predicted : acq
Text : dominion textile calls report of bid for burlington rumor 



Continue again? y


True : acq Predicted : acq
Text : thomson grand public takes over thorn emi s audiovisual division thomson 



Continue again? n
