In [492]:
import nltk, enum, spacy
import pandas as pd
import numpy as np
nltk.download('stopwords')

from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from spacy.lang.en import English
from spacy.tokens import Doc
from datetime import datetime
from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/kegarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [493]:
# English stopwords from NLTK as a set; default value of Model's `stopwords` argument.
STOPWORDS = set(stopwords.words('english'))
# Coarse POS tags kept during lemmatization; default value of Model's `allowed_postags` argument.
ALLOWED_POSTAGS = ['NOUN', 'ADJ', 'VERB', 'ADV']

class Sentiments(enum.Enum):
    """Sentiment labels; each member's value is matched against the 'sentiment' column in Model.fit."""
    POS = 'POS'
    NEG = 'NEG'

# Commented-out alternative spaCy pipelines; only the large English model below is loaded.
#### nlp = spacy.load('en_core_web_trf') # weird behaviour

# nlp = spacy.load('en_core_web_sm')
# nlp = spacy.load('en_core_web_trf')
# nlp = spacy.load('en_core_web_md')
# Module-level pipeline; Model.lemmatization calls it to get POS tags and lemmas.
nlp = spacy.load('en_core_web_lg')
def custom_tokenizer(text):
    """Return a spaCy ``Doc`` whose words are the pieces of ``text`` split on single spaces."""
    pieces = text.split(' ')
    return Doc(nlp.vocab, pieces)
# nlp.tokenizer = custom_tokenizer   

In [497]:
class Model:
    """Per-sentiment bag-of-words scorer using add-one smoothed log-probabilities.

    ``fit`` splits ``raw_data`` by its 'sentiment' column (one group per member
    of ``Sentiments``), runs the configured preprocessing ``steps`` on the
    'review' texts and stores, for every sentiment, a log-probability per word.
    ``predict`` sums those log-probabilities, plus the log of the class share,
    over the words of one sentence.

    After ``fit``, each ``data_classes[sentiment]`` dict holds:
        sentences                raw review strings of that sentiment
        percentage               share of ``raw_data`` rows with that sentiment
        sentences_as_words       tokenised reviews (list of lists of words)
        words_without_stopwords  only when the 'remove_stopwords' step ran
        lemmas                   only when the 'lemmatization' step ran
        ngram_model, ngrams      only when the 'ngram' step ran
        words                    flat list of words after the last step
        bow                      result of ``dictionary.doc2bow(words)``
        total_length             len(words) + len(dictionary)
        word_probs               defaultdict: word -> log((count + 1) / total_length),
                                 defaulting to log(1 / total_length) for unseen words
    """

    # Class-level placeholders, not necessary but kept as an 'fyi'.
    # __init__ and fit() replace them on the instance; the keys fit() actually
    # writes are listed in the class docstring.
    raw_data = pd.DataFrame() # constructor
    data_classes = {
        'POS': {
            'sentences': [] # array of strings
            , 'words': [] # array of arrays, each array contains one sentence split into words
            , 'words_without_stopwords': [] # same as words but without stopwords
            , 'words_1d': [] # 1d array of words
            , 'lemma': []
            , 'bow': None
            , 'ggram': None
        }
        , 'NEG': {}
    }
    stopwords = STOPWORDS # default if not given
    allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    def __init__(self, df, steps, stopwords=STOPWORDS, min_count=5, threshold=10, allowed_postags=ALLOWED_POSTAGS):
        """Store the training frame and the preprocessing configuration.

        Args:
            df: pandas DataFrame; ``fit`` reads its 'sentiment' and 'review' columns.
            steps: ordered list of step names; recognised values are
                'remove_stopwords', 'lemmatization' and 'ngram'.
            stopwords: collection of words dropped by the 'remove_stopwords' step.
            min_count: ``min_count`` passed to gensim ``Phrases`` in the 'ngram' step.
            threshold: ``threshold`` passed to gensim ``Phrases`` in the 'ngram' step.
            allowed_postags: POS tags whose tokens are kept by the 'lemmatization' step.
        """
        self.stopwords = stopwords
        self.raw_data = df
        self.ngram = {'min_count': min_count, 'threshold': threshold}
        self.steps = steps
        self.allowed_postags = allowed_postags

    def fit(self):
        """Preprocess the reviews of every sentiment and compute word log-probabilities.

        Populates ``self.data_classes`` and ``self.dictionary`` and prints a
        timestamped progress line for each stage. Returns None.
        """
        self.data_classes = {
            sentiment.value: {
                'sentences': self.raw_data[self.raw_data['sentiment'] == sentiment.value]['review'].values.tolist()
            }
            for sentiment in Sentiments
        }
        for sentiment in self.data_classes.keys():
            print(f'{datetime.now()} {sentiment} start')
            percentage = len(self.data_classes[sentiment]['sentences'])/len(self.raw_data.index)
            sentences = self.data_classes[sentiment]['sentences']
            print(f'{datetime.now()} sentences_as_words')
            result = self.sentences2words(sentences)

            self.data_classes[sentiment]['percentage'] = percentage
            self.data_classes[sentiment]['sentences_as_words'] = result

            # Each step consumes the previous step's output, so order in `steps` matters.
            for step in self.steps:
                print(f'{datetime.now()} {step}')
                if step == 'remove_stopwords':
                    result = self.remove_stopwords(result)
                    self.data_classes[sentiment]['words_without_stopwords'] = result
                elif step == 'lemmatization':
                    result = self.lemmatization(result)
                    self.data_classes[sentiment]['lemmas'] = result
                elif step == 'ngram':
                    ngram_model = self.train_ngrams(result, min_count=self.ngram['min_count'], threshold=self.ngram['threshold'])
                    self.data_classes[sentiment]['ngram_model'] = ngram_model

                    result = self.create_ngrams(ngram_model, result)
                    self.data_classes[sentiment]['ngrams'] = result
                else:
                    print(f'instruction not found: {step}')

            words = self.array2dto1d(result)
            self.data_classes[sentiment]['words'] = words

        print(f'{datetime.now()} probs')

        # One "document" per sentiment, so the dictionary covers the vocabulary of all classes.
        all_words = [self.data_classes[sentiment]['words'] for sentiment in self.data_classes.keys()]
        self.dictionary = Dictionary(all_words)

        for sentiment in self.data_classes.keys():
            class_data = self.data_classes[sentiment]
            class_data['bow'] = self.dictionary.doc2bow(class_data['words'])
            total_length = len(class_data['words']) + len(self.dictionary)
            class_data['total_length'] = total_length
            # Bind this class's total_length as a default argument: a plain closure over
            # the loop variable would make every class use the LAST sentiment's
            # total_length once the loop has finished.
            word_probs = defaultdict(lambda total_length=total_length: np.log(1/total_length)) # default value
            for token_id, count in class_data['bow']:
                word_probs[self.dictionary[token_id]] = np.log((count + 1)/total_length) # {'word': prob}
            class_data['word_probs'] = word_probs
        print(f'{datetime.now()} end')

    def predict(self, sentence):
        """Score one sentence against every fitted sentiment.

        Args:
            sentence: raw text string.

        Returns:
            dict mapping each sentiment to ``{'prob': total, 'probs': terms}``, where
            ``terms`` holds one log-probability per remaining word followed by the
            log of the class share, and ``total`` is their sum.

        Note: ``word_probs`` is a defaultdict, so looking up an unseen word stores
        its default value in the fitted model.
        """
        sentences = [sentence]
        probs = {}
        for sentiment in self.data_classes.keys():
            # Preprocessing is repeated per sentiment because each one has its own ngram model.
            result = self.sentences2words(sentences)

            for step in self.steps:
                print(f'{datetime.now()} {step}')
                if step == 'remove_stopwords':
                    result = self.remove_stopwords(result)
                elif step == 'lemmatization':
                    result = self.lemmatization(result)
                elif step == 'ngram':
                    if 'ngram_model' in self.data_classes[sentiment]:
                        result = self.create_ngrams(self.data_classes[sentiment]['ngram_model'], result)
                    else:
                        print(f'no ngram model found for {sentiment}')
                else:
                    print(f'instruction not found: {step}')
                print(f'{datetime.now()} {result}')

            word_probs = self.data_classes[sentiment]['word_probs']
            # `result` has a single row because the sentence was wrapped in a list above.
            prob_values = [word_probs[word] for one_row in result for word in one_row]
            prob_values.append(np.log(self.data_classes[sentiment]['percentage']))
            probs[sentiment] = {'prob': sum(prob_values), 'probs': prob_values}
        return probs

    def sentences2words(self, sentences):
        """Tokenise each sentence with gensim ``simple_preprocess(deacc=True)``; returns a list of word lists."""
        return [simple_preprocess(sentence, deacc=True) for sentence in sentences]

    def remove_stopwords(self, list_of_list_of_words):
        """receives a list of list of words [['abc', 'abc', ...], ...] and drops words found in ``self.stopwords``"""
        return [
            [word for word in sentence_as_words if word not in self.stopwords]
            for sentence_as_words in list_of_list_of_words
        ]

    def lemmatization(self, list_of_list_of_words):
        """Re-join each word list, run it through the module-level ``nlp`` pipeline and
        keep the lemma of every token whose ``pos_`` is in ``self.allowed_postags``."""
        words = []
        for sentence_as_words in list_of_list_of_words:
            doc = nlp(' '.join(sentence_as_words))
            words.append([token.lemma_ for token in doc if token.pos_ in self.allowed_postags])
        return words

    def array2dto1d(self, array2d):
        """Flatten a list of lists into a single list, preserving order."""
        result = []
        for array1d in array2d:
            result.extend(array1d)
        return result

    def train_ngrams(self, list_of_list_of_words, min_count=5, threshold=10):
        """Train a gensim ``Phrases`` model on the word lists and return it wrapped in ``Phraser``."""
        bigram = Phrases(list_of_list_of_words, min_count=min_count, threshold=threshold)
        bigram_mod = Phraser(bigram)
        return bigram_mod

    def create_ngrams(self, ngram_model, list_of_list_of_words):
        """Apply a trained ngram model to the word lists and return the result as a list."""
        return list(ngram_model[list_of_list_of_words])
        
        #dictionary.doc2idx(['abysmal', 'abuse'])
        

POS|DESCRIPTION|EXAMPLES
---|---|---
ADJ|adjective|*big, old, green, incomprehensible, first*
ADP|adposition|*in, to, during*
ADV|adverb|*very, tomorrow, down, where, there*
AUX|auxiliary|*is, has (done), will (do), should (do)*
CONJ|conjunction|*and, or, but*
CCONJ|coordinating conjunction|*and, or, but*
DET|determiner|*a, an, the*
INTJ|interjection|*psst, ouch, bravo, hello*
NOUN|noun|*girl, cat, tree, air, beauty*
NUM|numeral|*1, 2017, one, seventy-seven, IV, MMXIV*
PART|particle|*’s, not*
PRON|pronoun|*I, you, he, she, myself, themselves, somebody*
PROPN|proper noun|*Mary, John, London, NATO, HBO*
PUNCT|punctuation|*., (, ), ?*
SCONJ|subordinating conjunction|*if, while, that*
SYM|symbol|*$, %, §, ©, +, −, ×, ÷, =, :)*
VERB|verb|*run, runs, running, eat, ate, eating*
X|other|*sfpksdpsxmsa*
SPACE|space|

In [495]:
# Preprocessing steps, applied in this order by Model.fit and Model.predict.
steps = [
    'remove_stopwords',
    'lemmatization',
    'ngram'
]
# train = pd.read_csv('data/small.csv')
# Model.fit reads the 'sentiment' and 'review' columns of this frame.
train = pd.read_csv('data/train.csv')
# ALLOWED_POSTAGS=['NOUN', 'ADJ', 'VERB', 'ADV']
# Fixed: the original call ended with an unmatched ')' and raised a SyntaxError.
train_model = Model(train, steps, min_count=5, threshold=10, allowed_postags=ALLOWED_POSTAGS)

In [496]:
# Train on the loaded frame; fit() prints a timestamped line per sentiment and per step.
train_model.fit()

2022-07-01 07:11:52.378834 POS start
2022-07-01 07:11:52.379073 sentences_as_words
2022-07-01 07:11:56.265525 remove_stopwords
2022-07-01 07:11:56.480715 lemmatization
2022-07-01 07:15:01.589236 ngram
2022-07-01 07:15:04.101109 NEG start
2022-07-01 07:15:04.101409 sentences_as_words
2022-07-01 07:15:04.786313 remove_stopwords
2022-07-01 07:15:04.830509 lemmatization
2022-07-01 07:15:31.209033 ngram
2022-07-01 07:15:31.664149 probs
2022-07-01 07:15:32.359753 end


In [498]:
# Score one sentence; returns {sentiment: {'prob': summed log score, 'probs': individual log terms}}.
train_model.predict('the hotel was trash')

2022-07-01 07:16:23.488100 remove_stopwords
2022-07-01 07:16:23.488364 [['hotel', 'trash']]
2022-07-01 07:16:23.488411 lemmatization
2022-07-01 07:16:23.504152 [['hotel', 'trash']]
2022-07-01 07:16:23.504675 ngram
2022-07-01 07:16:23.504801 [['hotel', 'trash']]
2022-07-01 07:16:23.504968 remove_stopwords
2022-07-01 07:16:23.505582 [['hotel', 'trash']]
2022-07-01 07:16:23.505619 lemmatization
2022-07-01 07:16:23.515265 [['hotel', 'trash']]
2022-07-01 07:16:23.515356 ngram
2022-07-01 07:16:23.515439 [['hotel', 'trash']]


{'POS': {'prob': -15.303188691058208,
  'probs': [-3.786545695334639, -11.387073084546639, -0.1295699111769321]},
 'NEG': {'prob': -15.52823752827241,
  'probs': [-4.057827695896978, -9.362789605163684, -2.1076202272117475]}}

In [447]:
# Check of the POS filter used in Model.lemmatization: keep lemmas whose coarse POS tag is in the list.
[token.lemma_ for token in nlp('hello world')  if token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV'] ]

['world']