In [609]:
import nltk, enum, spacy
import pandas as pd
import numpy as np
nltk.download('stopwords')

from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from spacy.lang.en import English
from spacy.tokens import Doc
from datetime import datetime
from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/kegarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [610]:
STOPWORDS = set(stopwords.words('english'))
ALLOWED_POSTAGS = ['NOUN', 'ADJ', 'VERB', 'ADV', 'PART']
NLP_MODELS = {
    'en_core_web_trf': spacy.load('en_core_web_trf'),    
    'en_core_web_sm': spacy.load('en_core_web_sm'),
    'en_core_web_md': spacy.load('en_core_web_md'),
    'en_core_web_lg': spacy.load('en_core_web_lg')
}

class Sentiments(enum.Enum):
    POS = 'POS'
    NEG = 'NEG'

In [612]:
class Model:
    # not necesarry but just as a 'fyi'
    raw_data = pd.DataFrame() # constructor
    data_classes = {
        'POS': {
            'sentences': [] # array of strings
            , 'words': [] # array of arrays, each array contains each sentence splitted
            , 'words_without_stopwords': [] # same as words but without stopwords
            , 'words_1d': [] # 1d array of words
            , 'lemma': []
            , 'bow': None
            , 'ggram': None
        }
        , 'NEG': {}
    }
    stopwords = STOPWORDS # default if not given
    allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
    
    def __init__(self, df, steps, nlp_model = 'en_core_web_lg', stopwords=STOPWORDS, ngrams=2, min_count=5, threshold=10, allowed_postags=ALLOWED_POSTAGS, debug = False):
        # pandas dataframe?
        self.stopwords = stopwords
        self.raw_data = df
        self.ngram = {'min_count': min_count, 'threshold': threshold, 'ngrams': ngrams}
        self.steps = steps
        self.allowed_postags = allowed_postags
        self.debug = debug
        if nlp_model not in NLP_MODELS:
            nlp_model = 'en_core_web_lg'
        self.nlp = NLP_MODELS[nlp_model]        
    
    def fit(self):
        self.data_classes = {sentiment.value: {'sentences': self.raw_data[self.raw_data['sentiment'] == sentiment.value]['review'].values.tolist()} for sentiment in Sentiments}
        for sentiment in self.data_classes.keys():
            print(f'{datetime.now()} {sentiment} start')
            percentage = len(self.data_classes[sentiment]['sentences'])/len(self.raw_data.index)
            sentences = self.data_classes[sentiment]['sentences']
            print(f'{datetime.now()} sentences_as_words')
            result = self.sentences2words(sentences)
            
            self.data_classes[sentiment]['percentage'] = percentage
            if (self.debug):
                self.data_classes[sentiment]['sentences_as_words'] = result
            
            for step in self.steps:
                print(f'{datetime.now()} {step}')
                if step == 'remove_stopwords':
                    result = self.remove_stopwords(result)
                    if (self.debug):
                        self.data_classes[sentiment]['words_without_stopwords'] = result
                elif step == 'lemmatization':
                    result = self.lemmatization(result)
                    if (self.debug):
                        self.data_classes[sentiment]['lemmas'] = result
                elif step == 'ngram':
                    ngram_model = self.train_ngrams(result, ngrams=self.ngram['ngrams'], min_count=self.ngram['min_count'], threshold=self.ngram['threshold'])
                    if len(ngram_model)>0:
                        self.data_classes[sentiment]['ngram_model'] = ngram_model

                        result = self.create_ngrams(ngram_model, result)
                        if (self.debug):
                            self.data_classes[sentiment]['ngrams'] = result
                    else:
                        print(f'{datetime.now()}ngram not done: {self.ngram["ngram"]}')
                else:
                    print(f'{datetime.now()}instruction not found: {step}')
            
            words = self.array2dto1d(result)
            self.data_classes[sentiment]['words'] = words
            
            # if not debug remove sentences 
            if (not self.debug):
                del self.data_classes[sentiment]['sentences']
            
        print(f'{datetime.now()} probs')
            
        all_words = []
        for sentiment in self.data_classes.keys():
            all_words.append(self.data_classes[sentiment]['words'])
        self.dictionary = Dictionary(all_words)
        
        for sentiment in self.data_classes.keys():
            self.data_classes[sentiment]['bow'] = self.dictionary.doc2bow(self.data_classes[sentiment]['words'])
            self.data_classes[sentiment]['total_length'] = len(self.data_classes[sentiment]['words']) + len(self.dictionary)
            word_probs = defaultdict(lambda: np.log(1/self.data_classes[sentiment]['total_length'])) # default value
            for id, count in self.data_classes[sentiment]['bow']:
                word_probs[self.dictionary[id]] = np.log((count + 1)/self.data_classes[sentiment]['total_length']) # {'word': prob}
            self.data_classes[sentiment]['word_probs'] = word_probs

            # if not debug, remove words
            if (not self.debug):
                del self.data_classes[sentiment]['words']
                del self.data_classes[sentiment]['bow']
        print(f'{datetime.now()} end')
        
    def predict_list(self, list_of_sentences):
        results = []
        for sentence in list_of_sentences:
            results.append({'sentence': sentence, **self.predict(sentence)})
        return results
            
    def predict(self, sentence):
        sentences = [sentence]
        probs = {}
        selected = None
        current_value_selected = float('-inf')
        for sentiment in self.data_classes.keys():
            result = self.sentences2words(sentences)
            
            for step in self.steps:
                print(f'{datetime.now()} {step}')
                if step == 'remove_stopwords':
                    result = self.remove_stopwords(result)
                elif step == 'lemmatization':
                    result = self.lemmatization(result)
                elif step == 'ngram':
                    if 'ngram_model' in self.data_classes[sentiment] and len(self.data_classes[sentiment]['ngram_model'])>0:                    
                        result = self.create_ngrams(self.data_classes[sentiment]['ngram_model'], result)
                    else:
                        print(f'no ngram model found for {sentiment}')
                else:
                    print(f'instruction not found: {step}')
                print(f'{datetime.now()} {result}')
            
            prob_values = []
            for one_row in result: # remember we added the sentence to an array
                for word in one_row:
                    prob_values.append(self.data_classes[sentiment]['word_probs'][word])
            prob_values.append(np.log(self.data_classes[sentiment]['percentage']))
            probs[sentiment] = {'prob': sum(prob_values), 'probs': prob_values}
            if (probs[sentiment]['prob'] > current_value_selected):
                current_value_selected = probs[sentiment]['prob'] 
                selected = sentiment
        probs['selected'] = selected
        return probs           
    
    def sentences2words(self, sentences):
        words = []
        for sentence in sentences:
            words.append(simple_preprocess(sentence, deacc=True))
        return words
        
    def remove_stopwords(self, list_of_list_of_words):
        """receives a list of list of words [['abc', 'abc', ...], ...] """
        words = []
        for sentence_as_words in list_of_list_of_words:
            words.append([word for word in sentence_as_words if word not in self.stopwords])
        return words
            
    def lemmatization(self, list_of_list_of_words):
        words = []        
        for sentence_as_words in list_of_list_of_words:
            doc = self.nlp(' '.join(sentence_as_words))
            words.append([token.lemma_ for token in doc if token.pos_ in self.allowed_postags ])
        return words
    
    def array2dto1d(self, array2d):
        result = []
        for array1d in array2d:
            result.extend(array1d)
        return result
            
    def train_ngrams(self, list_of_list_of_words, ngrams=2, min_count=5, threshold=10):
        if ngrams < 2:
            ngrams = 2
        result = list_of_list_of_words
        ngram_models = []
        for i in range(ngrams-1):
            ngram_phraser = Phrases(result, min_count=min_count, threshold=threshold)
            ngram_model = Phraser(ngram_phraser)
            ngram_models.append(ngram_model)
            result = list(ngram_model[result])
            
        return ngram_models
    
    def create_ngrams(self, ngram_model_array, list_of_list_of_words):
        """ngram_model = []"""
        result = list_of_list_of_words
        for ngram_model in ngram_model_array:
            result = list(ngram_model[result])
        return result
        
        #dictionary.doc2idx(['abysmal', 'abuse'])
        

POS|DESCRIPTION|EXAMPLES
---|---|---
ADJ|adjective|*big, old, green, incomprehensible, first*
ADP|adposition|*in, to, during*
ADV|adverb|*very, tomorrow, down, where, there*
AUX|auxiliary|*is, has (done), will (do), should (do)*
CONJ|conjunction|*and, or, but*
CCONJ|coordinating conjunction|*and, or, but*
DET|determiner|*a, an, the*
INTJ|interjection|*psst, ouch, bravo, hello*
NOUN|noun|*girl, cat, tree, air, beauty*
NUM|numeral|*1, 2017, one, seventy-seven, IV, MMXIV*
PART|particle|*’s, not,*
PRON|pronoun|*I, you, he, she, myself, themselves, somebody*
PROPN|proper noun|*Mary, John, London, NATO, HBO*
PUNCT|punctuation|*., (, ), ?*
SCONJ|subordinating conjunction|*if, while, that*
SYM|symbol|*$, %, §, ©, +, −, ×, ÷, =, :), *
VERB|verb|*run, runs, running, eat, ate, eating*
X|other|*sfpksdpsxmsa*
SPACE|space

In [613]:
steps = [
    'remove_stopwords',
    'lemmatization',
    'ngram'
]
train = pd.read_csv('data/small.csv')
# train = pd.read_csv('data/train.csv')
# ALLOWED_POSTAGS=['NOUN', 'ADJ', 'VERB', 'ADV']
# ngrams starts from 2
train_model = Model(train, steps
                    , debug=False
                    , nlp_model='en_core_web_lg'
                    , ngrams=4
                    , min_count=1
                    , threshold=10
                    , allowed_postags=ALLOWED_POSTAGS
                   )

In [614]:
train_model.fit()

2022-07-01 14:37:38.357396 POS start
2022-07-01 14:37:38.357760 sentences_as_words
2022-07-01 14:37:38.361191 remove_stopwords
2022-07-01 14:37:38.361433 lemmatization
2022-07-01 14:37:38.598272 ngram
2022-07-01 14:37:38.610965 NEG start
2022-07-01 14:37:38.611072 sentences_as_words
2022-07-01 14:37:38.611774 remove_stopwords
2022-07-01 14:37:38.611873 lemmatization
2022-07-01 14:37:38.649939 ngram
2022-07-01 14:37:38.653645 probs
2022-07-01 14:37:38.656103 end


In [615]:
train_model.data_classes # ['POS']['ngrams']

{'POS': {'percentage': 0.8888888888888888,
  'ngram_model': [<gensim.models.phrases.FrozenPhrases at 0x7f88183d6390>,
   <gensim.models.phrases.FrozenPhrases at 0x7f88183d61d0>,
   <gensim.models.phrases.FrozenPhrases at 0x7f88189adb38>],
  'total_length': 537,
  'word_probs': defaultdict(<function __main__.Model.fit.<locals>.<lambda>()>,
              {'absolutely': -5.5928509139489195,
               'access': -5.187385805840755,
               'agency': -5.5928509139489195,
               'agree': -5.5928509139489195,
               'also': -5.187385805840755,
               'arrive': -5.5928509139489195,
               'atmosphere': -5.5928509139489195,
               'attention': -5.5928509139489195,
               'authentic': -5.5928509139489195,
               'available': -5.5928509139489195,
               'avoid': -5.5928509139489195,
               'away': -5.5928509139489195,
               'back': -5.187385805840755,
               'basic': -5.5928509139489195,
          

In [616]:
train_model.predict('the hotel was trash')

2022-07-01 14:37:58.330204 remove_stopwords
2022-07-01 14:37:58.330310 [['hotel', 'trash']]
2022-07-01 14:37:58.330357 lemmatization
2022-07-01 14:37:58.344993 [['hotel', 'trash']]
2022-07-01 14:37:58.345086 ngram
2022-07-01 14:37:58.345229 [['hotel', 'trash']]
2022-07-01 14:37:58.345379 remove_stopwords
2022-07-01 14:37:58.345433 [['hotel', 'trash']]
2022-07-01 14:37:58.345468 lemmatization
2022-07-01 14:37:58.356144 [['hotel', 'trash']]
2022-07-01 14:37:58.356570 ngram
2022-07-01 14:37:58.356720 [['hotel', 'trash']]


{'POS': {'prob': -10.694240571313639,
  'probs': [-4.899703733388974, -5.676753802268282, -0.11778303565638351]},
 'NEG': {'prob': -12.857585001312838,
  'probs': [-4.983606621708336, -5.676753802268282, -2.1972245773362196]},
 'selected': 'POS'}

In [617]:
train_model.predict_list(["the hotel is dirty and noisy", "the hotel is clean", "hello world"])

2022-07-01 14:38:02.885949 remove_stopwords
2022-07-01 14:38:02.886050 [['hotel', 'dirty', 'noisy']]
2022-07-01 14:38:02.886094 lemmatization
2022-07-01 14:38:02.901635 [['hotel', 'dirty', 'noisy']]
2022-07-01 14:38:02.901741 ngram
2022-07-01 14:38:02.901909 [['hotel', 'dirty', 'noisy']]
2022-07-01 14:38:02.902079 remove_stopwords
2022-07-01 14:38:02.902136 [['hotel', 'dirty', 'noisy']]
2022-07-01 14:38:02.902173 lemmatization
2022-07-01 14:38:02.912638 [['hotel', 'dirty', 'noisy']]
2022-07-01 14:38:02.912742 ngram
2022-07-01 14:38:02.912869 [['hotel', 'dirty', 'noisy']]
2022-07-01 14:38:02.913034 remove_stopwords
2022-07-01 14:38:02.913088 [['hotel', 'clean']]
2022-07-01 14:38:02.913124 lemmatization
2022-07-01 14:38:02.923571 [['hotel', 'clean']]
2022-07-01 14:38:02.923661 ngram
2022-07-01 14:38:02.923777 [['hotel', 'clean']]
2022-07-01 14:38:02.923904 remove_stopwords
2022-07-01 14:38:02.923958 [['hotel', 'clean']]
2022-07-01 14:38:02.923995 lemmatization
2022-07-01 14:38:02.934030 

[{'sentence': 'the hotel is dirty and noisy',
  'POS': {'prob': -16.37099437358192,
   'probs': [-4.899703733388974,
    -5.676753802268282,
    -5.676753802268282,
    -0.11778303565638351]},
  'NEG': {'prob': -18.534338803581118,
   'probs': [-4.983606621708336,
    -5.676753802268282,
    -5.676753802268282,
    -2.1972245773362196]},
  'selected': 'POS'},
 {'sentence': 'the hotel is clean',
  'POS': {'prob': -9.694046951120121,
   'probs': [-4.899703733388974, -4.676560182074764, -0.11778303565638351]},
  'NEG': {'prob': -12.857585001312838,
   'probs': [-4.983606621708336, -5.676753802268282, -2.1972245773362196]},
  'selected': 'POS'},
 {'sentence': 'hello world',
  'POS': {'prob': -5.794536837924666,
   'probs': [-5.676753802268282, -0.11778303565638351]},
  'NEG': {'prob': -7.873978379604502,
   'probs': [-5.676753802268282, -2.1972245773362196]},
  'selected': 'POS'}]

In [551]:
[token.lemma_ for token in nlp('hello world')  if token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV'] ]

['world']