## Imports

In [1]:
import re
import nltk, enum, spacy
import pandas as pd
import numpy as np
nltk.download('stopwords')

from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from spacy.lang.en import English
from spacy.tokens import Doc
from datetime import datetime

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/kegarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Cargar modelos de spacy
Se agregó como hiperparámetro el modelo de spacy, por lo que se cargaron 4 modelos diferentes

In [2]:
# English stopwords from NLTK, stored as a set for fast membership tests
STOPWORDS = set(stopwords.words('english'))
# default part-of-speech tags kept by the lemmatization step
ALLOWED_POSTAGS = ['NOUN', 'ADJ', 'VERB', 'ADV', 'PART']
# every spaCy pipeline is loaded up front so the model name can be treated as a hyper-parameter
NLP_MODELS = {
    model_name: spacy.load(model_name)
    for model_name in ('en_core_web_trf', 'en_core_web_sm', 'en_core_web_md', 'en_core_web_lg')
}

class Sentiments(enum.Enum):
    """Sentiment labels; each member's value is the string compared against the 'sentiment' column."""
    POS = 'POS'
    NEG = 'NEG'

## Model class

In [3]:
class Model:
    """
    Naive Bayes style sentiment classifier with a configurable preprocessing pipeline.

    Reviews are tokenized and passed through the steps listed in ``steps``
    ('remove_stopwords', 'lemmatization', 'ngram'; any other name is reported
    and skipped). ``fit`` turns the resulting words into per-class, add-one
    smoothed log probabilities; ``predict`` sums those log probabilities plus
    the log of the class share and selects the class with the highest total.
    """
    # not necessary but just as a 'fyi': illustrative placeholders only,
    # __init__ and fit() replace these attributes on every instance
    raw_data = pd.DataFrame() # constructor
    data_classes = {
        'POS': {
            'sentences': [] # array of strings
            , 'words': [] # array of arrays, each array contains each sentence splitted
            , 'words_without_stopwords': [] # same as words but without stopwords
            , 'words_1d': [] # 1d array of words
            , 'lemma': []
            , 'bow': None
            , 'ggram': None
        }
        , 'NEG': {}
    }
    stopwords = STOPWORDS # default if not given
    allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    def __init__(self, df, steps, nlp_model = 'en_core_web_lg', stopwords=STOPWORDS, ngrams=2, min_count=5, threshold=10, allowed_postags=ALLOWED_POSTAGS, debug = False):
        """
        df: DataFrame with a 'review' column (text) and a 'sentiment' column (Sentiments values)
        steps: ordered list of pipeline step names applied after tokenization
        nlp_model: key of NLP_MODELS; any other value (including None) falls back to 'en_core_web_lg'
        stopwords: collection of words dropped by 'remove_stopwords'
        ngrams, min_count, threshold: settings used by the 'ngram' step
        allowed_postags: POS tags kept by the 'lemmatization' step
        debug: when True, fit() keeps the intermediate results of every step

        Passing None for stopwords, ngrams, min_count, threshold or
        allowed_postags selects the default shown in the signature.
        """
        self.raw_data = df
        self.steps = steps
        self.debug = debug
        # Experiment grids pass None for hyper-parameters a pipeline does not use;
        # fall back to the defaults so a None can never reach the pipeline steps.
        self.stopwords = STOPWORDS if stopwords is None else stopwords
        self.allowed_postags = ALLOWED_POSTAGS if allowed_postags is None else allowed_postags
        self.ngram = {
            'min_count': 5 if min_count is None else min_count,
            'threshold': 10 if threshold is None else threshold,
            'ngrams': 2 if ngrams is None else ngrams,
        }
        if nlp_model not in NLP_MODELS:
            nlp_model = 'en_core_web_lg'
        self.nlp = NLP_MODELS[nlp_model]


    def fit(self):
        '''
        train model using all parameters

        For every sentiment class: tokenize its reviews, run the pipeline steps
        and flatten the result into one word list. Then build a shared gensim
        Dictionary and store, per class, the log probability of each word with
        add-one smoothing plus the value used for unseen words.
        Progress is printed with timestamps. Unless debug is set, the raw
        sentences, word lists and bags of words are deleted at the end.
        '''
        # get from raw data a df with data_classes = {'POS':{'sentences':[.. , ..]}, 'NEG':{'sentences':[.. , ..]}}
        self.data_classes = {sentiment.value: {'sentences': self.raw_data[self.raw_data['sentiment'] == sentiment.value]['review'].values.tolist()} for sentiment in Sentiments}

        # iterate for each sentiment class
        for sentiment in self.data_classes.keys():
            class_data = self.data_classes[sentiment]
            print(f'{datetime.now()} {sentiment} class start')

            # class share of the training data (ie: percentage = POS/total sentences)
            class_data['percentage'] = len(class_data['sentences'])/len(self.raw_data.index)

            # split sentences into array of words
            print(f'{datetime.now()}   sentences_as_words')
            result = self.sentences2words(class_data['sentences'])
            if (self.debug):
                class_data['sentences_as_words'] = result

            # iterate the pipeline
            for step in self.steps:
                print(f'{datetime.now()}   {step}')

                if step == 'remove_stopwords':
                    result = self.remove_stopwords(result)
                    if (self.debug):
                        class_data['words_without_stopwords'] = result

                elif step == 'lemmatization':
                    result = self.lemmatization(result)
                    if (self.debug):
                        class_data['lemmas'] = result

                elif step == 'ngram':
                    # train ngram model
                    ngram_model = self.train_ngrams(result, ngrams=self.ngram['ngrams'], min_count=self.ngram['min_count'], threshold=self.ngram['threshold'])

                    if len(ngram_model)>0:
                        class_data['ngram_model'] = ngram_model

                        # apply ngram model
                        result = self.create_ngrams(ngram_model, result)
                        if (self.debug):
                            class_data['ngrams'] = result
                    else:
                        # the settings key is 'ngrams'; the previous 'ngram' lookup raised KeyError
                        print(f'{datetime.now()} ngram not done: {self.ngram["ngrams"]}')

                else:
                    print(f'{datetime.now()} instruction not found: {step}')

            # save all words per sentiment class
            class_data['words'] = self.array2dto1d(result)

            # if not debug remove sentences
            if (not self.debug):
                del class_data['sentences']

        print(f'{datetime.now()} probs')

        # build word dictionary of all sentiment classes (one "document" per class)
        all_words = [self.data_classes[sentiment]['words'] for sentiment in self.data_classes.keys()]
        self.dictionary = Dictionary(all_words)

        # calculate word probabilities for each sentiment class
        for sentiment in self.data_classes.keys():
            class_data = self.data_classes[sentiment]

            # calculate bag of words
            class_data['bow'] = self.dictionary.doc2bow(class_data['words'])

            # add-one smoothing: denominator = words in the class + vocabulary size
            class_data['total_length'] = len(class_data['words']) + len(self.dictionary)
            # log probability assigned to a word never seen in this class
            class_data['default_value_prop'] = np.log(1/class_data['total_length'])
            class_data['word_probs'] = {
                self.dictionary[token_id]: np.log((count + 1)/class_data['total_length']) # {'word': prob}
                for token_id, count in class_data['bow']
            }

            # if not debug, remove words
            if (not self.debug):
                del class_data['words']
                del class_data['bow']
        print(f'{datetime.now()} end')


    def predict(self, sentence, debug=False):
        '''
        predict sentiment class of one sentence

        Returns a dict with one entry per class,
        {'prob': summed log probability, 'probs': individual log terms},
        plus 'selected' holding the label with the highest 'prob'.
        The last element of 'probs' is the log of the class share.
        When debug is True every step and its result are printed.
        '''
        # tokenization does not depend on the class, so it is done once
        tokenized = self.sentences2words([sentence])
        probs = {}
        selected = None
        current_value_selected = float('-inf')
        # Results of class-independent steps keyed by step position. Only the
        # 'ngram' step uses a per-class model, so everything before the first
        # 'ngram' is computed once and reused for the remaining classes.
        shared_results = {}

        # iterate for each sentiment class
        for sentiment in self.data_classes.keys():
            class_data = self.data_classes[sentiment]
            result = tokenized
            shareable = True # False once a class-specific step has run

            # apply pipeline to sentence
            for position, step in enumerate(self.steps):
                if debug:
                    print(f'{datetime.now()} {step}')
                if step in ('remove_stopwords', 'lemmatization'):
                    if shareable and position in shared_results:
                        result = shared_results[position]
                    else:
                        transform = self.remove_stopwords if step == 'remove_stopwords' else self.lemmatization
                        result = transform(result)
                        if shareable:
                            shared_results[position] = result
                elif step == 'ngram':
                    shareable = False
                    if 'ngram_model' in class_data and len(class_data['ngram_model'])>0:
                        result = self.create_ngrams(class_data['ngram_model'], result)
                    else:
                        print(f'no ngram model found for {sentiment}')
                else:
                    print(f'instruction not found: {step}')
                if debug:
                    print(f'{datetime.now()} {result}')

            # calculate sentiment probability
            # (result is a one-element list because the sentence was wrapped in a list)
            prob_values = [
                class_data['word_probs'].get(word, class_data['default_value_prop'])
                for one_row in result
                for word in one_row
            ]
            prob_values.append(np.log(class_data['percentage']))
            class_prob = sum(prob_values)
            probs[sentiment] = {'prob': class_prob, 'probs': prob_values}
            if (class_prob > current_value_selected):
                current_value_selected = class_prob
                selected = sentiment
        probs['selected'] = selected
        return probs

    def sentences2words(self, sentences):
        """
        receives a list of strings (sentences) ['hello world', 'test, sentence!']
        and returns for each sentence a split of its words: [['hello','world'], ['test','sentence']]
        using gensim simple_preprocess function (called with deacc=True)
        """
        return [simple_preprocess(sentence, deacc=True) for sentence in sentences]

    def remove_stopwords(self, list_of_list_of_words):
        """
        receives a list of list of words [['abc', 'abc', ...], ...]
        and returns new lists without the words found in self.stopwords
        """
        return [
            [word for word in sentence_as_words if word not in self.stopwords]
            for sentence_as_words in list_of_list_of_words
        ]

    def lemmatization(self, list_of_list_of_words):
        """
        receives a list of list of words [['swimming','after','playing']]
        and returns, for each sentence, the lemmas of its tokens.
        Each sentence is re-joined with spaces and parsed by self.nlp; tokens
        whose POS tag is not in self.allowed_postags are dropped, so the output
        lists can be shorter than the input lists.
        """
        words = []
        for sentence_as_words in list_of_list_of_words:
            doc = self.nlp(' '.join(sentence_as_words))
            words.append([token.lemma_ for token in doc if token.pos_ in self.allowed_postags])
        return words

    def array2dto1d(self, array2d):
        """
        receives a list of list of words [['hello','world'],['test']]
        and returns in a single list ['hello','world','test']
        """
        return [word for array1d in array2d for word in array1d]

    def train_ngrams(self, list_of_list_of_words, ngrams=2, min_count=5, threshold=10):
        """
        trains a chain of gensim phrase models and returns them as a list.
        ngrams below 2 are treated as 2, so at least one model is returned;
        model i is trained on the output of model i-1, which lets longer
        n-grams be built from already merged tokens.
        """
        if ngrams < 2:
            ngrams = 2
        result = list_of_list_of_words
        ngram_models = []
        for _ in range(ngrams-1):
            ngram_model = Phraser(Phrases(result, min_count=min_count, threshold=threshold))
            ngram_models.append(ngram_model)
            result = list(ngram_model[result])

        return ngram_models

    def create_ngrams(self, ngram_model_array, list_of_list_of_words):
        """
        applies each phrase model of ngram_model_array, in order, to the
        tokenized sentences and returns the transformed list of lists
        """
        result = list_of_list_of_words
        for ngram_model in ngram_model_array:
            result = list(ngram_model[result])
        return result

## Función para obtener las métricas de un modelo
Obtiene todas las métricas al evaluar sobre un dataframe. El dataframe debe tener las columnas review y sentiment. Calcula:
- **evaluated**: La cantidad de registros evaluados
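- **tp_rate**: Proporción de Verdaderos Positivos sobre el total de registros evaluados (tp / evaluated)
- **tn_rate**: Proporción de Verdaderos Negativos sobre el total de registros evaluados (tn / evaluated)
- **fp_rate**: Proporción de Falsos Positivos sobre el total de registros evaluados (fp / evaluated)
- **fn_rate**: Proporción de Falsos Negativos sobre el total de registros evaluados (fn / evaluated)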
- **accuracy**: (tp + tn)/(tp+fp+fn+tn)
- **precision**: tp / (tp + fp)
- **recall**: tp / (tp + fn)
- **f1**: (2 * precision * recall)/(precision + recall)

In [4]:
def get_metrics(model, df):
    """
    Evaluate ``model`` on every row of ``df`` and return the metrics as a dict.

    model: object whose ``predict(review)`` returns a dict with a 'selected' label
    df: table with a 'review' column and a 'sentiment' column

    Returns the keys 'evaluated', 'f1', 'precision', 'recall', 'accuracy',
    'tp_rate', 'tn_rate', 'fp_rate' and 'fn_rate', in that order. The four
    rates are fractions of all evaluated rows. precision, recall and f1 are -1
    when their denominator is zero; accuracy and the rates are 0 when nothing
    was evaluated, so an empty ``df`` no longer raises ZeroDivisionError.
    """
    pos_label = Sentiments.POS.value
    neg_label = Sentiments.NEG.value
    tp = fp = tn = fn = 0
    evaluated = 0

    # iterate the two needed columns directly instead of building a Series per row
    for review, original in zip(df['review'], df['sentiment']):
        predicted = model.predict(review)['selected']
        evaluated += 1

        # update the confusion matrix; any other label combination is not counted
        if predicted == pos_label and original == pos_label:
            tp += 1
        elif predicted == neg_label and original == neg_label:
            tn += 1
        elif predicted == pos_label and original == neg_label:
            fp += 1
        elif predicted == neg_label and original == pos_label:
            fn += 1

    # calculate final metrics
    classified = tp + fp + fn + tn
    accuracy = (tp + tn)/classified if classified > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else -1
    recall = tp / (tp + fn) if (tp + fn) > 0 else -1
    f1 = 2*precision*recall/(precision + recall) if (precision + recall > 0) else -1

    def rate(count):
        # same guard as accuracy: nothing evaluated -> 0 instead of a crash
        return count/evaluated if evaluated > 0 else 0

    return {
        'evaluated': evaluated,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'tp_rate': rate(tp),
        'tn_rate': rate(tn),
        'fp_rate': rate(fp),
        'fn_rate': rate(fn),
    }

In [5]:
def print_metrics(model_metrics):
    """
    Pretty-print the dict produced by get_metrics.

    Prints the number of evaluated rows, then each confusion-matrix rate with
    the count it corresponds to (evaluated * rate, shown without decimals),
    then accuracy, precision, recall and F1 with four decimals.
    """
    evaluated = model_metrics['evaluated']
    print(f'    Evaluados: {evaluated:.0f}')

    # confusion matrix
    for label, key in (('TP', 'tp_rate'), ('FP', 'fp_rate'), ('TN', 'tn_rate'), ('FN', 'fn_rate')):
        rate = model_metrics[key]
        print(f'      {label} Rate: {rate:.4f} ({evaluated * rate:.0f})')

    # calculated metrics
    for label, key in (('Accuracy', 'accuracy'), ('Precision', 'precision'), ('Recall', 'recall'), ('F1', 'f1')):
        print(f'    {label}: {model_metrics[key]:.4f}')

## Carga de datos para entrenar, validar y probar el modelo

In [6]:
# Load the three pre-split datasets; later cells read their 'review' and 'sentiment' columns.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
validation = pd.read_csv('data/validation.csv')

In [7]:
# Preview the training split.
train

Unnamed: 0,rating,review,sentiment
0,50,This is one of the best hotels I've ever staye...,POS
1,50,Everything about this hotel was awesome. The s...,POS
2,50,Our tour group stayed here for two nights. Th...,POS
3,50,Excellent service at Porta Hotel Antigua. From...,POS
4,50,I almost always stay at Hotel Antigua when I t...,POS
...,...,...,...
10972,50,I was there with a Belize delegation of about ...,POS
10973,40,Last week I stayed at the Camino Real in Antig...,POS
10974,50,My boyfriend was in Guate on business and we d...,POS
10975,40,I stayed at Camino Real Antigua for a conferen...,POS


In [8]:
# Preview the test split.
test

Unnamed: 0,rating,review,sentiment
0,50,I would definitely stay here again in Antigua....,POS
1,40,"Great location, in the heart of historic centr...",POS
2,50,"Not only the place is nice, clean and in an ex...",POS
3,50,We spent two nights and I wish we could have s...,POS
4,50,I just recently returned from Antigua and my s...,POS
...,...,...,...
2348,50,We didn't have much of a plan upon arriving to...,POS
2349,40,This hotel was very close to the parque centra...,POS
2350,50,"good experience i highly recommend, the food...",POS
2351,50,Centrally located....12 small rooms. Would vou...,POS


In [9]:
# Preview the validation split.
validation

Unnamed: 0,rating,review,sentiment
0,50,Cucuruchos is a great place to spend your time...,POS
1,30,We booked this place through Booking for one n...,NEG
2,30,An odd mix of positives and negatives : +ves h...,NEG
3,30,We had been recommended this hostel because of...,NEG
4,20,I knew this place is right underneath the Sky ...,NEG
...,...,...,...
2347,50,Very attractive rooms and grounds. The outside...,POS
2348,50,SIMPLY - Very beautiful - extremely clean - qu...,POS
2349,50,We stayed here for 3 nights and 1 night in the...,POS
2350,40,This is a very nice boutique hotel. Staff is ...,POS


## Ejemplo modelo simple
Se entrena un modelo simple para hacer pruebas del flujo completo

In [10]:
# Small end-to-end run (stopword removal + bigrams) to exercise the whole flow.
model_test = Model(
    df=train,
    steps=['remove_stopwords', 'ngram'],
    nlp_model='en_core_web_sm',
    ngrams=2,
    min_count=5,
    threshold=10,
    allowed_postags=ALLOWED_POSTAGS,
    debug=False,
)
model_test.fit()

2022-07-04 02:22:00.199936 POS class start
2022-07-04 02:22:00.200055   sentences_as_words
2022-07-04 02:22:03.991336   remove_stopwords
2022-07-04 02:22:04.193294   ngram
2022-07-04 02:22:07.686242 NEG class start
2022-07-04 02:22:07.686469   sentences_as_words
2022-07-04 02:22:08.329028   remove_stopwords
2022-07-04 02:22:08.363317   ngram
2022-07-04 02:22:09.006042 probs
2022-07-04 02:22:09.723569 end


**Sanity check** de un caso negativo y uno positivo

In [11]:
# Sanity check with a negative review: print the per-class log probabilities and the chosen label.
result = model_test.predict("the hotel was dirty and noisy")
print(result)
print(f'>> Selected class {result["selected"]}')

{'POS': {'prob': -21.14690692178164, 'probs': [-3.915511649867557, -9.478052593473269, -7.623772767263879, -0.1295699111769321]}, 'NEG': {'prob': -19.56974651206995, 'probs': [-4.28513024399396, -6.872302738240207, -6.304693302624039, -2.1076202272117475]}, 'selected': 'NEG'}
>> Selected class NEG


In [12]:
# Sanity check with a positive review: print the per-class log probabilities and the chosen label.
result = model_test.predict("the hotel was very clean and I love it! :)")
print(result)
print(f'>> Selected class {result["selected"]}')

{'POS': {'prob': -15.605211573416558, 'probs': [-3.915511649867557, -4.785817268499889, -6.77431274387218, -0.1295699111769321]}, 'NEG': {'prob': -20.27825475622301, 'probs': [-4.28513024399396, -5.520297450598949, -8.365206834418355, -2.1076202272117475]}, 'selected': 'POS'}
>> Selected class POS


Probar con la data de validación para **obtener las métricas**

In [13]:
# Evaluate the simple model on the validation split and print its metrics.
model_metrics = get_metrics(model=model_test, df=validation)
print_metrics(model_metrics)

    Evaluados: 2352
      TP Rate: 0.8410 (1978)
      FP Rate: 0.0685 (161)
      TN Rate: 0.0663 (156)
      FN Rate: 0.0242 (57)
    Accuracy: 0.9073
    Precision: 0.9247
    Recall: 0.9720
    F1: 0.9478


## Definir todos los modelos a entrenar
Se creó una lista de diccionarios con la configuración de cada modelo con el que se desea entrenar para hacer pruebas (más adelante los resultados se convierten en un dataframe)

In [14]:
# One row per experiment: (steps, ngrams, min_count, threshold, nlp_model, allowed_postags).
# Each row becomes a dict with exactly those keys, in that order.
models_to_train = [
    {
        'steps': steps
        , 'ngrams': ngrams, 'min_count': min_count, 'threshold': threshold
        , 'nlp_model': nlp_model, 'allowed_postags': allowed_postags
    }
    for steps, ngrams, min_count, threshold, nlp_model, allowed_postags in [
        # ###################################
        # stopword removal only
        # ###################################
        (['remove_stopwords'], None, None, None, None, None),

        # ###################################
        # ngrams only
        # ###################################
        (['ngram'], 2, 7, 20, None, None),
        (['ngram'], 2, 5, 10, None, None),
        (['ngram'], 2, 10, 50, None, None),
        (['ngram'], 3, 7, 20, None, None),
        (['ngram'], 4, 7, 20, None, None),

        # ###################################
        # stopwords and bigrams
        # varying min_count and threshold
        # ###################################
        (['remove_stopwords', 'ngram'], 2, 1, 5, None, None),
        (['remove_stopwords', 'ngram'], 2, 3, 5, None, None),
        (['remove_stopwords', 'ngram'], 2, 5, 10, None, None),
        (['remove_stopwords', 'ngram'], 2, 5, 20, None, None),
        (['remove_stopwords', 'ngram'], 2, 7, 30, None, None),
        (['remove_stopwords', 'ngram'], 2, 15, 30, None, None),

        # ###################################
        # stopwords and trigrams or longer
        # varying min_count and threshold
        # ###################################
        (['remove_stopwords', 'ngram'], 3, 5, 20, None, None),
        (['remove_stopwords', 'ngram'], 4, 5, 20, None, None),
        (['remove_stopwords', 'ngram'], 4, 2, 5, None, None),
        (['remove_stopwords', 'ngram'], 3, 15, 30, None, None),

        # ###################################
        # lemmatization only
        # varying the spacy model
        # ###################################
        (['lemmatization'], None, None, None, 'en_core_web_sm', ALLOWED_POSTAGS),
        (['lemmatization'], None, None, None, 'en_core_web_md', ALLOWED_POSTAGS),
        (['lemmatization'], None, None, None, 'en_core_web_lg', ALLOWED_POSTAGS),
        (['lemmatization'], None, None, None, 'en_core_web_trf', ALLOWED_POSTAGS),

        # ###################################
        # lemmatization and stopwords
        # varying the spacy model
        # ###################################
        (['remove_stopwords', 'lemmatization'], None, None, None, 'en_core_web_lg', ALLOWED_POSTAGS),
        (['remove_stopwords', 'lemmatization'], None, None, None, 'en_core_web_trf', ALLOWED_POSTAGS),

        # ###################################
        # lemmatization with ngrams
        # varying the spacy model
        # ###################################
        (['remove_stopwords', 'lemmatization', 'ngram'], 2, 5, 20, 'en_core_web_sm', ALLOWED_POSTAGS),
        (['remove_stopwords', 'lemmatization', 'ngram'], 2, 5, 20, 'en_core_web_md', ALLOWED_POSTAGS),
        (['remove_stopwords', 'lemmatization', 'ngram'], 2, 5, 20, 'en_core_web_lg', ALLOWED_POSTAGS),
        (['remove_stopwords', 'lemmatization', 'ngram'], 2, 5, 20, 'en_core_web_trf', ALLOWED_POSTAGS),
        # NOTE(review): the next two rows are identical, so the same model is
        # trained twice; presumably one of them was meant to vary a parameter - confirm
        (['remove_stopwords', 'lemmatization', 'ngram'], 3, 5, 25, 'en_core_web_lg', ALLOWED_POSTAGS),
        (['remove_stopwords', 'lemmatization', 'ngram'], 3, 5, 25, 'en_core_web_lg', ALLOWED_POSTAGS),
        (['remove_stopwords', 'lemmatization', 'ngram'], 3, 7, 25, 'en_core_web_lg', ALLOWED_POSTAGS),

        # ###################################
        # lemmatization with ngrams
        # varying the list of postags:
        # 'PART' had been added to the default
        # list, so this run tries removing it
        # ###################################
        (['remove_stopwords', 'lemmatization', 'ngram'], 2, 7, 25, 'en_core_web_lg', ['NOUN', 'ADJ', 'VERB', 'ADV']),
    ]
]


## Entrenar todos los modelos
Se obtienen todas las métricas de los modelos

In [15]:
# Train and evaluate every configuration in models_to_train. Each entry appended to
# model_results merges the configuration, the fitted model, the training time,
# the validation metrics and the evaluation time.
model_results = []
for model_to_train in models_to_train:
    print('*******************************************')
    # len(model_results) is the number of models finished so far, hence the +1
    print(f'Start training model {len(model_results)+1}/{len(models_to_train)}')
    print('*******************************************')
    print(model_to_train)
    print('')
    
    # build model
    model = Model(
        df = train
        , steps = model_to_train['steps']
        , nlp_model = model_to_train['nlp_model']
        , ngrams = model_to_train['ngrams']
        , min_count = model_to_train['min_count']
        , threshold = model_to_train['threshold']
        , allowed_postags = model_to_train['allowed_postags']
        , debug = False
    )
    
    # train model (wall-clock time is recorded in minutes)
    time_start_training = datetime.now()
    model.fit()
    # the fitted model is stored in the configuration dict itself so it can be retrieved later
    model_to_train['model'] = model
    train_min = (datetime.now()-time_start_training).total_seconds()/60.0
    model_to_train['train_min'] = train_min
    print('Training time: '+'%.2f'%train_min+' min')
    
    # test model with validation dataframe (wall-clock time is recorded in seconds)
    print('')
    print(f'{datetime.now()} validation metrics')
    time_start_eval = datetime.now()
    model_metrics = get_metrics(model=model, df=validation)
    eval_sec = (datetime.now()-time_start_eval).total_seconds()
    print_metrics(model_metrics)
    print('Evaluation time: '+'%.2f'%eval_sec+' seconds')
    
    # append results
    model_results.append({**model_to_train, **model_metrics, 'eval_sec':eval_sec})
    print('')


*******************************************
Start training model 1/30
*******************************************
{'steps': ['remove_stopwords'], 'ngrams': None, 'min_count': None, 'threshold': None, 'nlp_model': None, 'allowed_postags': None}

2022-07-04 02:22:13.191574 POS class start
2022-07-04 02:22:13.191661   sentences_as_words
2022-07-04 02:22:16.967081   remove_stopwords
2022-07-04 02:22:17.389771 NEG class start
2022-07-04 02:22:17.390568   sentences_as_words
2022-07-04 02:22:18.037089   remove_stopwords
2022-07-04 02:22:18.073900 probs
2022-07-04 02:22:18.795371 end
Training time: 0.09 min

2022-07-04 02:22:18.814349 validation metrics
    Evaluados: 2352
      TP Rate: 0.8282 (1948)
      FP Rate: 0.0514 (121)
      TN Rate: 0.0833 (196)
      FN Rate: 0.0370 (87)
    Accuracy: 0.9116
    Precision: 0.9415
    Recall: 0.9572
    F1: 0.9493
Evaluation time: 2.87 seconds

*******************************************
Start training model 2/30
************************************



2022-07-04 03:12:49.474444 NEG class start
2022-07-04 03:12:49.474910   sentences_as_words
2022-07-04 03:12:50.146339   lemmatization
2022-07-04 03:17:11.222597 probs
2022-07-04 03:17:11.888118 end
Training time: 31.00 min

2022-07-04 03:17:11.911661 validation metrics
    Evaluados: 2352
      TP Rate: 0.8265 (1944)
      FP Rate: 0.0434 (102)
      TN Rate: 0.0914 (215)
      FN Rate: 0.0387 (91)
    Accuracy: 0.9179
    Precision: 0.9501
    Recall: 0.9553
    F1: 0.9527
Evaluation time: 818.09 seconds

*******************************************
Start training model 21/30
*******************************************
{'steps': ['remove_stopwords', 'lemmatization'], 'ngrams': None, 'min_count': None, 'threshold': None, 'nlp_model': 'en_core_web_lg', 'allowed_postags': ['NOUN', 'ADJ', 'VERB', 'ADV', 'PART']}

2022-07-04 03:30:50.008580 POS class start
2022-07-04 03:30:50.008667   sentences_as_words
2022-07-04 03:30:53.938039   remove_stopwords
2022-07-04 03:30:54.223152   lemmatization

In [16]:
# One row per trained model. The 'model' column holds the fitted Model objects,
# so it is dropped for display only (drop returns a copy; df_results keeps it).
df_results = pd.DataFrame(model_results)
df_results.drop(['model'],axis=1)

Unnamed: 0,steps,ngrams,min_count,threshold,nlp_model,allowed_postags,train_min,evaluated,f1,precision,recall,accuracy,tp_rate,tn_rate,fp_rate,fn_rate,eval_sec
0,[remove_stopwords],,,,,,0.093824,2352,0.949318,0.941518,0.957248,0.911565,0.828231,0.083333,0.051446,0.03699,2.873141
1,[ngram],2.0,7.0,20.0,,,0.204473,2352,0.945039,0.935484,0.954791,0.903912,0.826105,0.077806,0.056973,0.039116,3.764681
2,[ngram],2.0,5.0,10.0,,,0.20468,2352,0.942962,0.939512,0.946437,0.900935,0.818878,0.082058,0.052721,0.046344,3.76631
3,[ngram],2.0,10.0,50.0,,,0.205255,2352,0.946986,0.933206,0.961179,0.906888,0.831633,0.075255,0.059524,0.033588,3.785317
4,[ngram],3.0,7.0,20.0,,,0.31937,2352,0.944539,0.931232,0.958231,0.902636,0.829082,0.073554,0.061224,0.036139,4.486029
5,[ngram],4.0,7.0,20.0,,,0.422243,2352,0.946937,0.929891,0.964619,0.906463,0.834609,0.071854,0.062925,0.030612,5.204129
6,"[remove_stopwords, ngram]",2.0,1.0,5.0,,,0.165792,2352,0.946522,0.912825,0.982801,0.903912,0.85034,0.053571,0.081207,0.014881,3.324612
7,"[remove_stopwords, ngram]",2.0,3.0,5.0,,,0.159784,2352,0.947243,0.921077,0.974939,0.906037,0.843537,0.0625,0.072279,0.021684,3.338723
8,"[remove_stopwords, ngram]",2.0,5.0,10.0,,,0.164007,2352,0.947772,0.924731,0.97199,0.907313,0.840986,0.066327,0.068452,0.024235,3.37142
9,"[remove_stopwords, ngram]",2.0,5.0,20.0,,,0.162186,2352,0.949038,0.928941,0.970025,0.909864,0.839286,0.070578,0.064201,0.025935,3.380885


## Buscar mejor modelo
Para el mejor modelo se seleccionó la métrica de F1, por lo que se ordenó el dataframe por F1

In [17]:
# Rank the models by F1, best first; the original row labels are kept as the index.
df_results = df_results.sort_values(by=['f1'], ascending=False)
df_results.drop(['model'],axis=1)

Unnamed: 0,steps,ngrams,min_count,threshold,nlp_model,allowed_postags,train_min,evaluated,f1,precision,recall,accuracy,tp_rate,tn_rate,fp_rate,fn_rate,eval_sec
18,[lemmatization],,,,en_core_web_lg,"[NOUN, ADJ, VERB, ADV, PART]",4.642231,2352,0.954245,0.950292,0.958231,0.920493,0.829082,0.091412,0.043367,0.036139,118.523124
19,[lemmatization],,,,en_core_web_trf,"[NOUN, ADJ, VERB, ADV, PART]",31.00459,2352,0.952708,0.950147,0.955283,0.917942,0.826531,0.091412,0.043367,0.03869,818.085331
20,"[remove_stopwords, lemmatization]",,,,en_core_web_lg,"[NOUN, ADJ, VERB, ADV, PART]",3.246148,2352,0.952033,0.943533,0.960688,0.916241,0.831207,0.085034,0.049745,0.034014,84.307367
17,[lemmatization],,,,en_core_web_md,"[NOUN, ADJ, VERB, ADV, PART]",4.674562,2352,0.951166,0.95,0.952334,0.915391,0.82398,0.091412,0.043367,0.041241,121.869415
29,"[remove_stopwords, lemmatization, ngram]",2.0,7.0,25.0,en_core_web_lg,"[NOUN, ADJ, VERB, ADV]",3.313224,2352,0.950629,0.936576,0.965111,0.913265,0.835034,0.078231,0.056548,0.030187,84.878537
21,"[remove_stopwords, lemmatization]",,,,en_core_web_trf,"[NOUN, ADJ, VERB, ADV, PART]",20.588683,2352,0.949988,0.943314,0.956757,0.91284,0.827806,0.085034,0.049745,0.037415,511.524469
22,"[remove_stopwords, lemmatization, ngram]",2.0,5.0,20.0,en_core_web_sm,"[NOUN, ADJ, VERB, ADV, PART]",3.067748,2352,0.949951,0.939452,0.960688,0.912415,0.831207,0.081207,0.053571,0.034014,78.328459
28,"[remove_stopwords, lemmatization, ngram]",3.0,7.0,25.0,en_core_web_lg,"[NOUN, ADJ, VERB, ADV, PART]",3.340769,2352,0.949602,0.932292,0.967568,0.911139,0.83716,0.07398,0.060799,0.028061,88.051997
0,[remove_stopwords],,,,,,0.093824,2352,0.949318,0.941518,0.957248,0.911565,0.828231,0.083333,0.051446,0.03699,2.873141
10,"[remove_stopwords, ngram]",2.0,7.0,30.0,,,0.166183,2352,0.949071,0.932638,0.966093,0.910289,0.835884,0.074405,0.060374,0.029337,3.450147


In [18]:
# top-ranked row (highest F1), displayed without the 'model' column
df_results.iloc[:1].drop(columns=['model'])

Unnamed: 0,steps,ngrams,min_count,threshold,nlp_model,allowed_postags,train_min,evaluated,f1,precision,recall,accuracy,tp_rate,tn_rate,fp_rate,fn_rate,eval_sec
18,[lemmatization],,,,en_core_web_lg,"[NOUN, ADJ, VERB, ADV, PART]",4.642231,2352,0.954245,0.950292,0.958231,0.920493,0.829082,0.091412,0.043367,0.036139,118.523124


In [19]:
# df_results is sorted by F1 (descending), so position 0 is the best model
best_model = df_results['model'].iloc[0]

## Sanity Check
Sanity check del mejor modelo. Se hace una prueba con un comentario positivo y uno negativo

In [20]:
# sanity check with a negative review; the 'selected' key of the returned dict holds the predicted class
result = best_model.predict("the hotel was dirty and noisy")
print(result)
print(f'>> Selected class {result["selected"]}')

{'POS': {'prob': -20.99324952021314, 'probs': [-3.952062894899984, -9.552681513548059, -7.3589352005881645, -0.1295699111769321]}, 'NEG': {'prob': -19.400038181453695, 'probs': [-4.20068884579006, -6.825672448193656, -6.266056660258234, -2.1076202272117475]}, 'selected': 'NEG'}
>> Selected class NEG


In [21]:
# sanity check with a positive review; the 'selected' key of the returned dict holds the predicted class
result = best_model.predict("the hotel was very clean and I love it! :)")
print(result)
print(f'>> Selected class {result["selected"]}')

{'POS': {'prob': -18.60460706303055, 'probs': [-3.952062894899984, -4.0390007830047745, -4.761291130350355, -5.722682343598503, -0.1295699111769321]}, 'NEG': {'prob': -23.9253527829405, 'probs': [-4.20068884579006, -4.52200098898739, -5.358499608352833, -7.736543112598472, -2.1076202272117475]}, 'selected': 'POS'}
>> Selected class POS


## Métricas en dataframe de Test
Una vez seleccionado el mejor modelo se calculan las métricas utilizando el dataframe de test

In [22]:
# evaluate the best model (highest F1 in df_results) on the test dataframe and print its metrics
model_metrics = get_metrics(model=best_model, df=test)
print_metrics(model_metrics)

    Evaluados: 2353
      TP Rate: 0.8266 (1945)
      FP Rate: 0.0378 (89)
      TN Rate: 0.0867 (204)
      FN Rate: 0.0489 (115)
    Accuracy: 0.9133
    Precision: 0.9562
    Recall: 0.9442
    F1: 0.9502


## Modelo Alternativo
Como se puede observar en la tabla de resumen donde se encuentran todos los modelos, el modelo elegido utiliza el modelo de spacy en_core_web_lg y requiere más de 4.6 min para entrenar y más de 118 segundos para procesar los datos de validación (2352 datos). Considerando esto, si la aplicación requiriera que el entrenamiento y la inferencia sean más rápidos, se podría considerar el 3er mejor modelo, ya que entrena en solo 3.2 min y hace la inferencia de los casos de validación en unos 84 segundos (1.4x más rápido), mientras que el F1 disminuye únicamente en 0.0022 con los datos de validación.

In [23]:
# df_results is sorted by F1 (descending), so position 2 is the third-best model
model_alt = df_results['model'].iloc[2]

In [24]:
# calculate test dataframe metrics with the alternative model (model_alt), not the best model
# note: this reassigns `model_metrics`, replacing the metrics computed for best_model
model_metrics = get_metrics(model=model_alt, df=test)
print_metrics(model_metrics)

    Evaluados: 2353
      TP Rate: 0.8262 (1944)
      FP Rate: 0.0387 (91)
      TN Rate: 0.0858 (202)
      FN Rate: 0.0493 (116)
    Accuracy: 0.9120
    Precision: 0.9553
    Recall: 0.9437
    F1: 0.9495


Como podemos observar, el F1 de este modelo en la data de test disminuyó únicamente en 0.0007 respecto al mejor modelo, pero tomando el 71% del tiempo en inferir los datos (29% de mejora respecto al mejor modelo), por lo que dependiendo de la aplicación también se podría utilizar este modelo.

# Análisis de resultados
De la tabla de resumen se pueden obtener las siguientes conclusiones:
- Los modelos que lematizan (con cualquier modelo de spacy) son los que entregan mejor performance
- Como se puede observar, los resultados de validation y test son bastante similares, lo que nos indica que el modelo generalizó bastante bien (no hizo overfitting)
- Remover stopwords antes de lematizar hace que el entrenamiento y la inferencia sean más rápidos, aunque disminuye ligeramente el performance, a pesar de que técnicamente en el paso de lematizar también se eliminan stopwords al utilizar los allowed postags
- El utilizar spacy y lematizar hace que el entrenamiento y la inferencia sean lentos, por lo que si se requiere un modelo con el mejor tiempo de entrenamiento e inferencia se puede utilizar únicamente el remover stopwords (el 9no mejor modelo, con F1=94.93%), el cual obtuvo únicamente 0.0049 menos de F1 respecto al mejor modelo (F1=95.42%), pero con un tiempo de entrenamiento de 0.09min vs 4.64min y un tiempo de inferencia de 2.9s vs 118s, por lo que el performance no se degradaría tanto y el modelo sería mucho más rápido
- El aplicar cualquier nivel de ngrams parece disminuir el performance de los modelos en general
- Se realizaron pruebas con diferentes modelos de spacy y, como era de esperar, los modelos más pesados fueron en general los que mejor performance dieron, con la excepción de que en_core_web_lg superó ligeramente a en_core_web_trf en la tabla de resultados (en_core_web_lg > en_core_web_trf > en_core_web_md > en_core_web_sm); además, el tiempo de entrenamiento e inferencia del trf fue bastante más lento que el de los otros

# Guardar resultados
Dado que el entrenamiento de los 30 modelos toma alrededor de 2h (tiempo medido en un servidor, no en computadora personal) se decidió guardar el resultado en un pickle para su uso posterior

In [25]:
# persist the full results table (including the trained Model objects in the 'model' column) as a pickle
df_results.to_pickle('data/model_results.pkl')

In [26]:
# reload the saved results; the 'model' column holds pickled __main__.Model instances,
# so the Model class must already be defined in this kernel before loading
# NOTE(review): unpickling can execute arbitrary code — only load files produced by this notebook
df_results = pd.read_pickle('data/model_results.pkl')

In [27]:
# display the reloaded results table (this time including the 'model' column)
df_results

Unnamed: 0,steps,ngrams,min_count,threshold,nlp_model,allowed_postags,model,train_min,evaluated,f1,precision,recall,accuracy,tp_rate,tn_rate,fp_rate,fn_rate,eval_sec
18,[lemmatization],,,,en_core_web_lg,"[NOUN, ADJ, VERB, ADV, PART]",<__main__.Model object at 0x7efc5c0030f0>,4.642231,2352,0.954245,0.950292,0.958231,0.920493,0.829082,0.091412,0.043367,0.036139,118.523124
19,[lemmatization],,,,en_core_web_trf,"[NOUN, ADJ, VERB, ADV, PART]",<__main__.Model object at 0x7efc5b358c18>,31.00459,2352,0.952708,0.950147,0.955283,0.917942,0.826531,0.091412,0.043367,0.03869,818.085331
20,"[remove_stopwords, lemmatization]",,,,en_core_web_lg,"[NOUN, ADJ, VERB, ADV, PART]",<__main__.Model object at 0x7efc5a6acef0>,3.246148,2352,0.952033,0.943533,0.960688,0.916241,0.831207,0.085034,0.049745,0.034014,84.307367
17,[lemmatization],,,,en_core_web_md,"[NOUN, ADJ, VERB, ADV, PART]",<__main__.Model object at 0x7efc58716d30>,4.674562,2352,0.951166,0.95,0.952334,0.915391,0.82398,0.091412,0.043367,0.041241,121.869415
29,"[remove_stopwords, lemmatization, ngram]",2.0,7.0,25.0,en_core_web_lg,"[NOUN, ADJ, VERB, ADV]",<__main__.Model object at 0x7efb868f2c88>,3.313224,2352,0.950629,0.936576,0.965111,0.913265,0.835034,0.078231,0.056548,0.030187,84.878537
21,"[remove_stopwords, lemmatization]",,,,en_core_web_trf,"[NOUN, ADJ, VERB, ADV, PART]",<__main__.Model object at 0x7efb864a4b70>,20.588683,2352,0.949988,0.943314,0.956757,0.91284,0.827806,0.085034,0.049745,0.037415,511.524469
22,"[remove_stopwords, lemmatization, ngram]",2.0,5.0,20.0,en_core_web_sm,"[NOUN, ADJ, VERB, ADV, PART]",<__main__.Model object at 0x7efb860a6940>,3.067748,2352,0.949951,0.939452,0.960688,0.912415,0.831207,0.081207,0.053571,0.034014,78.328459
28,"[remove_stopwords, lemmatization, ngram]",3.0,7.0,25.0,en_core_web_lg,"[NOUN, ADJ, VERB, ADV, PART]",<__main__.Model object at 0x7efb83dcc0f0>,3.340769,2352,0.949602,0.932292,0.967568,0.911139,0.83716,0.07398,0.060799,0.028061,88.051997
0,[remove_stopwords],,,,,,<__main__.Model object at 0x7efb839a12b0>,0.093824,2352,0.949318,0.941518,0.957248,0.911565,0.828231,0.083333,0.051446,0.03699,2.873141
10,"[remove_stopwords, ngram]",2.0,7.0,30.0,,,<__main__.Model object at 0x7efb83428438>,0.166183,2352,0.949071,0.932638,0.966093,0.910289,0.835884,0.074405,0.060374,0.029337,3.450147


## Postags disponibles en spacy
Estos son los postags que incluye spacy

POS|DESCRIPTION|EXAMPLES
---|---|---
ADJ|adjective|*big, old, green, incomprehensible, first*
ADP|adposition|*in, to, during*
ADV|adverb|*very, tomorrow, down, where, there*
AUX|auxiliary|*is, has (done), will (do), should (do)*
CONJ|conjunction|*and, or, but*
CCONJ|coordinating conjunction|*and, or, but*
DET|determiner|*a, an, the*
INTJ|interjection|*psst, ouch, bravo, hello*
NOUN|noun|*girl, cat, tree, air, beauty*
NUM|numeral|*1, 2017, one, seventy-seven, IV, MMXIV*
PART|particle|*’s, not*
PRON|pronoun|*I, you, he, she, myself, themselves, somebody*
PROPN|proper noun|*Mary, John, London, NATO, HBO*
PUNCT|punctuation|*., (, ), ?*
SCONJ|subordinating conjunction|*if, while, that*
SYM|symbol|*$, %, §, ©, +, −, ×, ÷, =, :)*
VERB|verb|*run, runs, running, eat, ate, eating*
X|other|*sfpksdpsxmsa*
SPACE|space|