## Imports

In [162]:
import re
import nltk, enum, spacy
import pandas as pd
import numpy as np
nltk.download('stopwords')

from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from spacy.lang.en import English
from spacy.tokens import Doc
from datetime import datetime
from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/kegarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Cargar modelos de spacy
Se agregó como hiperparámetro el modelo de spacy, por lo que se cargaron 4 modelos diferentes

In [79]:
STOPWORDS = set(stopwords.words('english'))
ALLOWED_POSTAGS = ['NOUN', 'ADJ', 'VERB', 'ADV', 'PART']
NLP_MODELS = {
    'en_core_web_trf': spacy.load('en_core_web_trf'),    
    'en_core_web_sm': spacy.load('en_core_web_sm'),
    'en_core_web_md': spacy.load('en_core_web_md'),
    'en_core_web_lg': spacy.load('en_core_web_lg')
}

class Sentiments(enum.Enum):
    POS = 'POS'
    NEG = 'NEG'

## Model class

In [161]:
class Model:
    # not necesarry but just as a 'fyi'
    raw_data = pd.DataFrame() # constructor
    data_classes = {
        'POS': {
            'sentences': [] # array of strings
            , 'words': [] # array of arrays, each array contains each sentence splitted
            , 'words_without_stopwords': [] # same as words but without stopwords
            , 'words_1d': [] # 1d array of words
            , 'lemma': []
            , 'bow': None
            , 'ggram': None
        }
        , 'NEG': {}
    }
    stopwords = STOPWORDS # default if not given
    allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
    
    def __init__(self, df, steps, nlp_model = 'en_core_web_lg', stopwords=STOPWORDS, ngrams=2, min_count=5, threshold=10, allowed_postags=ALLOWED_POSTAGS, debug = False):
        self.stopwords = stopwords
        self.raw_data = df
        self.ngram = {'min_count': min_count, 'threshold': threshold, 'ngrams': ngrams}
        self.steps = steps
        self.allowed_postags = allowed_postags
        self.debug = debug
        if nlp_model not in NLP_MODELS:
            nlp_model = 'en_core_web_lg'
        self.nlp = NLP_MODELS[nlp_model]
    
    
    def fit(self):
        '''
        train model using all parameters
        '''
        # get from raw data a df with data_classes = {'POS':{'sentences':[.. , ..]}, 'NEG':{'sentences':[.. , ..]}}
        self.data_classes = {sentiment.value: {'sentences': self.raw_data[self.raw_data['sentiment'] == sentiment.value]['review'].values.tolist()} for sentiment in Sentiments}
        
        # iterate for each sentiment class
        for sentiment in self.data_classes.keys():
            print(f'{datetime.now()} {sentiment} class start')
            
            # get class percentage (ie: percentage = POS/total sentences)
            percentage = len(self.data_classes[sentiment]['sentences'])/len(self.raw_data.index)
            self.data_classes[sentiment]['percentage'] = percentage
            
            # split sentences into array of words
            print(f'{datetime.now()}   sentences_as_words')
            sentences = self.data_classes[sentiment]['sentences']
            result = self.sentences2words(sentences)
            
            if (self.debug):
                self.data_classes[sentiment]['sentences_as_words'] = result
            
            # iterate the pipeline
            for step in self.steps:
                print(f'{datetime.now()}   {step}')
                
                if step == 'remove_stopwords':
                    result = self.remove_stopwords(result)
                    if (self.debug):
                        self.data_classes[sentiment]['words_without_stopwords'] = result
                
                elif step == 'lemmatization':
                    result = self.lemmatization(result)
                    if (self.debug):
                        self.data_classes[sentiment]['lemmas'] = result
                
                elif step == 'ngram':
                    # train ngram model
                    ngram_model = self.train_ngrams(result, ngrams=self.ngram['ngrams'], min_count=self.ngram['min_count'], threshold=self.ngram['threshold'])
                    
                    if len(ngram_model)>0:
                        self.data_classes[sentiment]['ngram_model'] = ngram_model
                        
                        # apply ngram model
                        result = self.create_ngrams(ngram_model, result)
                        if (self.debug):
                            self.data_classes[sentiment]['ngrams'] = result
                    else:
                        print(f'{datetime.now()} ngram not done: {self.ngram["ngram"]}')
                
                else:
                    print(f'{datetime.now()} instruction not found: {step}')
            
            # save all words per sentiment class
            words = self.array2dto1d(result)
            self.data_classes[sentiment]['words'] = words
            
            # if not debug remove sentences 
            if (not self.debug):
                del self.data_classes[sentiment]['sentences']
            
        print(f'{datetime.now()} probs')
        
        # build word dictionary of all sentiment classes
        all_words = []
        for sentiment in self.data_classes.keys():
            all_words.append(self.data_classes[sentiment]['words'])
        self.dictionary = Dictionary(all_words)
        
        # calculate word probabilities for each sentiment class
        for sentiment in self.data_classes.keys():
            # calculate bag of words
            self.data_classes[sentiment]['bow'] = self.dictionary.doc2bow(self.data_classes[sentiment]['words'])
            
            # calculate probability of each word
            self.data_classes[sentiment]['total_length'] = len(self.data_classes[sentiment]['words']) + len(self.dictionary)
            word_probs = defaultdict(lambda: np.log(1/self.data_classes[sentiment]['total_length'])) # default value
            for id, count in self.data_classes[sentiment]['bow']:
                word_probs[self.dictionary[id]] = np.log((count + 1)/self.data_classes[sentiment]['total_length']) # {'word': prob}
            self.data_classes[sentiment]['word_probs'] = word_probs

            # if not debug, remove words
            if (not self.debug):
                del self.data_classes[sentiment]['words']
                del self.data_classes[sentiment]['bow']
        print(f'{datetime.now()} end')
        
    
    def predict(self, sentence, debug=False):
        '''
        predict sentiment class of one sentence
        '''
        sentences = [sentence]
        probs = {}
        selected = None
        current_value_selected = float('-inf')
        
        # iterate for each sentiment class
        for sentiment in self.data_classes.keys():
            result = self.sentences2words(sentences)
            
            # apply pipeline to sentence
            for step in self.steps:
                if debug:
                    print(f'{datetime.now()} {step}')
                if step == 'remove_stopwords':
                    result = self.remove_stopwords(result)
                elif step == 'lemmatization':
                    result = self.lemmatization(result)
                elif step == 'ngram':
                    if 'ngram_model' in self.data_classes[sentiment] and len(self.data_classes[sentiment]['ngram_model'])>0:                    
                        result = self.create_ngrams(self.data_classes[sentiment]['ngram_model'], result)
                    else:
                        print(f'no ngram model found for {sentiment}')
                else:
                    print(f'instruction not found: {step}')
                if debug:
                    print(f'{datetime.now()} {result}')
            
            # calculate sentiment probability
            prob_values = []
            for one_row in result: # remember we added the sentence to an array
                for word in one_row:
                    prob_values.append(self.data_classes[sentiment]['word_probs'][word])
            prob_values.append(np.log(self.data_classes[sentiment]['percentage']))
            probs[sentiment] = {'prob': sum(prob_values), 'probs': prob_values}
            if (probs[sentiment]['prob'] > current_value_selected):
                current_value_selected = probs[sentiment]['prob'] 
                selected = sentiment
        probs['selected'] = selected
        return probs           
    
    def sentences2words(self, sentences):
        """
        receives a list of strings (sentences) ['hello world', 'test, sentence!'] 
        and returns for each sentence a split of its words: [['hello','world'], ['test','sentence']]
        using gensim simple_preprocess function
        """
        words = []
        for sentence in sentences:
            words.append(simple_preprocess(sentence, deacc=True))
            # alternative:
            #words.append([i.strip() for i in re.split(',| |_|-|!|\.|;|:', sentence) if len(i.strip())>0])
        return words
        
    def remove_stopwords(self, list_of_list_of_words):
        """
        receives a list of list of words [['abc', 'abc', ...], ...]
        """
        words = []
        for sentence_as_words in list_of_list_of_words:
            words.append([word for word in sentence_as_words if word not in self.stopwords])
        return words
            
    def lemmatization(self, list_of_list_of_words):
        """
        receives a list of list of words [['swimming','after','playing']]
        and returns the same list with each words lemma: [['swim','after','play']]
        """
        words = []        
        for sentence_as_words in list_of_list_of_words:
            doc = self.nlp(' '.join(sentence_as_words))
            words.append([token.lemma_ for token in doc if token.pos_ in self.allowed_postags ])
        return words
    
    def array2dto1d(self, array2d):
        """
        receives a list of list of words [['hello','world'],['test']] 
        and returns in a single list ['hello','world','test']
        """
        result = []
        for array1d in array2d:
            result.extend(array1d)
        return result
            
    def train_ngrams(self, list_of_list_of_words, ngrams=2, min_count=5, threshold=10):
        if ngrams < 2:
            ngrams = 2
        result = list_of_list_of_words
        ngram_models = []
        for i in range(ngrams-1):
            ngram_phraser = Phrases(result, min_count=min_count, threshold=threshold)
            ngram_model = Phraser(ngram_phraser)
            ngram_models.append(ngram_model)
            result = list(ngram_model[result])
            
        return ngram_models
    
    def create_ngrams(self, ngram_model_array, list_of_list_of_words):
        """ngram_model = []"""
        result = list_of_list_of_words
        for ngram_model in ngram_model_array:
            result = list(ngram_model[result])
        return result
        
        #dictionary.doc2idx(['abysmal', 'abuse'])
        

## Función para obtener las métricas de un modelo
Obtiene todas las métricas al evaluar sobre un dataframe. El dataframe debe tener las columnas review y sentiment. Calcula:
- **evaluated**: La cantidad de registros evaluados
- **tp_rate**: Ratio de Verdaderos Positivos
- **tn_rate**: Ratio de Verdaderos Negativos
- **fp_rate**: Ratio de Falsos Positivos
- **fn_rate**: Ratio de Falsos Negativos
- **accuracy**: (tp + tn)/(tp+fp+fn+tn)
- **precision**: tp / (tp + fp)
- **recall**: tp / (tp + fn)
- **f1**: (2 * precision * recall)/(precision + recall)

In [101]:
def get_metrics(model, df):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    predicted_list = []
    for index, row in df.iterrows():
        review = row['review']
        original = row['sentiment']
        
        # predict
        predicted = model.predict(review)['selected']
        predicted_list.append(predicted)
    
        # calculate metrics
        if predicted==Sentiments.POS.value and original==Sentiments.POS.value:
            tp += 1
        elif predicted==Sentiments.NEG.value and original==Sentiments.NEG.value:
            tn += 1
        elif predicted==Sentiments.POS.value and original==Sentiments.NEG.value:
            fp += 1
        elif predicted==Sentiments.NEG.value and original==Sentiments.POS.value:
            fn += 1
    
    # calculate final metrics
    accuracy = (tp + tn)/(tp+fp+fn+tn) if (tp+fp+fn+tn)>0 else 0
    precision = tp / (tp + fp) if (tp+fp) > 0 else -1
    recall = tp / (tp + fn) if (tp + fn) > 0 else -1
    f1 = 2*precision*recall/(precision + recall) if (precision + recall > 0) else -1
    
    return {
        'evaluated':len(predicted_list),
        'tp_rate':tp/len(predicted_list),
        'tn_rate':tn/len(predicted_list),
        'fp_rate':fp/len(predicted_list),
        'fn_rate':fn/len(predicted_list),
        'accuracy':accuracy,
        'precision':precision,
        'recall':recall,
        'f1':f1,
        #'results':predicted_list
    }

In [120]:
def print_metrics(model_metrics):
    print('    Evaluados: '+'%.0f'%model_metrics['evaluated'])
    
    # confusion matrix
    print('      TP Rate: '+'%.4f'%model_metrics['tp_rate'] + ' (%.0f)'%(model_metrics["evaluated"]*model_metrics["tp_rate"]))
    print('      FP Rate: '+'%.4f'%model_metrics['fp_rate'] + ' (%.0f)'%(model_metrics["evaluated"]*model_metrics["fp_rate"]))
    print('      TN Rate: '+'%.4f'%model_metrics['tn_rate'] + ' (%.0f)'%(model_metrics["evaluated"]*model_metrics["tn_rate"]))
    print('      FN Rate: '+'%.4f'%model_metrics['fn_rate'] + ' (%.0f)'%(model_metrics["evaluated"]*model_metrics["fn_rate"]))
    
    # calculated metrics
    print('    Accuracy: '+'%.4f'%model_metrics['accuracy'])
    print('    Precision: '+'%.4f'%model_metrics['precision'])
    print('    Recall: '+'%.4f'%model_metrics['recall'])
    print('    F1: '+'%.4f'%model_metrics['f1'])

## Carga de datos para entrenar, validar y probar el modelo

In [83]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
validation = pd.read_csv('data/validation.csv')

In [84]:
train

Unnamed: 0,rating,review,sentiment
0,50,This is one of the best hotels I've ever staye...,POS
1,50,Everything about this hotel was awesome. The s...,POS
2,50,Our tour group stayed here for two nights. Th...,POS
3,50,Excellent service at Porta Hotel Antigua. From...,POS
4,50,I almost always stay at Hotel Antigua when I t...,POS
...,...,...,...
10972,50,I was there with a Belize delegation of about ...,POS
10973,40,Last week I stayed at the Camino Real in Antig...,POS
10974,50,My boyfriend was in Guate on business and we d...,POS
10975,40,I stayed at Camino Real Antigua for a conferen...,POS


In [85]:
test

Unnamed: 0,rating,review,sentiment
0,50,I would definitely stay here again in Antigua....,POS
1,40,"Great location, in the heart of historic centr...",POS
2,50,"Not only the place is nice, clean and in an ex...",POS
3,50,We spent two nights and I wish we could have s...,POS
4,50,I just recently returned from Antigua and my s...,POS
...,...,...,...
2348,50,We didn't have much of a plan upon arriving to...,POS
2349,40,This hotel was very close to the parque centra...,POS
2350,50,"good experience i highly recommend, the food...",POS
2351,50,Centrally located....12 small rooms. Would vou...,POS


In [86]:
validation

Unnamed: 0,rating,review,sentiment
0,50,Cucuruchos is a great place to spend your time...,POS
1,30,We booked this place through Booking for one n...,NEG
2,30,An odd mix of positives and negatives : +ves h...,NEG
3,30,We had been recommended this hostel because of...,NEG
4,20,I knew this place is right underneath the Sky ...,NEG
...,...,...,...
2347,50,Very attractive rooms and grounds. The outside...,POS
2348,50,SIMPLY - Very beautiful - extremely clean - qu...,POS
2349,50,We stayed here for 3 nights and 1 night in the...,POS
2350,40,This is a very nice boutique hotel. Staff is ...,POS


## Ejemplo modelo simple
Se entrena un modelo simple para hacer pruebas del flujo completo

In [116]:
model_test = Model(
    df = train
    , steps = ['remove_stopwords','ngram']
    , nlp_model = 'en_core_web_sm'
    , ngrams = 2
    , min_count = 5
    , threshold = 10
    , allowed_postags = ALLOWED_POSTAGS
    , debug = False
)
model_test.fit()

2022-07-03 00:58:41.779274 POS class start
2022-07-03 00:58:41.779398   sentences_as_words
2022-07-03 00:58:45.619335   remove_stopwords
2022-07-03 00:58:45.869781   ngram
2022-07-03 00:58:49.411947 NEG class start
2022-07-03 00:58:49.412734   sentences_as_words
2022-07-03 00:58:50.064902   remove_stopwords
2022-07-03 00:58:50.106881   ngram
2022-07-03 00:58:50.752851 probs
2022-07-03 00:58:51.532143 end


**Sanity check** de un caso negativo y uno positivo

In [88]:
result = model_test.predict("the hotel was dirty and noisy")
print(result)
print(f'>> Selected class {result["selected"]}')

{'POS': {'prob': -21.14690692178164, 'probs': [-3.915511649867557, -9.478052593473269, -7.623772767263879, -0.1295699111769321]}, 'NEG': {'prob': -19.56974651206995, 'probs': [-4.28513024399396, -6.872302738240207, -6.304693302624039, -2.1076202272117475]}, 'selected': 'NEG'}
>> Selected class NEG


In [89]:
result = model_test.predict("the hotel was very clean and I love it! :)")
print(result)
print(f'>> Selected class {result["selected"]}')

{'POS': {'prob': -15.605211573416558, 'probs': [-3.915511649867557, -4.785817268499889, -6.77431274387218, -0.1295699111769321]}, 'NEG': {'prob': -20.27825475622301, 'probs': [-4.28513024399396, -5.520297450598949, -8.365206834418355, -2.1076202272117475]}, 'selected': 'POS'}
>> Selected class POS


Probar con la data de validación para **obtener las métricas**

In [121]:
model_metrics = get_metrics(model=model_test, df=validation)
print_metrics(model_metrics)

    Evaluados: 2352
      TP Rate: 0.8474 (1993)
      FP Rate: 0.0753 (177)
      TN Rate: 0.0595 (140)
      FN Rate: 0.0179 (42)
    Accuracy: 0.9069
    Precision: 0.9184
    Recall: 0.9794
    F1: 0.9479


## Definir todos los modelos a entrenar
Se creó un dataframe para incluir todos los modelos con los que se desea entrenar para hacer pruebas

In [163]:
models_to_train = []

# ###################################
# solo stopwords
# ###################################
models_to_train.append({
    'steps': ['remove_stopwords']
    , 'ngrams': None, 'min_count': None, 'threshold': None
    , 'nlp_model': None, 'allowed_postags': None
})

# ###################################
# solo ngrams
# ###################################
models_to_train.append({
    'steps': ['ngram']
    , 'ngrams': 2, 'min_count': 7, 'threshold': 20
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['ngram']
    , 'ngrams': 2, 'min_count': 5, 'threshold': 10
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['ngram']
    , 'ngrams': 2, 'min_count': 10, 'threshold': 50
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['ngram']
    , 'ngrams': 3, 'min_count': 7, 'threshold': 20
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['ngram']
    , 'ngrams': 4, 'min_count': 7, 'threshold': 20
    , 'nlp_model': None, 'allowed_postags': None
})

# ###################################
# stopwords y bigramas
# variando min_count y threshold
# ###################################
models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 2, 'min_count': 1, 'threshold': 5
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 2, 'min_count': 3, 'threshold': 5
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 2, 'min_count': 5, 'threshold': 10
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 2, 'min_count': 5, 'threshold': 20
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 2, 'min_count': 7, 'threshold': 30
    , 'nlp_model': None, 'allowed_postags': None
})

# ###################################
# modelos con stopwords y trigramas
# variando min_count y threshold
# ###################################
models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 3, 'min_count': 5, 'threshold': 20
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 4, 'min_count': 5, 'threshold': 20
    , 'nlp_model': None, 'allowed_postags': None
})


# ###################################
# solo lematizando
# variando el modelo de spacy
# ###################################
models_to_train.append({
    'steps': ['lemmatization']
    , 'ngrams': None, 'min_count': None, 'threshold': None
    , 'nlp_model': 'en_core_web_sm', 'allowed_postags': ALLOWED_POSTAGS
})

models_to_train.append({
    'steps': ['lemmatization']
    , 'ngrams': None, 'min_count': None, 'threshold': None
    , 'nlp_model': 'en_core_web_mg', 'allowed_postags': ALLOWED_POSTAGS
})

models_to_train.append({
    'steps': ['lemmatization']
    , 'ngrams': None, 'min_count': None, 'threshold': None
    , 'nlp_model': 'en_core_web_lg', 'allowed_postags': ALLOWED_POSTAGS
})

models_to_train.append({
    'steps': ['lemmatization']
    , 'ngrams': None, 'min_count': None, 'threshold': None
    , 'nlp_model': 'en_core_web_trf', 'allowed_postags': ALLOWED_POSTAGS
})

# ###################################
# lematizacion y con ngrams
# variando el modelo de spacy
# ###################################
models_to_train.append({
    'steps': ['remove_stopwords','lemmatization','ngram']
    , 'ngrams': 2, 'min_count': 5, 'threshold': 20
    , 'nlp_model': 'en_core_web_sm', 'allowed_postags': ALLOWED_POSTAGS
})

models_to_train.append({
    'steps': ['remove_stopwords','lemmatization','ngram']
    , 'ngrams': 2, 'min_count': 5, 'threshold': 20
    , 'nlp_model': 'en_core_web_md', 'allowed_postags': ALLOWED_POSTAGS
})

models_to_train.append({
    'steps': ['remove_stopwords','lemmatization','ngram']
    , 'ngrams': 2, 'min_count': 5, 'threshold': 20
    , 'nlp_model': 'en_core_web_lg', 'allowed_postags': ALLOWED_POSTAGS
})

models_to_train.append({
    'steps': ['remove_stopwords','lemmatization','ngram']
    , 'ngrams': 2, 'min_count': 5, 'threshold': 20
    , 'nlp_model': 'en_core_web_trf', 'allowed_postags': ALLOWED_POSTAGS
})

models_to_train.append({
    'steps': ['remove_stopwords','lemmatization','ngram']
    , 'ngrams': 3, 'min_count': 5, 'threshold': 25
    , 'nlp_model': 'en_core_web_lg', 'allowed_postags': ALLOWED_POSTAGS
})

models_to_train.append({
    'steps': ['remove_stopwords','lemmatization','ngram']
    , 'ngrams': 3, 'min_count': 5, 'threshold': 25
    , 'nlp_model': 'en_core_web_lg', 'allowed_postags': ALLOWED_POSTAGS
})

models_to_train.append({
    'steps': ['remove_stopwords','lemmatization','ngram']
    , 'ngrams': 3, 'min_count': 7, 'threshold': 25
    , 'nlp_model': 'en_core_web_lg', 'allowed_postags': ALLOWED_POSTAGS
})

# ###################################
# lematizacion y con ngrams
# variando el listado de postags
# anteriormente se habia agregado por
# default 'PART', por lo que se prueba
# quitarlo
# ###################################
models_to_train.append({
    'steps': ['remove_stopwords','lemmatization','ngram']
    , 'ngrams': 2, 'min_count': 7, 'threshold': 25
    , 'nlp_model': 'en_core_web_lg', 'allowed_postags': ['NOUN', 'ADJ', 'VERB', 'ADV']
})

## Entrenar todos los modelos
Se obtienen todas las métricas de los modelos

In [110]:
model_results = []
for model_to_train in models_to_train:
    print('*******************************************')
    print(f'Start training model {len(model_results)+1}/{len(models_to_train)}')
    print('*******************************************')
    print(model_to_train)
    print('')
    
    # build model
    model = Model(
        df = train
        , steps = model_to_train['steps']
        , nlp_model = model_to_train['nlp_model']
        , ngrams = model_to_train['ngrams']
        , min_count = model_to_train['min_count']
        , threshold = model_to_train['threshold']
        , allowed_postags = model_to_train['allowed_postags']
        , debug = False
    )
    
    # train model
    model.fit()
    model_to_train['model'] = model
    
    # test model with validation dataframe
    print('')
    print(f'{datetime.now()} validation metrics')
    model_metrics = get_metrics(model=model, df=validation)
    print_metrics(model_metrics)
    
    # append results
    model_results.append({**model_to_train, **model_metrics})
    print('')


*******************************************
Start training model 1/10
*******************************************
{'steps': ['remove_stopwords'], 'ngrams': None, 'min_count': None, 'threshold': None, 'nlp_model': None, 'allowed_postags': None, 'model': <__main__.Model object at 0x7f3c707b2588>}

2022-07-03 00:46:25.590672 POS class start
2022-07-03 00:46:25.590768   sentences_as_words
2022-07-03 00:46:29.393514   remove_stopwords
2022-07-03 00:46:29.608382 NEG class start
2022-07-03 00:46:29.609053   sentences_as_words
2022-07-03 00:46:30.256721   remove_stopwords
2022-07-03 00:46:30.293597 probs
2022-07-03 00:46:31.014803 end

2022-07-03 00:46:31.032593 validation metrics
  evaluados: 2352
  tp_rate: 0.8406 (1977)
  tn_rate: 0.0753 (177)
  fp_rate: 0.0595 (140)
  fn_rate: 0.0247 (58)
  accuracy: 0.9158
  precision: 0.9339
  recall: 0.9715
  f1: 0.9523

*******************************************
Start training model 2/10
*******************************************
{'steps': ['ngram'],

2022-07-03 00:48:52.274110 NEG class start
2022-07-03 00:48:52.274897   sentences_as_words
2022-07-03 00:48:52.922402   remove_stopwords
2022-07-03 00:48:52.982278   ngram
2022-07-03 00:48:54.906532 probs
2022-07-03 00:48:55.675279 end

2022-07-03 00:48:55.699022 validation metrics
  evaluados: 2352
  tp_rate: 0.8495 (1998)
  tn_rate: 0.0574 (135)
  fp_rate: 0.0774 (182)
  fn_rate: 0.0157 (37)
  accuracy: 0.9069
  precision: 0.9165
  recall: 0.9818
  f1: 0.9480



In [122]:
df_results = pd.DataFrame(model_results)
df_results.drop(['model'],axis=1)

Unnamed: 0,steps,ngrams,min_count,threshold,nlp_model,allowed_postags,evaluated,tp_rate,tn_rate,fp_rate,fn_rate,accuracy,precision,recall,f1
0,[remove_stopwords],,,,,,2352,0.840561,0.075255,0.059524,0.02466,0.915816,0.933869,0.971499,0.952312
1,[ngram],2.0,10.0,30.0,,,2352,0.835034,0.071854,0.062925,0.030187,0.906888,0.929924,0.965111,0.947191
2,[ngram],3.0,10.0,30.0,,,2352,0.833333,0.071429,0.06335,0.031888,0.904762,0.92935,0.963145,0.945946
3,"[remove_stopwords, ngram]",2.0,1.0,5.0,,,2352,0.855442,0.044643,0.090136,0.009779,0.900085,0.904676,0.988698,0.944823
4,"[remove_stopwords, ngram]",2.0,3.0,5.0,,,2352,0.852041,0.054847,0.079932,0.01318,0.906888,0.914234,0.984767,0.94819
5,"[remove_stopwords, ngram]",2.0,5.0,10.0,,,2352,0.847364,0.059524,0.075255,0.017857,0.906888,0.918433,0.979361,0.947919
6,"[remove_stopwords, ngram]",2.0,5.0,20.0,,,2352,0.845663,0.064626,0.070153,0.019558,0.910289,0.923398,0.977396,0.94963
7,"[remove_stopwords, ngram]",2.0,7.0,30.0,,,2352,0.843112,0.068027,0.066752,0.022109,0.911139,0.926636,0.974447,0.94994
8,"[remove_stopwords, ngram]",3.0,5.0,20.0,,,2352,0.848639,0.058673,0.076105,0.016582,0.907313,0.917701,0.980835,0.948219
9,"[remove_stopwords, ngram]",4.0,5.0,20.0,,,2352,0.84949,0.057398,0.077381,0.015731,0.906888,0.916514,0.981818,0.948043


## Buscar mejor modelo
Para el mejor modelo se seleccionó la métrica de F1

In [123]:
df_results = df_results.sort_values(by=['f1'], ascending=False)
df_results.drop(['model'],axis=1)

Unnamed: 0,steps,ngrams,min_count,threshold,nlp_model,allowed_postags,evaluated,tp_rate,tn_rate,fp_rate,fn_rate,accuracy,precision,recall,f1
0,[remove_stopwords],,,,,,2352,0.840561,0.075255,0.059524,0.02466,0.915816,0.933869,0.971499,0.952312
7,"[remove_stopwords, ngram]",2.0,7.0,30.0,,,2352,0.843112,0.068027,0.066752,0.022109,0.911139,0.926636,0.974447,0.94994
6,"[remove_stopwords, ngram]",2.0,5.0,20.0,,,2352,0.845663,0.064626,0.070153,0.019558,0.910289,0.923398,0.977396,0.94963
8,"[remove_stopwords, ngram]",3.0,5.0,20.0,,,2352,0.848639,0.058673,0.076105,0.016582,0.907313,0.917701,0.980835,0.948219
4,"[remove_stopwords, ngram]",2.0,3.0,5.0,,,2352,0.852041,0.054847,0.079932,0.01318,0.906888,0.914234,0.984767,0.94819
9,"[remove_stopwords, ngram]",4.0,5.0,20.0,,,2352,0.84949,0.057398,0.077381,0.015731,0.906888,0.916514,0.981818,0.948043
5,"[remove_stopwords, ngram]",2.0,5.0,10.0,,,2352,0.847364,0.059524,0.075255,0.017857,0.906888,0.918433,0.979361,0.947919
1,[ngram],2.0,10.0,30.0,,,2352,0.835034,0.071854,0.062925,0.030187,0.906888,0.929924,0.965111,0.947191
2,[ngram],3.0,10.0,30.0,,,2352,0.833333,0.071429,0.06335,0.031888,0.904762,0.92935,0.963145,0.945946
3,"[remove_stopwords, ngram]",2.0,1.0,5.0,,,2352,0.855442,0.044643,0.090136,0.009779,0.900085,0.904676,0.988698,0.944823


In [124]:
# model with best F1
df_results.head(1).drop(['model'],axis=1)

Unnamed: 0,steps,ngrams,min_count,threshold,nlp_model,allowed_postags,evaluated,tp_rate,tn_rate,fp_rate,fn_rate,accuracy,precision,recall,f1
0,[remove_stopwords],,,,,,2352,0.840561,0.075255,0.059524,0.02466,0.915816,0.933869,0.971499,0.952312


## Métricas en dataframe de Test
Una vez seleccionado el mejor modelo se calculan las métricas utilizando el dataframe de test

In [128]:
model=df_results.iloc[0]['model']

In [131]:
# calculate test dataframe metrics with best model
model_metrics = get_metrics(model=model, df=test)
print_metrics(model_metrics)

    Evaluados: 2353
      TP Rate: 0.8440 (1986)
      FP Rate: 0.0493 (116)
      TN Rate: 0.0752 (177)
      FN Rate: 0.0314 (74)
    Accuracy: 0.9193
    Precision: 0.9448
    Recall: 0.9641
    F1: 0.9543


## Postags disponibles en spacy
Estos son los postags que incluye spacy

POS|DESCRIPTION|EXAMPLES
---|---|---
ADJ|adjective|*big, old, green, incomprehensible, first*
ADP|adposition|*in, to, during*
ADV|adverb|*very, tomorrow, down, where, there*
AUX|auxiliary|*is, has (done), will (do), should (do)*
CONJ|conjunction|*and, or, but*
CCONJ|coordinating conjunction|*and, or, but*
DET|determiner|*a, an, the*
INTJ|interjection|*psst, ouch, bravo, hello*
NOUN|noun|*girl, cat, tree, air, beauty*
NUM|numeral|*1, 2017, one, seventy-seven, IV, MMXIV*
PART|particle|*’s, not,*
PRON|pronoun|*I, you, he, she, myself, themselves, somebody*
PROPN|proper noun|*Mary, John, London, NATO, HBO*
PUNCT|punctuation|*., (, ), ?*
SCONJ|subordinating conjunction|*if, while, that*
SYM|symbol|*$, %, §, ©, +, −, ×, ÷, =, :), *
VERB|verb|*run, runs, running, eat, ate, eating*
X|other|*sfpksdpsxmsa*
SPACE|space