## Imports

In [78]:
import nltk, enum, spacy
import pandas as pd
import numpy as np
nltk.download('stopwords')

from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from spacy.lang.en import English
from spacy.tokens import Doc
from datetime import datetime
from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/kegarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Cargar modelos de spacy
Se agregó como hiperparámetro el modelo de spacy, por lo que se cargaron 4 modelos diferentes

In [79]:
STOPWORDS = set(stopwords.words('english'))
ALLOWED_POSTAGS = ['NOUN', 'ADJ', 'VERB', 'ADV', 'PART']
NLP_MODELS = {
    'en_core_web_trf': spacy.load('en_core_web_trf'),    
    'en_core_web_sm': spacy.load('en_core_web_sm'),
    'en_core_web_md': spacy.load('en_core_web_md'),
    'en_core_web_lg': spacy.load('en_core_web_lg')
}

class Sentiments(enum.Enum):
    POS = 'POS'
    NEG = 'NEG'

## Model class

In [81]:
class Model:
    # not necesarry but just as a 'fyi'
    raw_data = pd.DataFrame() # constructor
    data_classes = {
        'POS': {
            'sentences': [] # array of strings
            , 'words': [] # array of arrays, each array contains each sentence splitted
            , 'words_without_stopwords': [] # same as words but without stopwords
            , 'words_1d': [] # 1d array of words
            , 'lemma': []
            , 'bow': None
            , 'ggram': None
        }
        , 'NEG': {}
    }
    stopwords = STOPWORDS # default if not given
    allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
    
    def __init__(self, df, steps, nlp_model = 'en_core_web_lg', stopwords=STOPWORDS, ngrams=2, min_count=5, threshold=10, allowed_postags=ALLOWED_POSTAGS, debug = False):
        self.stopwords = stopwords
        self.raw_data = df
        self.ngram = {'min_count': min_count, 'threshold': threshold, 'ngrams': ngrams}
        self.steps = steps
        self.allowed_postags = allowed_postags
        self.debug = debug
        if nlp_model not in NLP_MODELS:
            nlp_model = 'en_core_web_lg'
        self.nlp = NLP_MODELS[nlp_model]
    
    #def __repr__(self):
    #    return self.__str__()
    
    #def __str__(self):
    #    ','.join(self.steps)
    #    return "member of Test"
    
    def fit(self):
        # get from raw data a df with data_classes = {'POS':{'sentences':[.. , ..]}, 'NEG':{'sentences':[.. , ..]}}
        self.data_classes = {sentiment.value: {'sentences': self.raw_data[self.raw_data['sentiment'] == sentiment.value]['review'].values.tolist()} for sentiment in Sentiments}
        
        # iterate for each sentiment class
        for sentiment in self.data_classes.keys():
            print(f'{datetime.now()} {sentiment} class start')
            
            # get class percentage (ie: percentage = POS/total sentences)
            percentage = len(self.data_classes[sentiment]['sentences'])/len(self.raw_data.index)
            self.data_classes[sentiment]['percentage'] = percentage
            
            # split sentences into array of words
            print(f'{datetime.now()}   sentences_as_words')
            sentences = self.data_classes[sentiment]['sentences']
            result = self.sentences2words(sentences)
            
            if (self.debug):
                self.data_classes[sentiment]['sentences_as_words'] = result
            
            # iterate the pipeline
            for step in self.steps:
                print(f'{datetime.now()}   {step}')
                
                if step == 'remove_stopwords':
                    result = self.remove_stopwords(result)
                    if (self.debug):
                        self.data_classes[sentiment]['words_without_stopwords'] = result
                
                elif step == 'lemmatization':
                    result = self.lemmatization(result)
                    if (self.debug):
                        self.data_classes[sentiment]['lemmas'] = result
                
                elif step == 'ngram':
                    # train ngram model
                    ngram_model = self.train_ngrams(result, ngrams=self.ngram['ngrams'], min_count=self.ngram['min_count'], threshold=self.ngram['threshold'])
                    
                    if len(ngram_model)>0:
                        self.data_classes[sentiment]['ngram_model'] = ngram_model
                        
                        # apply ngram model
                        result = self.create_ngrams(ngram_model, result)
                        if (self.debug):
                            self.data_classes[sentiment]['ngrams'] = result
                    else:
                        print(f'{datetime.now()} ngram not done: {self.ngram["ngram"]}')
                
                else:
                    print(f'{datetime.now()} instruction not found: {step}')
            
            # save all words per sentiment class
            words = self.array2dto1d(result)
            self.data_classes[sentiment]['words'] = words
            
            # if not debug remove sentences 
            if (not self.debug):
                del self.data_classes[sentiment]['sentences']
            
        print(f'{datetime.now()} probs')
        
        # build word dictionary of all sentiment classes
        all_words = []
        for sentiment in self.data_classes.keys():
            all_words.append(self.data_classes[sentiment]['words'])
        self.dictionary = Dictionary(all_words)
        
        # calculate word probabilities for each sentiment class
        for sentiment in self.data_classes.keys():
            # calculate bag of words
            self.data_classes[sentiment]['bow'] = self.dictionary.doc2bow(self.data_classes[sentiment]['words'])
            
            # calculate probability of each word
            self.data_classes[sentiment]['total_length'] = len(self.data_classes[sentiment]['words']) + len(self.dictionary)
            word_probs = defaultdict(lambda: np.log(1/self.data_classes[sentiment]['total_length'])) # default value
            for id, count in self.data_classes[sentiment]['bow']:
                word_probs[self.dictionary[id]] = np.log((count + 1)/self.data_classes[sentiment]['total_length']) # {'word': prob}
            self.data_classes[sentiment]['word_probs'] = word_probs

            # if not debug, remove words
            if (not self.debug):
                del self.data_classes[sentiment]['words']
                del self.data_classes[sentiment]['bow']
        print(f'{datetime.now()} end')
        
    def predict_list(self, list_of_sentences, debug=False):
        results = []
        for sentence in list_of_sentences:
            results.append({'sentence': sentence, **self.predict(sentence, debug)})
        return results
    
    
    def predict(self, sentence, debug=False):
        sentences = [sentence]
        probs = {}
        selected = None
        current_value_selected = float('-inf')
        
        # iterate for each sentiment class
        for sentiment in self.data_classes.keys():
            result = self.sentences2words(sentences)
            
            # apply pipeline to sentence
            for step in self.steps:
                if debug:
                    print(f'{datetime.now()} {step}')
                if step == 'remove_stopwords':
                    result = self.remove_stopwords(result)
                elif step == 'lemmatization':
                    result = self.lemmatization(result)
                elif step == 'ngram':
                    if 'ngram_model' in self.data_classes[sentiment] and len(self.data_classes[sentiment]['ngram_model'])>0:                    
                        result = self.create_ngrams(self.data_classes[sentiment]['ngram_model'], result)
                    else:
                        print(f'no ngram model found for {sentiment}')
                else:
                    print(f'instruction not found: {step}')
                if debug:
                    print(f'{datetime.now()} {result}')
            
            # calculate sentiment probability
            prob_values = []
            for one_row in result: # remember we added the sentence to an array
                for word in one_row:
                    prob_values.append(self.data_classes[sentiment]['word_probs'][word])
            prob_values.append(np.log(self.data_classes[sentiment]['percentage']))
            probs[sentiment] = {'prob': sum(prob_values), 'probs': prob_values}
            if (probs[sentiment]['prob'] > current_value_selected):
                current_value_selected = probs[sentiment]['prob'] 
                selected = sentiment
        probs['selected'] = selected
        return probs           
    
    def sentences2words(self, sentences):
        """
        receives a list of strings (sentences) ['hello world', 'test, sentence!'] 
        and returns for each sentence a split of its words: [['hello','world'], ['test','sentence']]
        using gensim simple_preprocess function
        """
        words = []
        for sentence in sentences:
            words.append(simple_preprocess(sentence, deacc=True))
        return words
        
    def remove_stopwords(self, list_of_list_of_words):
        """
        receives a list of list of words [['abc', 'abc', ...], ...]
        """
        words = []
        for sentence_as_words in list_of_list_of_words:
            words.append([word for word in sentence_as_words if word not in self.stopwords])
        return words
            
    def lemmatization(self, list_of_list_of_words):
        """
        receives a list of list of words [['swimming','after','playing']]
        and returns the same list with each words lemma: [['swim','after','play']]
        """
        words = []        
        for sentence_as_words in list_of_list_of_words:
            doc = self.nlp(' '.join(sentence_as_words))
            words.append([token.lemma_ for token in doc if token.pos_ in self.allowed_postags ])
        return words
    
    def array2dto1d(self, array2d):
        """
        receives a list of list of words [['hello','world'],['test']] 
        and returns in a single list ['hello','world','test']
        """
        result = []
        for array1d in array2d:
            result.extend(array1d)
        return result
            
    def train_ngrams(self, list_of_list_of_words, ngrams=2, min_count=5, threshold=10):
        if ngrams < 2:
            ngrams = 2
        result = list_of_list_of_words
        ngram_models = []
        for i in range(ngrams-1):
            ngram_phraser = Phrases(result, min_count=min_count, threshold=threshold)
            ngram_model = Phraser(ngram_phraser)
            ngram_models.append(ngram_model)
            result = list(ngram_model[result])
            
        return ngram_models
    
    def create_ngrams(self, ngram_model_array, list_of_list_of_words):
        """ngram_model = []"""
        result = list_of_list_of_words
        for ngram_model in ngram_model_array:
            result = list(ngram_model[result])
        return result
        
        #dictionary.doc2idx(['abysmal', 'abuse'])
        

## Función para obtener las métricas de un modelo
Obtiene todas las métricas al evaluar sobre un dataframe. El dataframe debe tener las columnas review y sentiment. Calcula:
- **evaluated**: La cantidad de registros evaluados
- **tp_rate**: Ratio de Verdaderos Positivos
- **tn_rate**: Ratio de Verdaderos Negativos
- **fp_rate**: Ratio de Falsos Positivos
- **fn_rate**: Ratio de Falsos Negativos
- **accuracy**: (tp + tn)/(tp+fp+fn+tn)
- **precision**: tp / (tp + fp)
- **recall**: tp / (tp + fn)
- **f1**: (2 * precision * recall)/(precision + recall)

In [82]:
def get_metrics(model, df):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    predicted_list = []
    for index, row in df.iterrows():
        review = row['review']
        original = row['sentiment']
        
        # predict
        predicted = model.predict(review)['selected']
        predicted_list.append(predicted)
    
        # calculate metrics
        if predicted==Sentiments.POS.value and original==Sentiments.POS.value:
            tp += 1
        elif predicted==Sentiments.NEG.value and original==Sentiments.NEG.value:
            tn += 1
        elif predicted==Sentiments.POS.value and original==Sentiments.NEG.value:
            fp += 1
        elif predicted==Sentiments.NEG.value and original==Sentiments.POS.value:
            fn += 1
    
    # calculate final metrics
    accuracy = (tp + tn)/(tp+fp+fn+tn) if (tp+fp+fn+tn)>0 else 0
    precision = tp / (tp + fp) if (tp+fp) > 0 else -1
    recall = tp / (tp + fn) if (tp + fn) > 0 else -1
    f1 = 2*precision*recall/(precision + recall) if (precision + recall > 0) else -1
    
    return {
        'evaluated':len(predicted_list),
        'tp_rate':tp/len(predicted_list),
        'tn_rate':tn/len(predicted_list),
        'fp_rate':fp/len(predicted_list),
        'fn_rate':fn/len(predicted_list),
        'accuracy':accuracy,
        'precision':precision,
        'recall':recall,
        'f1':f1,
        'results':predicted_list
    }

## Carga de datos para entrenar, validar y probar el modelo

In [83]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
validation = pd.read_csv('data/validation.csv')

In [84]:
train

Unnamed: 0,rating,review,sentiment
0,50,This is one of the best hotels I've ever staye...,POS
1,50,Everything about this hotel was awesome. The s...,POS
2,50,Our tour group stayed here for two nights. Th...,POS
3,50,Excellent service at Porta Hotel Antigua. From...,POS
4,50,I almost always stay at Hotel Antigua when I t...,POS
...,...,...,...
10972,50,I was there with a Belize delegation of about ...,POS
10973,40,Last week I stayed at the Camino Real in Antig...,POS
10974,50,My boyfriend was in Guate on business and we d...,POS
10975,40,I stayed at Camino Real Antigua for a conferen...,POS


In [85]:
test

Unnamed: 0,rating,review,sentiment
0,50,I would definitely stay here again in Antigua....,POS
1,40,"Great location, in the heart of historic centr...",POS
2,50,"Not only the place is nice, clean and in an ex...",POS
3,50,We spent two nights and I wish we could have s...,POS
4,50,I just recently returned from Antigua and my s...,POS
...,...,...,...
2348,50,We didn't have much of a plan upon arriving to...,POS
2349,40,This hotel was very close to the parque centra...,POS
2350,50,"good experience i highly recommend, the food...",POS
2351,50,Centrally located....12 small rooms. Would vou...,POS


In [86]:
validation

Unnamed: 0,rating,review,sentiment
0,50,Cucuruchos is a great place to spend your time...,POS
1,30,We booked this place through Booking for one n...,NEG
2,30,An odd mix of positives and negatives : +ves h...,NEG
3,30,We had been recommended this hostel because of...,NEG
4,20,I knew this place is right underneath the Sky ...,NEG
...,...,...,...
2347,50,Very attractive rooms and grounds. The outside...,POS
2348,50,SIMPLY - Very beautiful - extremely clean - qu...,POS
2349,50,We stayed here for 3 nights and 1 night in the...,POS
2350,40,This is a very nice boutique hotel. Staff is ...,POS


## Ejemplo modelo simple
Se entrena un modelo simple para hacer pruebas del flujo completo

In [87]:
model_test = Model(
    df = train
    , steps = ['remove_stopwords','ngram']
    , nlp_model = 'en_core_web_sm'
    , ngrams = 2
    , min_count = 5
    , threshold = 10
    , allowed_postags = ALLOWED_POSTAGS
    , debug = False
)
model_test.fit()

2022-07-02 23:14:43.827264 POS class start
2022-07-02 23:14:43.827380   sentences_as_words
2022-07-02 23:14:47.634183   remove_stopwords
2022-07-02 23:14:47.834587   ngram
2022-07-02 23:14:51.280868 NEG class start
2022-07-02 23:14:51.281280   sentences_as_words
2022-07-02 23:14:51.928537   remove_stopwords
2022-07-02 23:14:51.963815   ngram
2022-07-02 23:14:52.597238 probs
2022-07-02 23:14:53.322333 end


**Sanity check** de un caso negativo y uno positivo

In [88]:
result = model_test.predict("the hotel was dirty and noisy")
print(result)
print(f'>> Selected class {result["selected"]}')

{'POS': {'prob': -21.14690692178164, 'probs': [-3.915511649867557, -9.478052593473269, -7.623772767263879, -0.1295699111769321]}, 'NEG': {'prob': -19.56974651206995, 'probs': [-4.28513024399396, -6.872302738240207, -6.304693302624039, -2.1076202272117475]}, 'selected': 'NEG'}
>> Selected class NEG


In [89]:
result = model_test.predict("the hotel was very clean and I love it! :)")
print(result)
print(f'>> Selected class {result["selected"]}')

{'POS': {'prob': -15.605211573416558, 'probs': [-3.915511649867557, -4.785817268499889, -6.77431274387218, -0.1295699111769321]}, 'NEG': {'prob': -20.27825475622301, 'probs': [-4.28513024399396, -5.520297450598949, -8.365206834418355, -2.1076202272117475]}, 'selected': 'POS'}
>> Selected class POS


Probar con la data de validación para **obtener las métricas**

In [90]:
model_metrics = get_metrics(model=model_test, df=validation)
print('evaluados: '+'%.0f'%model_metrics['evaluated'])
print('tp_rate: '+'%.4f'%model_metrics['tp_rate'] + ' (%.0f)'%(model_metrics["evaluated"]*model_metrics["tp_rate"]))
print('tn_rate: '+'%.4f'%model_metrics['tn_rate'] + ' (%.0f)'%(model_metrics["evaluated"]*model_metrics["tn_rate"]))
print('fp_rate: '+'%.4f'%model_metrics['fp_rate'] + ' (%.0f)'%(model_metrics["evaluated"]*model_metrics["fp_rate"]))
print('fn_rate: '+'%.4f'%model_metrics['fn_rate'] + ' (%.0f)'%(model_metrics["evaluated"]*model_metrics["fn_rate"]))
print('accuracy: '+'%.4f'%model_metrics['accuracy'])
print('precision: '+'%.4f'%model_metrics['precision'])
print('recall: '+'%.4f'%model_metrics['recall'])
print('f1: '+'%.4f'%model_metrics['f1'])

evaluados: 2352
tp_rate: 0.8474 (1993)
tn_rate: 0.0595 (140)
fp_rate: 0.0753 (177)
fn_rate: 0.0179 (42)
accuracy: 0.9069
precision: 0.9184
recall: 0.9794
f1: 0.9479


## Definir todos los modelos a entrenar
Se creó un dataframe para incluir todos los modelos con los que se desea entrenar para hacer pruebas

In [91]:
models_to_train = []

# modelos solo con stopwords y bigramas, variando los conteos de minimos y threshold
models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 2, 'min_count': 1, 'threshold': 5
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 2, 'min_count': 3, 'threshold': 5
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 2, 'min_count': 5, 'threshold': 10
    , 'nlp_model': None, 'allowed_postags': None
})

models_to_train.append({
    'steps': ['remove_stopwords','ngram']
    , 'ngrams': 2, 'min_count': 5, 'threshold': 20
    , 'nlp_model': None, 'allowed_postags': None
})

# modelos con stopwords y trigramas
#models_to_train.append({
#    'steps': ['remove_stopwords','ngram']
#    , 'ngrams': 2, 'min_count': 5, 'threshold': 20
#    , 'nlp_model': 'en_core_web_sm', 'allowed_postags': ALLOWED_POSTAGS
#})



In [92]:
model_results = []
for model_to_train in models_to_train:
    print('*******************************************')
    print(f'Start training model {len(model_results)+1}/{len(models_to_train)}')
    print('*******************************************')
    
    # build model
    model = Model(
        df = train
        , steps = model_to_train['steps']
        , nlp_model = model_to_train['nlp_model']
        , ngrams = model_to_train['ngrams']
        , min_count = model_to_train['min_count']
        , threshold = model_to_train['threshold']
        , allowed_postags = model_to_train['allowed_postags']
        , debug = False
    )
    
    # train model
    model.fit()
    model_to_train['model'] = model
    
    # test model with validation dataframe
    model_metrics = get_metrics(model=model, df=validation)
    print('Metrics:',model_metrics)
    
    # append results
    model_results.append({**model_to_train, **model_metrics})



*******************************************
Start training model 1/4
*******************************************
2022-07-02 23:15:48.832013 POS class start
2022-07-02 23:15:48.832105   sentences_as_words
2022-07-02 23:15:52.586789   remove_stopwords
2022-07-02 23:15:52.792113   ngram
2022-07-02 23:15:56.187040 NEG class start
2022-07-02 23:15:56.187260   sentences_as_words
2022-07-02 23:15:56.826407   remove_stopwords
2022-07-02 23:15:56.860788   ngram
2022-07-02 23:15:57.476022 probs
2022-07-02 23:15:58.379255 end
Metrics: {'evaluated': 2352, 'tp_rate': 0.8554421768707483, 'tn_rate': 0.044642857142857144, 'fp_rate': 0.09013605442176871, 'fn_rate': 0.00977891156462585, 'accuracy': 0.9000850340136054, 'precision': 0.9046762589928058, 'recall': 0.9886977886977887, 'f1': 0.9448227283399859, 'results': ['POS', 'NEG', 'NEG', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'POS', 

2022-07-02 23:16:05.915112   remove_stopwords
2022-07-02 23:16:06.135082   ngram
2022-07-02 23:16:09.563842 NEG class start
2022-07-02 23:16:09.564100   sentences_as_words
2022-07-02 23:16:10.219828   remove_stopwords
2022-07-02 23:16:10.254553   ngram
2022-07-02 23:16:10.887861 probs
2022-07-02 23:16:11.629407 end
Metrics: {'evaluated': 2352, 'tp_rate': 0.8520408163265306, 'tn_rate': 0.0548469387755102, 'fp_rate': 0.07993197278911565, 'fn_rate': 0.013180272108843538, 'accuracy': 0.9068877551020408, 'precision': 0.9142335766423357, 'recall': 0.9847665847665847, 'f1': 0.9481902058197302, 'results': ['POS', 'POS', 'NEG', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'P

2022-07-02 23:16:18.755800   remove_stopwords
2022-07-02 23:16:18.996009   ngram
2022-07-02 23:16:22.456080 NEG class start
2022-07-02 23:16:22.456286   sentences_as_words
2022-07-02 23:16:23.097450   remove_stopwords
2022-07-02 23:16:23.132875   ngram
2022-07-02 23:16:23.779405 probs
2022-07-02 23:16:24.524632 end
Metrics: {'evaluated': 2352, 'tp_rate': 0.8473639455782312, 'tn_rate': 0.05952380952380952, 'fp_rate': 0.07525510204081633, 'fn_rate': 0.017857142857142856, 'accuracy': 0.9068877551020408, 'precision': 0.9184331797235024, 'recall': 0.9793611793611794, 'f1': 0.9479191438763377, 'results': ['POS', 'POS', 'NEG', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'NEG', 'POS', 'POS', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', '

2022-07-02 23:16:31.677334   remove_stopwords
2022-07-02 23:16:31.895546   ngram
2022-07-02 23:16:35.591673 NEG class start
2022-07-02 23:16:35.591908   sentences_as_words
2022-07-02 23:16:36.235254   remove_stopwords
2022-07-02 23:16:36.271365   ngram
2022-07-02 23:16:36.922469 probs
2022-07-02 23:16:37.674948 end
Metrics: {'evaluated': 2352, 'tp_rate': 0.8456632653061225, 'tn_rate': 0.06462585034013606, 'fp_rate': 0.07015306122448979, 'fn_rate': 0.0195578231292517, 'accuracy': 0.9102891156462585, 'precision': 0.9233983286908078, 'recall': 0.9773955773955774, 'f1': 0.9496299832895679, 'results': ['POS', 'POS', 'NEG', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'NEG', 'POS', 'POS', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'POS', 'NEG', 'POS', 'POS', 'POS', 'POS', 'PO

In [93]:
df_results = pd.DataFrame(model_results)
df_results

Unnamed: 0,steps,ngrams,min_count,threshold,nlp_model,allowed_postags,model,evaluated,tp_rate,tn_rate,fp_rate,fn_rate,accuracy,precision,recall,f1,results
0,"[remove_stopwords, ngram]",2,1,5,,,<__main__.Model object at 0x7f3c6eaf8978>,2352,0.855442,0.044643,0.090136,0.009779,0.900085,0.904676,0.988698,0.944823,"[POS, NEG, NEG, POS, NEG, POS, POS, POS, POS, ..."
1,"[remove_stopwords, ngram]",2,3,5,,,<__main__.Model object at 0x7f3c77f774e0>,2352,0.852041,0.054847,0.079932,0.01318,0.906888,0.914234,0.984767,0.94819,"[POS, POS, NEG, POS, NEG, POS, POS, POS, POS, ..."
2,"[remove_stopwords, ngram]",2,5,10,,,<__main__.Model object at 0x7f3c6faf7208>,2352,0.847364,0.059524,0.075255,0.017857,0.906888,0.918433,0.979361,0.947919,"[POS, POS, NEG, POS, NEG, POS, POS, POS, POS, ..."
3,"[remove_stopwords, ngram]",2,5,20,,,<__main__.Model object at 0x7f3c6fa5d048>,2352,0.845663,0.064626,0.070153,0.019558,0.910289,0.923398,0.977396,0.94963,"[POS, POS, NEG, POS, NEG, POS, POS, POS, POS, ..."


In [77]:
{**a, **b}

{'a': 1, 'b': 2}

In [18]:
train_model.data_classes # ['POS']['ngrams']

{'POS': {'percentage': 0.8784731711760955,
  'ngram_model': [<gensim.models.phrases.FrozenPhrases at 0x7f3ca137b3c8>],
  'total_length': 352979,
  'word_probs': defaultdict(<function __main__.Model.fit.<locals>.<lambda>()>,
              {'aa': -12.081016663500897,
               'aaa': -12.081016663500897,
               'aat': -12.081016663500897,
               'abandon': -11.387869482940951,
               'abbey': -12.081016663500897,
               'abhor': -12.081016663500897,
               'abide': -12.081016663500897,
               'ability': -10.135106514445583,
               'abit': -12.081016663500897,
               'ablaze': -12.081016663500897,
               'able': -7.19443401807462,
               'ably': -11.675551555392733,
               'aborigine': -12.081016663500897,
               'abound': -10.471578751066797,
               'abreast': -12.081016663500897,
               'abroad': -10.376268571262472,
               'absence': -11.675551555392733,
        

In [12]:
[token.lemma_ for token in NLP_MODELS['en_core_web_lg']('hello world')  if token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV'] ]

['world']

In [5]:
simple_preprocess("hello world", deacc=True)

['hello', 'world']

In [57]:
model_metrics = get_metrics(model=train_model, df=validation)

print('tp_rate: '+'%.4f'%model_metrics['tp_rate'])
print('tn_rate: '+'%.4f'%model_metrics['tn_rate'])
print('fp_rate: '+'%.4f'%model_metrics['fp_rate'])
print('fn_rate: '+'%.4f'%model_metrics['fn_rate'])
print('accuracy: '+'%.4f'%model_metrics['accuracy'])
print('precision: '+'%.4f'%model_metrics['precision'])
print('recall: '+'%.4f'%model_metrics['recall'])
print('f1: '+'%.4f'%model_metrics['f1'])

tp_rate: 0.8435
tn_rate: 0.0685
fp_rate: 0.0663
fn_rate: 0.0217
accuracy: 0.9120
precision: 0.9271
recall: 0.9749
f1: 0.9504


## Postags disponibles en spacy
Se agregó el listado de postags como hiperparámetro

POS|DESCRIPTION|EXAMPLES
---|---|---
ADJ|adjective|*big, old, green, incomprehensible, first*
ADP|adposition|*in, to, during*
ADV|adverb|*very, tomorrow, down, where, there*
AUX|auxiliary|*is, has (done), will (do), should (do)*
CONJ|conjunction|*and, or, but*
CCONJ|coordinating conjunction|*and, or, but*
DET|determiner|*a, an, the*
INTJ|interjection|*psst, ouch, bravo, hello*
NOUN|noun|*girl, cat, tree, air, beauty*
NUM|numeral|*1, 2017, one, seventy-seven, IV, MMXIV*
PART|particle|*’s, not,*
PRON|pronoun|*I, you, he, she, myself, themselves, somebody*
PROPN|proper noun|*Mary, John, London, NATO, HBO*
PUNCT|punctuation|*., (, ), ?*
SCONJ|subordinating conjunction|*if, while, that*
SYM|symbol|*$, %, §, ©, +, −, ×, ÷, =, :), *
VERB|verb|*run, runs, running, eat, ate, eating*
X|other|*sfpksdpsxmsa*
SPACE|space