# Class to predict sentiment

The file to be used in the class must have the following format:

![info_csv](foto_info_csv.PNG)

- It must be a .csv file
- It must contain a single column (the text to classify)

In [1]:
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras_preprocessing.text import tokenizer_from_json
import json
import re
import numpy as np
import pickle

class model_sentiment_test():
    """Predict a binary sentiment label for every text in a .csv file.

    The first column of the file is loaded on construction. Calling
    ``evaluation()`` cleans the texts, converts them to padded integer
    sequences with the saved tokenizer, runs the saved Keras model and
    stores the outcome on the instance.

    args:
    ----------
    file_text (str): path to the .csv text file; only its first column is read

    attributes:
    ----------
    predictions: list of 0/1 labels, one per row kept after cleaning
    df: raw first column of the file (pandas Series), None if nothing was loaded
    df_clean: cleaned texts (pandas Series) indexed like the rows of ``df`` they come from
    df_result: data frame with the original text and the column 'sentimiento_real_pred'
    """

    # Artefacts needed at prediction time, resolved relative to the working
    # directory. Kept as class attributes so a subclass or an instance can
    # point to other files without touching the code.
    STOPWORDS_PATH = "stopwords.pickle"
    TOKENIZER_PATH = "tokenizer.json"
    MODEL_PATH = "best_model.hdf5"
    # Length every token sequence is padded/truncated to before prediction.
    MAX_SEQUENCE_LENGTH = 40

    # Compiled once for the whole class instead of on every row.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols and pictograms
        "\U0001F680-\U0001F6FF"  # transport and map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # box drawing, geometric shapes, dingbats, arrows
        "]+",
        re.UNICODE,
    )

    def __init__(self, file_text: str) -> None:
        self.file_text = file_text
        self.predictions = None        # model predictions
        self.df = None                 # raw data (first column of the file)
        self.df_clean = None           # processed texts
        self.df_result = None          # dataframe with predictions

        # Best-effort check: the file is only loaded when its name contains
        # ".csv"; otherwise the user is told and ``df`` stays None.
        if ".csv" in file_text:
            self.__read_pandas()
        else:
            print("The file does not have a .csv extension")

    def __read_pandas(self) -> None:
        """Load the first column of the .csv file into ``self.df``."""
        self.df = pd.read_csv(self.file_text).iloc[:, 0]

    @staticmethod
    def __remove_emojis(data):
        """Return ``data`` with emojis and pictographic symbols removed."""
        return model_sentiment_test._EMOJI_PATTERN.sub('', data)

    @staticmethod
    def __tokenizar_stopwords(text, stop_words):
        """Return ``text`` without the whitespace-separated words found in ``stop_words``.

        ``stop_words`` is any container supporting ``in``; a set/frozenset
        keeps each lookup O(1).
        """
        return ' '.join(word for word in text.split() if word not in stop_words)

    def __treatment(self) -> None:
        """Clean the raw texts and store the result in ``self.df_clean``.

        Rows that are missing, duplicated after cleaning, or empty after
        cleaning are dropped; the surviving rows keep the index labels of
        ``self.df`` so predictions can be matched back to the original text.
        """
        # Missing cells cannot go through the regex helpers (``re.sub`` raises
        # TypeError on NaN) and would be discarded at the end anyway.
        df = self.df.dropna()

        #Remove accents
        a, b = 'áéíóúüÁÉÍÓÚÜ', 'aeiouuAEIOUU'
        trans = str.maketrans(a, b)
        df = df.str.translate(trans)
        print("Processing ... Step [1/13]", end = "\r")

        #Remove line breaks and tabs
        # NOTE(review): the comma inside the class is a literal, so commas are
        # replaced by spaces too. Left untouched because the saved model was
        # presumably trained on text cleaned this way - confirm before changing.
        df = df.str.replace("[\n,\t]", ' ', regex=True)
        print("Processing ... Step [2/13]", end = "\r")

        #Remove tags, links and numbers
        # Raw string: same pattern as before, without invalid-escape warnings.
        df = df.str.replace(
            r"(@[A-Za-z0-9\_\-\.]+)|(\w+:\/\/\S+)|(\d+[\w+\-\/]*)", "", regex=True
        )
        print("Processing ... Step [3/13]", end = "\r")

        #Remove special characters
        # NOTE(review): inside the class ",-," is parsed as a range from ','
        # to ',', so hyphens are NOT removed. Kept as-is for consistency with
        # the cleaning the model presumably saw at training time - confirm.
        df = df.str.replace("[#,&,$,!,',),(,-,*,;,:,|,\",.,?,¿,¡]", '', regex=True)
        print("Processing ... Paso [4/13]", end = "\r")

        #Remove emoji
        df = df.apply(self.__remove_emojis)
        print("Processing ... Paso [5/13]", end = "\r")

        #Remove spaces at the beginning and end
        df = df.str.strip()
        print("Processing ... Paso [6/13]", end = "\r")

        #Transform to lowercase
        df = df.str.lower()
        print("Processing ... Paso [7/13]", end = "\r")

        #Remove duplicated texts (the first occurrence is kept)
        df = df[~df.duplicated()]
        print("Processing ... Paso [8/13]", end = "\r")

        #Remove stopwords
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # use a stopwords file that comes from a trusted source.
        with open(self.STOPWORDS_PATH, "rb") as f:
            # assumes the pickle holds an iterable of strings - TODO confirm
            stop_words = frozenset(pickle.load(f))
        # ``args`` must be a tuple; the set is passed as one extra argument.
        df = df.apply(self.__tokenizar_stopwords, args=(stop_words,))
        print("Processing ... Paso [9/13]", end = "\r")

        #Remove empty cells
        df = df[df != ''].dropna()
        print("Processing ... Paso [10/13]", end = "\r")

        self.df_clean = df

    def evaluation(self) -> None:
        """Predict the sentiment of every text and store the results.

        Fills ``self.df_clean``, ``self.predictions`` (list of 0/1, where a
        model output below 0.5 maps to 0) and ``self.df_result`` (original
        text of the rows kept after cleaning plus 'sentimiento_real_pred').

        raises:
        ----------
        ValueError: if no data was loaded because the file was rejected
        """
        if self.df is None:
            raise ValueError(
                "No data loaded from {!r}: the file does not have a .csv "
                "extension".format(self.file_text)
            )

        self.__treatment()

        X = self.df_clean.values

        #Change text according to the sequence of values of the dictionary
        # The file is expected to hold the tokenizer's JSON string wrapped in
        # JSON once more, hence json.load before tokenizer_from_json.
        with open(self.TOKENIZER_PATH) as f:
            data = json.load(f)
        tokenizer = tokenizer_from_json(data)
        X = tokenizer.texts_to_sequences(X)
        print("Processing ... Paso [11/13]", end = "\r")

        #Set sequence input length
        X = pad_sequences(X, maxlen=self.MAX_SEQUENCE_LENGTH)
        print("Processing ... Paso [12/13]", end = "\r")

        #Change input form: add a trailing axis of size 1
        X_ = X.reshape(X.shape[0], X.shape[1], 1)
        print("Processing ... Paso [13/13]", end = "\r")

        #Model loaded
        model = keras.models.load_model(self.MODEL_PATH)
        print("Predicting ... ... ... ... ...", end = "\r")

        #Calculation of predictions
        scores = model.predict(X_).reshape(-1)
        self.predictions = np.where(scores < 0.5, 0, 1).tolist()

        # ``df_clean.index`` holds labels of ``df``, so select by label.
        result = self.df.loc[self.df_clean.index].to_frame()
        result['sentimiento_real_pred'] = self.predictions
        self.df_result = result
        print("Ended process ... ... ... ...", end = "\r")

## Example

In [2]:
# Create the predictor; because the file name contains ".csv", the constructor
# loads the first column of the file into `sentiment.df`
sentiment = model_sentiment_test('data_without_target.csv')

In [3]:
# Clean the texts, run the saved model and store the outcome in
# `sentiment.predictions` and `sentiment.df_result`
sentiment.evaluation()

Ended process ... ... ... ....

In [4]:
# Model output: original text of every row kept after cleaning, plus its
# predicted label in the column 'sentimiento_real_pred'
sentiment.df_result

Unnamed: 0,text,sentimiento_real_pred
0,@Vladi_VillegasP @YouTube @laidygomezf una co...,0
1,@Vladialacarta @laidygomezf yo estoy Orgullos...,1
2,"#SoyComunicadorClap \n#Táchira: \n""Hasta el ir...",0
3,"@VTVcanal8 La ""Reina del Tachira"" Protectora y...",0
4,Un buen espaldarazo a la institucionalidad ser...,0
...,...,...
114259,También fui afectada con el ROBO que nos hizo ...,1
114260,Estoy intentando hacer un reclamo desde el.dia...,1
114261,Ajá y el día de los muertos lo agarraran de pu...,1
114262,tengo problemas con la lectura de huella deseo...,1


In [5]:
# Predicted labels as a plain list of 0/1 values, in the same row order as
# `sentiment.df_result`; show the first five
preds = sentiment.predictions
print("Primeras 5 predicciones {}".format(preds[:5]))

Primeras 5 predicciones [0, 1, 0, 0, 0]
