# Settings

## Libraries

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
from string import punctuation

import unidecode
import pickle
import nltk

import pandas as pd
import numpy as np

## Databases

In [15]:
df = pd.read_csv('imdb-reviews.csv')
df.head(4)

Unnamed: 0,id,text_en,text_pt,sentiment
0,1,Once again Mr. Costner has dragged out a movie...,"Mais uma vez, o Sr. Costner arrumou um filme p...",neg
1,2,This is an example of why the majority of acti...,Este é um exemplo do motivo pelo qual a maiori...,neg
2,3,"First of all I hate those moronic rappers, who...","Primeiro de tudo eu odeio esses raps imbecis, ...",neg
3,4,Not even the Beatles could write songs everyon...,Nem mesmo os Beatles puderam escrever músicas ...,neg


## Methods

In [16]:
def removing_irrelevant_data(text, stop_words):
    
    text = text.lower()
    phrase = ''
    
    # removing suffixes
    stemmer = nltk.RSLPStemmer()
    
    # spliting text and removing punctuation
    punctuation_remover = nltk.tokenize.WordPunctTokenizer()
    text = punctuation_remover.tokenize(text)
    
    # removing stopwords
    for word in text:
        if word not in stop_words:
            word = stemmer.stem(word)
            phrase += f'{word} '        
            
    return phrase

# Model

In [17]:
np.random.seed(42)

In [18]:
# removing accents
df['text_cleaned'] = df.text_pt.apply(lambda x: unidecode.unidecode(x))

In [19]:
# transform target in numeric values
df.sentiment = df.sentiment.replace({'pos':1, 'neg':0})

In [20]:
# removing stop words and punctuation
list_punctuation = [x for x in punctuation]
stop_words = nltk.corpus.stopwords.words('portuguese')
irrelevant_data = list_punctuation + stop_words

# As the accents were removed from the text, It should be done in the list of irrelevant too
irrelevant_data = [unidecode.unidecode(x) for x in irrelevant_data] 

# removing
df['text_cleaned'] = df['text_cleaned'].apply(removing_irrelevant_data, stop_words=irrelevant_data)

In [21]:
# calculating the relevance of all words between the context
# and applying bigrams, to better understand the importance
# of a pair of words in the following sentiment
tfidf = TfidfVectorizer(max_features=100, ngram_range = (1,2))
tfidf_vector = tfidf.fit_transform(df["text_cleaned"])

In [22]:
x_train, x_test, y_train, y_test = train_test_split(tfidf_vector, df.sentiment, test_size=0.25)

In [23]:
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)

In [24]:
lr_model.score(x_test, y_test)

0.7463000404367165

In [26]:
pickle.dump(lr_model, open('lr_sentiment_classifier.pickle', "wb"))

# Analysis

In [33]:
weights = pd.DataFrame(
    lr_model.coef_[0].T,
    index = tfidf.get_feature_names()
)

weights.nlargest(10,0)




Unnamed: 0,0
aind,1.76226
bem,1.741187
vid,1.654356
muit,1.578292
grand,1.514792
melhor,1.499808
trabalh,1.480207
gost,1.413995
mund,1.393864
emb,1.382884


In [36]:
weights.nsmallest(10, 0)

Unnamed: 0,0
ruim,-6.666034
pi,-6.306489
nad,-3.740104
nenhum,-3.250893
tent,-2.771708
parec,-2.548288
menos,-2.445971
sid,-1.907055
enred,-1.825908
ter,-1.718484
