# Data Loading and Preparation

## Loading e Análise Exploratória

In [11]:
import nltk
import csv
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
from collections import Counter
import itertools
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Carregamento dos datasets de treino e de teste como DataFrames

In [2]:
# Read the train and test review splits from their CSV files (UTF-8).
train_df, test_df = (
    pd.read_csv(csv_name, encoding="utf-8")
    for csv_name in ("imdb_reviews_train.csv", "imdb_reviews_test.csv")
)
# Plain-text preview of the first 10 rows of each split, train first.
print(train_df.head(10), test_df.head(10), sep="\n")

                                                text label
0  This is your typical cheerful and colorful MGM...   pos
1  As a another reviewer states Hanna's War is an...   pos
2  One of the best "Amitabh comeback" movies I li...   pos
3  Peter Sollett has created an endearing portrai...   pos
4  The film is not visually stunning in the conve...   pos
5  This is not Bela Lagosi's best movie, but it's...   pos
6  I happened to watch this movie by chance some ...   pos
7  So many consider The Black Cat as the best Kar...   pos
8  I saw this at a screening last night too. I wa...   pos
9  One of the best true crime movies ever made an...   pos
                                                text label
0  Logan Lerman & Dean Collins III of Jack & Bobb...   pos
1  I have seen this film on a Sunday evening and ...   pos
2  Two great stars and a legendary Director creat...   pos
3  I'm originally from Brazil... the sad thing ab...   pos
4  "Witchery" is a decent little Euro Trash horro...   p

In [3]:
# Number of rows (reviews) in each split.
print(
    f"TRAIN df => {train_df.shape[0]} reviews",
    f"TEST df =>  {test_df.shape[0]} reviews",
    sep="\n",
)

TRAIN df => 21754 reviews
TEST df =>  21996 reviews


In [4]:
# Number of positive reviews in the training split.
# Series.sum() counts the True values in one vectorised pass instead of
# iterating the Series element by element with the builtin sum();
# int() keeps the displayed result a plain Python integer.
int((train_df['label'] == "pos").sum())

10776

In [7]:
# Class balance of each split. The boolean masks are summed with the
# vectorised Series.sum() rather than the builtin sum(), which would walk
# every row in Python.
print(f" TRAIN set tem {(train_df['label'] == 'pos').sum()} reviews positivas")
print(f" TRAIN set tem {(train_df['label'] == 'neg').sum()} reviews negativas")
print(f" TEST set tem {(test_df['label'] == 'pos').sum()} reviews positivas")
print(f" TEST set tem {(test_df['label'] == 'neg').sum()} reviews negativas")

 TRAIN set tem 10776 reviews positivas
 TRAIN set tem 10978 reviews negativas
 TEST set tem 10946 reviews positivas
 TEST set tem 11050 reviews negativas


In [8]:
# Example review (a positive one, in this case): text of the row labelled 0
train_df.loc[0, 'text']

'This is your typical cheerful and colorful MGM musical from the early \'50\'s and it\'s definitely on of the better ones to watch out there. The movie got directed by the genre expert Vincente Minnelli and stars Gene Kelly in the main lead. Both did quite a few movies together back in those days, of which this one is probably their best known one.  The movie itself actually managed to win the best picture Oscar over the year, which meant it beat out movies such as "A Place in the Sun", "A Streetcar Named Desire", "The African Queen", "Quo Vadis", "The Blue Veil", "Death of a Salesman" that year. A real accomplishment of course but at the same time also a bit too much credit for this delightful, bright and entertaining movie. When you watch this movie you surely will be entertained by it all, which is also thanks to the movie its beautiful color look and the many nice characters within this movie. The musical numbers are also all nicely done, which is no big surprise when you have peop

## Pré-processamento

Lower Case

In [9]:
# Convert every review to lower case with the vectorised string accessor.
# NOTE: unlike calling x.lower() through apply, .str.lower() passes missing
# values through instead of raising.
train_lower = train_df['text'].str.lower()
# Same for the test split
test_lower = test_df['text'].str.lower()
train_lower

Unnamed: 0,text
0,this is your typical cheerful and colorful mgm...
1,as a another reviewer states hanna's war is an...
2,"one of the best ""amitabh comeback"" movies i li..."
3,peter sollett has created an endearing portrai...
4,the film is not visually stunning in the conve...
...,...
21749,"in the third entry of the phantasm series, mik..."
21750,this movie still chills me to the bone thinkin...
21751,is this film a joke? is it a comedy? surely it...
21752,all of david prior's movies are terrible on al...


Tokenization

In [12]:
# Tokenise each lower-cased review with NLTK's word_tokenize
# (train split first, then the test split).
train_tokens, test_tokens = (
    lowered.apply(word_tokenize) for lowered in (train_lower, test_lower)
)
train_tokens

Unnamed: 0,text
0,"[this, is, your, typical, cheerful, and, color..."
1,"[as, a, another, reviewer, states, hanna, 's, ..."
2,"[one, of, the, best, ``, amitabh, comeback, ''..."
3,"[peter, sollett, has, created, an, endearing, ..."
4,"[the, film, is, not, visually, stunning, in, t..."
...,...
21749,"[in, the, third, entry, of, the, phantasm, ser..."
21750,"[this, movie, still, chills, me, to, the, bone..."
21751,"[is, this, film, a, joke, ?, is, it, a, comedy..."
21752,"[all, of, david, prior, 's, movies, are, terri..."


In [13]:
def freq(token_lists):
    """Count how often each token occurs across a collection of token lists.

    Parameters
    ----------
    token_lists : iterable of iterables
        One inner iterable of (hashable) tokens per document, e.g. a pandas
        Series whose values are lists of strings.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token with columns 'Token' and 'Frequência',
        sorted by 'Frequência' in descending order. The index keeps the
        position at which each token was first seen.
    """
    # chain.from_iterable streams the tokens lazily; the previous
    # chain(*token_lists) unpacked every document into call arguments and
    # then built one list holding all tokens before counting.
    counts = Counter(itertools.chain.from_iterable(token_lists))
    freq_df = pd.DataFrame(counts.items(), columns=['Token', 'Frequência'])
    return freq_df.sort_values(by="Frequência", ascending=False)

Tokens mais frequentes

In [14]:
# There are still many stopwords and punctuation marks among the tokens
freq(train_tokens).head(20)

Unnamed: 0,Token,Frequência
10,the,221278
24,.,188548
49,",",176455
5,and,109996
42,a,109552
17,of,95082
20,to,88314
1,is,75184
14,it,69164
140,i,67269


Tratamento da negação

In [15]:
# Tokens that open a negation scope, and tokens that close it.
negation_words = ["not", "no", "never", "none", "nor", "without", "n't"]
sentence_endings = {".", "!", "?"}


def negation(tokens):
    """Prefix every token inside a negation scope with "NOT_".

    A scope opens at any token found in `negation_words` and closes at the
    next token found in `sentence_endings`. The opening negation word and the
    closing sentence ending are emitted unchanged; every other token seen
    while the scope is open (punctuation included) becomes "NOT_<token>".

    Returns a new list with exactly one entry per input token.
    """
    tagged = []
    in_scope = False  # no negation is assumed at the start of the review
    for tok in tokens:
        if tok in negation_words:
            in_scope = True
        elif tok in sentence_endings:
            in_scope = False
        elif in_scope:
            tok = f"NOT_{tok}"
        tagged.append(tok)
    return tagged


# Apply negation tagging to both splits (train first, then test).
train_tokens_neg, test_tokens_neg = (
    split_tokens.apply(negation) for split_tokens in (train_tokens, test_tokens)
)

In [16]:
# Example of tokens after negation tagging (slice 200:225 of the first training review)
train_tokens_neg[0][200:225]

['done',
 ',',
 'which',
 'is',
 'no',
 'NOT_big',
 'NOT_surprise',
 'NOT_when',
 'NOT_you',
 'NOT_have',
 'NOT_people',
 'NOT_such',
 'NOT_as',
 'NOT_vincente',
 'NOT_minnelli',
 'NOT_and',
 'NOT_gene',
 'NOT_kelly',
 'NOT_at',
 'NOT_work',
 '.',
 'but',
 'really',
 ',',
 'could']

Eliminar stopwords

In [17]:
# English stopword list from NLTK, kept as a list under its original name.
# NOTE: this rebinds the name `stopwords` imported from nltk.corpus at the top
# of the notebook; the corpus reader stays reachable as nltk.corpus.stopwords.
stopwords = nltk.corpus.stopwords.words('english')
# Membership is tested for every token of both splits, so look words up in a
# set instead of scanning the list each time.
_stopword_set = frozenset(stopwords)


def _drop_stopwords(tokens):
    """Return `tokens` without stopwords and without negated stopwords.

    A token is dropped when it is a stopword itself, or when it carries the
    "NOT_" negation prefix and the remainder is a stopword.
    """
    return [
        word for word in tokens
        if word not in _stopword_set
        and not (word.startswith("NOT_") and word[4:] in _stopword_set)
    ]


# Pipeline WITH negation tagging
train_stopw = train_tokens_neg.apply(_drop_stopwords)
# Same for the test split
test_stopw = test_tokens_neg.apply(_drop_stopwords)

# Same filtering for the pipeline WITHOUT negation tagging
train_stopw_sn = train_tokens.apply(_drop_stopwords)
test_stopw_sn = test_tokens.apply(_drop_stopwords)

train_stopw

Unnamed: 0,text
0,"[typical, cheerful, colorful, mgm, musical, ea..."
1,"[another, reviewer, states, hanna, 's, war, ou..."
2,"[one, best, ``, amitabh, comeback, '', movies,..."
3,"[peter, sollett, created, endearing, portrait,..."
4,"[film, NOT_visually, NOT_stunning, NOT_convent..."
...,...
21749,"[third, entry, phantasm, series, ,, mike, regg..."
21750,"[movie, still, chills, bone, thinking, ., movi..."
21751,"[film, joke, ?, comedy, ?, surely, n't, NOT_se..."
21752,"[david, prior, 's, movies, terrible, counts, :..."


In [18]:
# Same excerpt as before, now without stopwords
train_stopw[0][115:130]

['numbers',
 'also',
 'nicely',
 'done',
 ',',
 'NOT_big',
 'NOT_surprise',
 'NOT_people',
 'NOT_vincente',
 'NOT_minnelli',
 'NOT_gene',
 'NOT_kelly',
 'NOT_work',
 '.',
 'really']

Tokens mais frequentes

In [19]:
# Punctuation (including negated punctuation) and loose word fragments are still present
freq(train_stopw).head(20)

Unnamed: 0,Token,Frequência
12,.,188548
30,",",147873
7,'s,33169
13,movie,29100
114,"NOT_,",28582
92,n't,23454
130,film,23216
680,!,19186
494,(,19113
497,),18602


Eliminar pontuação

In [20]:
# Characters treated as punctuation. Written as a raw string because the
# literal contains "\,", which is an invalid escape sequence in a normal
# string; the resulting text is unchanged.
punctuation = r'''!()-[]{};´´``:''"\,<>/?@#$%^&...*_~'''
_punct_chars = frozenset(punctuation)


def _is_punct(token):
    """Return True when every character of `token` is a punctuation character.

    The previous `token in punctuation` check was a substring test against the
    string above, so it only matched tokens that happen to appear verbatim in
    it; repeated marks such as "--", "...." or "!!" slipped through. Testing
    character by character removes those as well.
    """
    return all(ch in _punct_chars for ch in token)


def _drop_punctuation(tokens):
    """Return `tokens` without punctuation and without negated punctuation.

    A token is dropped when it consists solely of punctuation characters, or
    when it carries the "NOT_" negation prefix and the remainder does.
    """
    return [
        word for word in tokens
        if not _is_punct(word)
        and not (word.startswith("NOT_") and _is_punct(word[4:]))
    ]


# Pipeline WITH negation tagging
train_no_punct = train_stopw.apply(_drop_punctuation)
# Same for the test split
test_no_punct = test_stopw.apply(_drop_punctuation)

# Same filtering for the pipeline WITHOUT negation tagging
train_no_punct_sn = train_stopw_sn.apply(_drop_punctuation)
test_no_punct_sn = test_stopw_sn.apply(_drop_punctuation)

train_no_punct

Unnamed: 0,text
0,"[typical, cheerful, colorful, mgm, musical, ea..."
1,"[another, reviewer, states, hanna, 's, war, ou..."
2,"[one, best, amitabh, comeback, movies, liked, ..."
3,"[peter, sollett, created, endearing, portrait,..."
4,"[film, NOT_visually, NOT_stunning, NOT_convent..."
...,...
21749,"[third, entry, phantasm, series, mike, reggie,..."
21750,"[movie, still, chills, bone, thinking, movie, ..."
21751,"[film, joke, comedy, surely, n't, NOT_serious,..."
21752,"[david, prior, 's, movies, terrible, counts, b..."


Tokens mais frequentes

In [21]:
# Loose word fragments are still present (e.g. 's and n't, plus their negated forms)
freq(train_no_punct).head(20)

Unnamed: 0,Token,Frequência
7,'s,33169
12,movie,29100
88,n't,23454
124,film,23216
29,one,14811
168,like,11167
275,good,8919
128,would,7649
213,story,7220
57,time,7104


Eliminar afixos soltos

In [22]:
# Contraction/possessive fragments that show up as standalone tokens (e.g. 's, n't).
loose_affixes = ["n't", "'s","'ve", "'re", "'ll", "'d", "'m", "'t"]


def _is_loose_affix(tok):
    """True for a loose affix, or for a loose affix carrying the "NOT_" prefix."""
    return tok in loose_affixes or (tok.startswith("NOT_") and tok[4:] in loose_affixes)


def _drop_loose_affixes(tokens):
    """Return `tokens` without loose affixes (plain or negated)."""
    return [tok for tok in tokens if not _is_loose_affix(tok)]


# Pipeline WITH negation tagging
train_clean = train_no_punct.apply(_drop_loose_affixes)
# Same for the test split
test_clean = test_no_punct.apply(_drop_loose_affixes)

# Same filtering for the pipeline WITHOUT negation tagging
train_clean_sn = train_no_punct_sn.apply(_drop_loose_affixes)
test_clean_sn = test_no_punct_sn.apply(_drop_loose_affixes)

train_clean

Unnamed: 0,text
0,"[typical, cheerful, colorful, mgm, musical, ea..."
1,"[another, reviewer, states, hanna, war, outsta..."
2,"[one, best, amitabh, comeback, movies, liked, ..."
3,"[peter, sollett, created, endearing, portrait,..."
4,"[film, NOT_visually, NOT_stunning, NOT_convent..."
...,...
21749,"[third, entry, phantasm, series, mike, reggie,..."
21750,"[movie, still, chills, bone, thinking, movie, ..."
21751,"[film, joke, comedy, surely, NOT_serious, NOT_..."
21752,"[david, prior, movies, terrible, counts, bad, ..."


In [23]:
# 20 most frequent tokens in the cleaned training tokens
freq(train_clean).head(20)

Unnamed: 0,Token,Frequência
11,movie,29100
122,film,23216
28,one,14811
166,like,11167
273,good,8919
126,would,7649
211,story,7220
56,time,7104
85,really,6532
229,see,6521


In [24]:
# 20 most frequent tokens in the cleaned test tokens
freq(test_clean).head(20)

Unnamed: 0,Token,Frequência
66,movie,29102
93,film,23292
156,one,15186
331,like,11201
348,good,8939
273,would,7639
124,story,6984
64,time,6971
218,see,6651
71,well,6485


In [None]:
# Optional export of the cleaned token lists to CSV (disabled; uncomment to write the files).
# Pipeline WITH negation tagging:
#train_clean.to_csv("train_clean.csv", index=False, encoding="utf-8")
#test_clean.to_csv("test_clean.csv", index=False, encoding="utf-8")

# Pipeline WITHOUT negation tagging:
#train_clean_sn.to_csv("train_clean_sn.csv", index=False, encoding="utf-8")
#test_clean_sn.to_csv("test_clean_sn.csv", index=False, encoding="utf-8")