# Data preparation

In [1]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from collections import Counter
import itertools
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
# Ler os arquivos CSV
df_train = pd.read_csv("imdb_reviews_train.csv", quoting=0, on_bad_lines='skip')
df_test = pd.read_csv("imdb_reviews_test.csv", quoting=0, on_bad_lines='skip')
df_train.count()
df_test.count()

print("Número de linhas no conjunto de treinamento:", df_train.shape[0])
print("Número de linhas no conjunto de teste:", df_test.shape[0])

print(f" train set tem {sum(df_train['label'] == 'pos')} reviews positivas")
print(f" train set tem {sum(df_train['label'] == 'neg')} reviews negativas")

print(f" test set tem {sum(df_test['label'] == 'pos')} reviews positivas")
print(f" test set tem {sum(df_test['label'] == 'neg')} reviews negativas")

Número de linhas no conjunto de treinamento: 21754
Número de linhas no conjunto de teste: 21996
 train set tem 10776 reviews positivas
 train set tem 10978 reviews negativas
 test set tem 10946 reviews positivas
 test set tem 11050 reviews negativas


In [5]:
# Remover duplicatas com base na coluna 'review' e mantendo a primeira ocorrência
df_train = df_train.drop_duplicates(subset=['text'], keep='first')
df_test = df_test.drop_duplicates(subset=['text'], keep='first')
x = df_train.count()
y = df_test.count()

print("Número de linhas no conjunto de treinamento após remoção de duplicatas:", x)
print("Número de linhas no conjunto de teste após remoção de duplicatas:", y)

print(f" train set tem {sum(df_train['label'] == 'pos')} reviews positivas")
print(f" train set tem {sum(df_train['label'] == 'neg')} reviews negativas")

print(f" test set tem {sum(df_test['label'] == 'pos')} reviews positivas")
print(f" test set tem {sum(df_test['label'] == 'neg')} reviews negativas")


Número de linhas no conjunto de treinamento após remoção de duplicatas: text     21662
label    21662
dtype: int64
Número de linhas no conjunto de teste após remoção de duplicatas: text     21814
label    21814
dtype: int64
 train set tem 10748 reviews positivas
 train set tem 10914 reviews negativas
 test set tem 10888 reviews positivas
 test set tem 10926 reviews negativas


In [6]:
# Exemplo
df_train['text'] = df_train['text'].apply(lambda x: x.lower())
df_train['text'][:10]

df_test['text'] = df_test['text'].apply(lambda x: x.lower())
df_test['text'][:10]

Unnamed: 0,text
0,logan lerman & dean collins iii of jack & bobb...
1,i have seen this film on a sunday evening and ...
2,two great stars and a legendary director creat...
3,i'm originally from brazil... the sad thing ab...
4,"""witchery"" is a decent little euro trash horro..."
5,the best so bad it's good movie ever made. rud...
6,"okay, i'll say it. this movie made me laugh so..."
7,this movie almost has everything. the action i...
8,this movie is ageless and would probably appea...
9,"great book, great movie, great soundtrack. fra..."


Lower Case

In [7]:
# Converte todas as letras para minúsculas
train_lower = df_train['text'].apply(lambda x: x.lower())
# mesmo para o conjunto de teste
test_lower = df_test['text'].apply(lambda x: x.lower())
train_lower

Unnamed: 0,text
0,this is your typical cheerful and colorful mgm...
1,as a another reviewer states hanna's war is an...
2,"one of the best ""amitabh comeback"" movies i li..."
3,peter sollett has created an endearing portrai...
4,the film is not visually stunning in the conve...
...,...
21749,"in the third entry of the phantasm series, mik..."
21750,this movie still chills me to the bone thinkin...
21751,is this film a joke? is it a comedy? surely it...
21752,all of david prior's movies are terrible on al...


Tokenization

In [12]:
# Tokeniza com word_tokenize
train_tokens = train_lower.apply(nltk.word_tokenize)
# mesmo para o conjunto de teste
test_tokens = test_lower.apply(nltk.word_tokenize)
train_tokens

Unnamed: 0,text
0,"[this, is, your, typical, cheerful, and, color..."
1,"[as, a, another, reviewer, states, hanna, 's, ..."
2,"[one, of, the, best, ``, amitabh, comeback, ''..."
3,"[peter, sollett, has, created, an, endearing, ..."
4,"[the, film, is, not, visually, stunning, in, t..."
...,...
21749,"[in, the, third, entry, of, the, phantasm, ser..."
21750,"[this, movie, still, chills, me, to, the, bone..."
21751,"[is, this, film, a, joke, ?, is, it, a, comedy..."
21752,"[all, of, david, prior, 's, movies, are, terri..."


In [13]:
# Função para calcular a frequência de tokens
def freq(token_lists):
    all_tokens = list(itertools.chain(*token_lists))  # Unifica todas as listas em uma só (frases em uma lista)
    freq = Counter(all_tokens)  # Conta a frequência de cada token
    freq_df = pd.DataFrame(freq.items(), columns=['Token', 'Frequência']).sort_values(by="Frequência", ascending=False)
    return freq_df

In [14]:
# Ainda existem muitas stopwords e pontuações
freq(train_tokens).head(20)

Unnamed: 0,Token,Frequência
10,the,220456
24,.,187832
49,",",175789
5,and,109621
42,a,109178
17,of,94709
20,to,88011
1,is,74935
14,it,68911
140,i,66993


Lematize

In [19]:
# Lematize
lemmatizer = WordNetLemmatizer()

train_lem = train_tokens.apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])
# mesmo para o conjunto de teste
test_lem = test_tokens.apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])
train_lem


Unnamed: 0,text
0,"[this, is, your, typical, cheerful, and, color..."
1,"[a, a, another, reviewer, state, hanna, 's, wa..."
2,"[one, of, the, best, ``, amitabh, comeback, ''..."
3,"[peter, sollett, ha, created, an, endearing, p..."
4,"[the, film, is, not, visually, stunning, in, t..."
...,...
21749,"[in, the, third, entry, of, the, phantasm, ser..."
21750,"[this, movie, still, chill, me, to, the, bone,..."
21751,"[is, this, film, a, joke, ?, is, it, a, comedy..."
21752,"[all, of, david, prior, 's, movie, are, terrib..."


Negacao

In [20]:
negation_words = ["not", "no", "never", "none", "nor", "without", "n't"]
sentence_endings = {".", "!", "?"}

# Função para o tratamento da negação
def negation(tokens):
    negated = False
    result = []
    # A função começa por assumir que não há negação (negated = False)
    # Percorre cada token até encontrar uma negation_words
    # Se encontrar, ativa a negação até encontrar um sentence_endings
    for token in tokens:
        if token in negation_words:
            negated = True
            result.append(token)
        elif token in sentence_endings:
            negated = False
            result.append(token)
        elif negated:
            result.append(f"NOT_{token}")
        else:
            result.append(token)

    return result


train_tokens_neg = train_lem.apply(negation)
# mesmo para o conjunto de teste
test_tokens_neg = test_lem.apply(negation)

In [21]:
# Exemplo de tokens com negação
train_tokens_neg[0][200:225]

['done',
 ',',
 'which',
 'is',
 'no',
 'NOT_big',
 'NOT_surprise',
 'NOT_when',
 'NOT_you',
 'NOT_have',
 'NOT_people',
 'NOT_such',
 'NOT_a',
 'NOT_vincente',
 'NOT_minnelli',
 'NOT_and',
 'NOT_gene',
 'NOT_kelly',
 'NOT_at',
 'NOT_work',
 '.',
 'but',
 'really',
 ',',
 'could']

Stop words

In [22]:
# Lista de stopwords em inglês
stopwords = nltk.corpus.stopwords.words('english')
# Remove as stopwords e também as stopwords que foram negadas (NOT_)
train_stopw = train_tokens_neg.apply(lambda x: [
    word for word in x if word not in stopwords and not (word.startswith("NOT_") and word[4:] in stopwords)
])
# mesmo para o conjunto de teste
test_stopw = test_tokens_neg.apply(lambda x: [
    word for word in x if word not in stopwords and not (word.startswith("NOT_") and word[4:] in stopwords)
])

train_stopw

Unnamed: 0,text
0,"[typical, cheerful, colorful, mgm, musical, ea..."
1,"[another, reviewer, state, hanna, 's, war, out..."
2,"[one, best, ``, amitabh, comeback, '', movie, ..."
3,"[peter, sollett, ha, created, endearing, portr..."
4,"[film, NOT_visually, NOT_stunning, NOT_convent..."
...,...
21749,"[third, entry, phantasm, series, ,, mike, regg..."
21750,"[movie, still, chill, bone, thinking, ., movie..."
21751,"[film, joke, ?, comedy, ?, surely, n't, NOT_se..."
21752,"[david, prior, 's, movie, terrible, count, :, ..."


In [23]:
# Mesmo excerto anterior, mas sem stopwords
train_stopw[0][115:130]

['number',
 'also',
 'nicely',
 'done',
 ',',
 'NOT_big',
 'NOT_surprise',
 'NOT_people',
 'NOT_vincente',
 'NOT_minnelli',
 'NOT_gene',
 'NOT_kelly',
 'NOT_work',
 '.',
 'really']

Tokens + frequentes

In [24]:
# Ainda temos pontuação (e pontuação negada) e pedaços de palavras soltas
freq(train_stopw).head(20)

Unnamed: 0,Token,Frequência
12,.,187832
29,",",147306
13,movie,33917
7,'s,33042
199,wa,32043
112,"NOT_,",28483
127,film,26989
90,n't,23358
485,(,19069
664,!,18840


Eleminar pontuacao

In [25]:
punctuation = '''!()-[]{};´´``:''"\,<>/?@#$%^&...*_~'''
# Remove a pontuação e também as pontuações que foram negadas (NOT_)
train_no_punct = train_stopw.apply(lambda x: [
    word for word in x if word not in punctuation and not (word.startswith("NOT_") and word[4:] in punctuation)
])
# mesmo para o conjunto de teste
test_no_punct = test_stopw.apply(lambda x: [
    word for word in x if word not in punctuation and not (word.startswith("NOT_") and word[4:] in punctuation)
])
train_no_punct

Unnamed: 0,text
0,"[typical, cheerful, colorful, mgm, musical, ea..."
1,"[another, reviewer, state, hanna, 's, war, out..."
2,"[one, best, amitabh, comeback, movie, liked, w..."
3,"[peter, sollett, ha, created, endearing, portr..."
4,"[film, NOT_visually, NOT_stunning, NOT_convent..."
...,...
21749,"[third, entry, phantasm, series, mike, reggie,..."
21750,"[movie, still, chill, bone, thinking, movie, w..."
21751,"[film, joke, comedy, surely, n't, NOT_serious,..."
21752,"[david, prior, 's, movie, terrible, count, bad..."


Tokens + fequentes

In [26]:
# ainda temos pedaços de palavras soltas (ex.'s e n't, e as mesmas negadas)
freq(train_no_punct).head(20)

Unnamed: 0,Token,Frequência
12,movie,33917
7,'s,33042
192,wa,32043
121,film,26989
86,n't,23358
10,one,15223
164,like,11385
211,ha,9808
55,time,9082
271,good,8918


In [27]:
loose_affixes = ["n't", "'s","'ve", "'re", "'ll", "'d", "'m", "'t"]
# Remove os afixos soltos e também os que foram negados (NOT_)
train_clean = train_no_punct.apply(lambda x: [
    word for word in x if word not in loose_affixes and not (word.startswith("NOT_") and word[4:] in loose_affixes)
])
# mesmo para o conjunto de teste
test_clean = test_no_punct.apply(lambda x: [
    word for word in x if word not in loose_affixes and not (word.startswith("NOT_") and word[4:] in loose_affixes)
])

train_clean

Unnamed: 0,text
0,"[typical, cheerful, colorful, mgm, musical, ea..."
1,"[another, reviewer, state, hanna, war, outstan..."
2,"[one, best, amitabh, comeback, movie, liked, w..."
3,"[peter, sollett, ha, created, endearing, portr..."
4,"[film, NOT_visually, NOT_stunning, NOT_convent..."
...,...
21749,"[third, entry, phantasm, series, mike, reggie,..."
21750,"[movie, still, chill, bone, thinking, movie, w..."
21751,"[film, joke, comedy, surely, NOT_serious, NOT_..."
21752,"[david, prior, movie, terrible, count, bad, wr..."


In [28]:
freq(train_clean).head(20)

Unnamed: 0,Token,Frequência
11,movie,33917
190,wa,32043
119,film,26989
9,one,15223
162,like,11385
209,ha,9808
54,time,9082
269,good,8918
207,story,7836
70,character,7663


In [29]:
freq(test_clean).head(20)

Unnamed: 0,Token,Frequência
66,movie,33899
68,wa,31964
93,film,26926
154,one,15521
323,like,11374
139,ha,9845
340,good,8910
64,time,8820
125,character,7947
267,would,7568
