# Imports

In [None]:
import os
import pickle

import numpy as np
import pandas as pd

import tensorflow_datasets as tfds

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm
2025-03-29 15:47:05.840484: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-29 15:47:05.860482: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pitangura/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Exploring and cleaning the dataset

In [2]:
# Load the 'imdb_reviews' dataset through TensorFlow Datasets. No split is
# requested here; the next cell iterates the result with `.items()` as a
# mapping of split name -> dataset.
data = tfds.load('imdb_reviews')

In [3]:
# Build one pandas DataFrame per dataset split, keyed by split name.
dfs = {}
for split, dataset in data.items():
    data_list = []
    for example in dataset:
        # Each example exposes tensors: the text is decoded from UTF-8 bytes,
        # the label is taken as-is from `.numpy()`.
        review_text = example['text'].numpy().decode('utf-8')
        review_label = example['label'].numpy()
        data_list.append((review_text, review_label))
    dfs[split] = pd.DataFrame(data_list, columns=['review', 'label'])

2025-03-29 15:47:08.400305: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-03-29 15:47:09.649861: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [4]:
dfs

{'train':                                                   review  label
 0      This was an absolutely terrible movie. Don't b...      0
 1      I have been known to fall asleep during films,...      0
 2      Mann photographs the Alberta Rocky Mountains i...      0
 3      This is the kind of film for a snowy Sunday af...      1
 4      As others have mentioned, all the women that g...      1
 ...                                                  ...    ...
 24995  I have a severe problem with this show, severa...      0
 24996  The year is 1964. Ernesto "Che" Guevara, havin...      1
 24997  Okay. So I just got back. Before I start my re...      0
 24998  When I saw this trailer on TV I was surprised....      0
 24999  First of all, Riget is wonderful. Good comedy ...      1
 
 [25000 rows x 2 columns],
 'test':                                                   review  label
 0      There are films that make careers. For George ...      1
 1      A blackly comic tale of a down-trodd

In [5]:
# Pull the two splits used below out of the per-split dictionary.
df_train, df_test = dfs['train'], dfs['test']

In [6]:
df_train

Unnamed: 0,review,label
0,This was an absolutely terrible movie. Don't b...,0
1,"I have been known to fall asleep during films,...",0
2,Mann photographs the Alberta Rocky Mountains i...,0
3,This is the kind of film for a snowy Sunday af...,1
4,"As others have mentioned, all the women that g...",1
...,...,...
24995,"I have a severe problem with this show, severa...",0
24996,"The year is 1964. Ernesto ""Che"" Guevara, havin...",1
24997,Okay. So I just got back. Before I start my re...,0
24998,When I saw this trailer on TV I was surprised....,0


In [7]:
df_test

Unnamed: 0,review,label
0,There are films that make careers. For George ...,1
1,"A blackly comic tale of a down-trodden priest,...",1
2,"Scary Movie 1-4, Epic Movie, Date Movie, Meet ...",0
3,Poor Shirley MacLaine tries hard to lend some ...,0
4,As a former Erasmus student I enjoyed this fil...,1
...,...,...
24995,"Feeling Minnesota is not really a road movie, ...",0
24996,"This is, without doubt, one of my favourite ho...",1
24997,Most predicable movie I've ever seen...extreme...,0
24998,It's exactly what I expected from it. Relaxing...,1


Converting every review to lowercase:

In [8]:
def set_to_lower(_df):
    """Lower-case the ``review`` column of ``_df`` in place.

    Uses the vectorised ``.str`` accessor instead of ``map(str.lower)``:
    ``str.lower`` raises ``TypeError`` on a missing value, and this step runs
    before the NaN-dropping step, so it has to tolerate missing reviews.
    Missing entries are left missing.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with a ``review`` column of strings. Modified in place.

    Returns
    -------
    None
    """
    _df['review'] = _df['review'].str.lower()

In [9]:
# Lower-case the reviews of both splits (train first, then test).
for _frame in (df_train, df_test):
    set_to_lower(_frame)

In [10]:
df_train

Unnamed: 0,review,label
0,this was an absolutely terrible movie. don't b...,0
1,"i have been known to fall asleep during films,...",0
2,mann photographs the alberta rocky mountains i...,0
3,this is the kind of film for a snowy sunday af...,1
4,"as others have mentioned, all the women that g...",1
...,...,...
24995,"i have a severe problem with this show, severa...",0
24996,"the year is 1964. ernesto ""che"" guevara, havin...",1
24997,okay. so i just got back. before i start my re...,0
24998,when i saw this trailer on tv i was surprised....,0


In [11]:
df_test

Unnamed: 0,review,label
0,there are films that make careers. for george ...,1
1,"a blackly comic tale of a down-trodden priest,...",1
2,"scary movie 1-4, epic movie, date movie, meet ...",0
3,poor shirley maclaine tries hard to lend some ...,0
4,as a former erasmus student i enjoyed this fil...,1
...,...,...
24995,"feeling minnesota is not really a road movie, ...",0
24996,"this is, without doubt, one of my favourite ho...",1
24997,most predicable movie i've ever seen...extreme...,0
24998,it's exactly what i expected from it. relaxing...,1


Dropping NaN

In [12]:
def drop_na(_df):
    """Drop every row of ``_df`` that contains a missing value, in place.

    The previous implementation assigned the result of ``dropna`` to the local
    name ``_df``, so the caller's frame was never changed. It also used
    ``axis=1, how='all'``, which targets columns that are entirely NaN rather
    than incomplete rows. Callers ignore the return value, so the frame is
    mutated in place to make the existing calls effective.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame to clean. Modified in place; the surviving rows keep their
        original index labels.

    Returns
    -------
    None
    """
    # inplace=True is required here: rebinding `_df` would only change the
    # local name and leave the caller's DataFrame untouched.
    _df.dropna(axis=0, how='any', inplace=True)

In [13]:
# Apply the NaN-dropping step to both splits (train first, then test).
for _frame in (df_train, df_test):
    drop_na(_frame)

Dropping duplicates

In [14]:
def drop_duplicates(_df):
    """Remove rows whose ``review`` text repeats an earlier row, in place.

    ``DataFrame.drop_duplicates`` returns a new frame by default; the previous
    implementation discarded that return value, so no row was ever removed.
    Callers ignore the return value, so the frame is mutated in place.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with a ``review`` column. Modified in place; the first
        occurrence of each review is kept and surviving rows keep their
        original index labels.

    Returns
    -------
    None
    """
    _df.drop_duplicates(subset='review', keep='first', inplace=True)

In [15]:
# Apply the de-duplication step to both splits (train first, then test).
for _frame in (df_train, df_test):
    drop_duplicates(_frame)

In [16]:
df_train

Unnamed: 0,review,label
0,this was an absolutely terrible movie. don't b...,0
1,"i have been known to fall asleep during films,...",0
2,mann photographs the alberta rocky mountains i...,0
3,this is the kind of film for a snowy sunday af...,1
4,"as others have mentioned, all the women that g...",1
...,...,...
24995,"i have a severe problem with this show, severa...",0
24996,"the year is 1964. ernesto ""che"" guevara, havin...",1
24997,okay. so i just got back. before i start my re...,0
24998,when i saw this trailer on tv i was surprised....,0


In [17]:
df_test

Unnamed: 0,review,label
0,there are films that make careers. for george ...,1
1,"a blackly comic tale of a down-trodden priest,...",1
2,"scary movie 1-4, epic movie, date movie, meet ...",0
3,poor shirley maclaine tries hard to lend some ...,0
4,as a former erasmus student i enjoyed this fil...,1
...,...,...
24995,"feeling minnesota is not really a road movie, ...",0
24996,"this is, without doubt, one of my favourite ho...",1
24997,most predicable movie i've ever seen...extreme...,0
24998,it's exactly what i expected from it. relaxing...,1


Removing stop words

In [18]:
# English stop words from the NLTK `stopwords` corpus (the corpus is
# downloaded in the imports cell). Read by `remove_stopwords` below.
stopwords_list = stopwords.words('english')
def remove_stopwords(_df):
    """Strip stop words from the ``review`` column of ``_df`` in place.

    Each review is split on whitespace, tokens found in the module-level
    ``stopwords_list`` are discarded, and the remaining tokens are re-joined
    with single spaces. Matching is exact and case-sensitive, so punctuation
    attached to a token (e.g. ``"the,"``) prevents a match.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with a ``review`` column of strings. Modified in place.

    Returns
    -------
    None
    """
    # Build the lookup set once per call: membership tests against the raw
    # list are linear in the number of stop words and were repeated for every
    # token of every review. Reading the global here (rather than at import
    # time) keeps later edits to `stopwords_list` effective.
    stop_set = set(stopwords_list)

    def _strip(text):
        return ' '.join(word for word in text.split() if word not in stop_set)

    _df['review'] = _df['review'].apply(_strip)

In [19]:
# Strip stop words from both splits (train first, then test).
for _frame in (df_train, df_test):
    remove_stopwords(_frame)

In [20]:
df_train

Unnamed: 0,review,label
0,absolutely terrible movie. lured christopher w...,0
1,"known fall asleep films, usually due combinati...",0
2,mann photographs alberta rocky mountains super...,0
3,kind film snowy sunday afternoon rest world go...,1
4,"others mentioned, women go nude film mostly ab...",1
...,...,...
24995,"severe problem show, several actually. simple ...",0
24996,"year 1964. ernesto ""che"" guevara, cuban citize...",1
24997,"okay. got back. start review, let tell one thi...",0
24998,saw trailer tv surprised. may 2008 six flags n...,0


In [21]:
df_test

Unnamed: 0,review,label
0,"films make careers. george romero, night livin...",1
1,"blackly comic tale down-trodden priest, nazari...",1
2,"scary movie 1-4, epic movie, date movie, meet ...",0
3,poor shirley maclaine tries hard lend gravitas...,0
4,former erasmus student enjoyed film much. real...,1
...,...,...
24995,"feeling minnesota really road movie, that's st...",0
24996,"is, without doubt, one favourite horror films ...",1
24997,"predicable movie ever seen...extremely boring,...",0
24998,"exactly expected it. relaxing, humorous entert...",1


Finally, let's save the data

In [22]:
def parse_data(_df):
    """Split a review frame into a feature array and a target array.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with ``review`` and ``label`` columns.

    Returns
    -------
    tuple
        ``(x_data, y_data)``: the ``review`` column and the ``label`` column,
        each converted with ``Series.to_numpy()``.
    """
    reviews, labels = (_df[column].to_numpy() for column in ('review', 'label'))
    return reviews, labels

In [23]:
# Convert both splits to (reviews, labels) array pairs: train first, then test.
(x_train, y_train), (x_test, y_test) = map(parse_data, (df_train, df_test))

In [24]:
# Show the feature array followed by the label array, one split at a time.
print(x_train, y_train, sep='\n')

print(x_test, y_test, sep='\n')

["absolutely terrible movie. lured christopher walken michael ironside. great actors, must simply worst role history. even great acting could redeem movie's ridiculous storyline. movie early nineties us propaganda piece. pathetic scenes columbian rebels making cases revolutions. maria conchita alonso appeared phony, pseudo-love affair walken nothing pathetic emotional plug movie devoid real meaning. disappointed movies like this, ruining actor's like christopher walken's good name. could barely sit it."
 'known fall asleep films, usually due combination things including, really tired, warm comfortable sette eaten lot. however occasion fell asleep film rubbish. plot development constant. constantly slow boring. things seemed happen, explanation causing why. admit, may missed part film, watched majority everything seemed happen accord without real concern anything else. cant recommend film all.'
 'mann photographs alberta rocky mountains superb fashion, jimmy stewart walter brennan give 

In [25]:
def save_array(array, save_path):
    """Pickle ``array`` to ``save_path``, creating the parent directory if needed.

    Opening a path such as ``'data/x_train.pkl'`` for writing raises
    ``FileNotFoundError`` when ``data/`` does not exist yet (e.g. on a fresh
    checkout), so the parent directory is created first.

    Parameters
    ----------
    array : object
        Any picklable object.
    save_path : str or os.PathLike
        Destination file. An existing file is overwritten.

    Returns
    -------
    None
    """
    parent_dir = os.path.dirname(save_path)
    # A bare file name has no directory component; there is nothing to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(save_path, 'wb') as file:
        pickle.dump(array, file)

In [26]:
# Persist each array to its own pickle file: train pair first, then test pair.
_outputs = [
    (x_train, 'data/x_train.pkl'),
    (y_train, 'data/y_train.pkl'),
    (x_test, 'data/x_test.pkl'),
    (y_test, 'data/y_test.pkl'),
]
for _array, _path in _outputs:
    save_array(_array, _path)