# NLP - Sentiment Analysis for IMDB Movie Reviews from scratch

## Load Data

In [1]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        print("Using GPU")

2025-10-07 01:03:04.952691: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-07 01:03:04.953101: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-07 01:03:05.072227: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-07 01:03:05.318947: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-07 01:03:11.138526: I external/local_xla/xla/

In [2]:
import os
import pandas as pd
from datasets import load_dataset

In [3]:
PATH_RAW_DATA = 'data/raw.csv'

In [4]:
if not os.path.exists(PATH_RAW_DATA):
    os.makedirs(os.path.dirname(PATH_RAW_DATA), exist_ok=True)
    dataset = load_dataset("imdb", split="train")
    df = pd.DataFrame(dataset)
    df.to_csv(PATH_RAW_DATA, index=False)
    print(f"Raw Data saved to {PATH_RAW_DATA}")
else:
    df = pd.read_csv(PATH_RAW_DATA)
    print(f"Raw Data loaded from {PATH_RAW_DATA}")

Raw Data saved to data/raw.csv


In [5]:
print(df)

                                                    text  label
0      I rented I AM CURIOUS-YELLOW from my video sto...      0
1      "I Am Curious: Yellow" is a risible and preten...      0
2      If only to avoid making this type of film in t...      0
3      This film was probably inspired by Godard's Ma...      0
4      Oh, brother...after hearing about this ridicul...      0
...                                                  ...    ...
24995  A hit at the time but now better categorised a...      1
24996  I love this movie like no other. Another time ...      1
24997  This film and it's sequel Barry Mckenzie holds...      1
24998  'The Adventures Of Barry McKenzie' started lif...      1
24999  The story centers around Barry McKenzie who mu...      1

[25000 rows x 2 columns]


## Preprocess Data

In [6]:
import re
import string
import nltk
import spacy
import contractions
from unidecode import unidecode
from nltk.corpus import stopwords

In [7]:
PATH_PROCESSED_DATA = 'data/processed.csv'

In [8]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def preprocess_text(text, remove_stopwords=True, save_data=False):
    # Normalize encoding
    text = unidecode(text)

    # Lowercase
    text = text.lower()

    # Expand contractions
    text = contractions.fix(text)

    # Remove HTML tags and URLs
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\S+|www\S+", " ", text)

    # Remove punctuation (keep ! and ? as they can carry sentiment)
    text = re.sub(r"[^a-zA-Z0-9!?']", " ", text)

    # Tokenize and Lemmatize using SpaCy
    doc = nlp(text)
    tokens = []
    for token in doc:
        lemma = token.lemma_.strip()
        if not lemma:
            continue
        if remove_stopwords and lemma in stop_words:
            continue
        tokens.append(lemma)

    # Remove extra whitespace and join back
    clean_text = " ".join(tokens)
    clean_text = re.sub(r"\s+", " ", clean_text).strip()
    clean_text = re.sub(r"[!?]{2,}", lambda m: m.group(0)[0], text)
    clean_text = re.sub(r"[^a-z0-9!?']", " ", text)

    if save_data:
        df['text'] = df['text'].apply(lambda x: preprocess_text(x, remove_stopwords))
        df.to_csv(PATH_PROCESSED_DATA, index=False)

    return clean_text

In [11]:
import tqdm

if not os.path.exists(PATH_PROCESSED_DATA):
    os.makedirs(os.path.dirname(PATH_PROCESSED_DATA), exist_ok=True)
    texts = df['text'].tolist()
    # texts = df['text'].tolist()[:20]
    # df = df.iloc[:20].copy()
    # Batch process with tqdm for progress bar
    processed_texts = []
    for doc in tqdm.tqdm(nlp.pipe(texts, batch_size=1000, disable=["parser", "ner"])):
        tokens = [token.lemma_ for token in doc if token.lemma_ and token.lemma_ not in stop_words]
        clean_text = " ".join(tokens)
        processed_texts.append(clean_text)
    df['text'] = processed_texts
    df.to_csv(PATH_PROCESSED_DATA, index=False)
    print(f"Processed Data saved to {PATH_PROCESSED_DATA}")
else:
    df = pd.read_csv(PATH_PROCESSED_DATA)
    print(f"Processed Data loaded from {PATH_PROCESSED_DATA}")


25000it [08:08, 51.22it/s] 


Processed Data saved to data/processed.csv


In [12]:
for i in df.index[:5]:
    print(f"Original: {df.loc[i, 'text']}")
    print(f"Processed: {preprocess_text(df.loc[i, 'text'], remove_stopwords=False)}\n")

Original: I rent I CURIOUS - yellow video store controversy surround first release 1967 . I also hear first seize U.S. custom ever try enter country , therefore fan film consider " controversial " I really see myself.<br /><br />The plot center around young swedish drama student name Lena want learn everything life . particular want focus attention make sort documentary average Swede think certain political issue Vietnam War race issue United States . ask politician ordinary denizen Stockholm opinion politic , sex drama teacher , classmate , marry men.<br /><br />What kill I I CURIOUS - YELLOW 40 year ago , consider pornographic . really , sex nudity scene far , even shoot like cheaply make porno . countryman mind find shocking , reality sex nudity major staple swedish cinema . even Ingmar Bergman , arguably answer good old boy John Ford , sex scene films.<br /><br />i commend filmmaker fact sex show film show artistic purpose rather shock people make money show pornographic theater Am