In [1]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("ajaykarthick/imdb-movie-reviews")

In [2]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 10000
    })
})


In [3]:
for i in range(5):
    print(f"Example {i}")
    print("Label:", ds["train"][i]["label"])
    print("Review preview:")
    print(ds["train"][i]["review"])
    print("----")



Example 0
Label: 0
Review preview:
Ms Aparna Sen, the maker of Mr & Mrs Iyer, directs this movie about a young girl's struggle to cope with her debilitating condition.<br /><br />Meethi (Konkona Sen) has been an aloof kid ever since childhood and has shown signs of delusion, no one knows why. The dormant tendency however slips out of control, when the job assignment takes her to neighboring Bihar where she's raped by some political goons. The resulting trauma also leads to episodes of manic-depressive psychosis in addition to her schizophrenia. She careens out of control over the years, progressively getting worse and sinking deeper into her private 'world'.<br /><br />The juxtaposition of an 'unsettled' (divorced) elder sister and how her domineering ways make an already bad situation worse, is indicative of what a fine line there is between abnormal and *seemingly normal*. Ms Sen also makes an excellent commentary on the social alienation of such individuals. Social rehab is standard

In [4]:
#check label distribution(dataset is balanced or not)
from collections import Counter

labels = ds["train"]["label"]
Counter(labels)


Counter({0: 20000, 1: 20000})

In [5]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)  # remove HTML tags
    text = re.sub(r"[^a-z0-9.,!?'\s]", "", text)  # keep letters, numbers, and .,!?'
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [9]:
for i in range(5):
    raw = ds["train"][i]["review"]
    cleaned = clean_text(raw)
    print(f" Original:\n{raw}\n")
    print(f" Cleaned:\n{cleaned}\n")
    print("="*80)


 Original:
Ms Aparna Sen, the maker of Mr & Mrs Iyer, directs this movie about a young girl's struggle to cope with her debilitating condition.<br /><br />Meethi (Konkona Sen) has been an aloof kid ever since childhood and has shown signs of delusion, no one knows why. The dormant tendency however slips out of control, when the job assignment takes her to neighboring Bihar where she's raped by some political goons. The resulting trauma also leads to episodes of manic-depressive psychosis in addition to her schizophrenia. She careens out of control over the years, progressively getting worse and sinking deeper into her private 'world'.<br /><br />The juxtaposition of an 'unsettled' (divorced) elder sister and how her domineering ways make an already bad situation worse, is indicative of what a fine line there is between abnormal and *seemingly normal*. Ms Sen also makes an excellent commentary on the social alienation of such individuals. Social rehab is standard therapy along with all 

In [10]:
ds = ds.map(lambda x: {"review": clean_text(x["review"])})


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [18]:
import pandas as pd

# Prepare DataFrames
train_df = pd.DataFrame({
    "review": ds["train"]["review"],
    "label": ds["train"]["label"]
})

test_df = pd.DataFrame({
    "review": ds["test"]["review"],
    "label": ds["test"]["label"]
})

# Save CSV files
train_df.to_csv("train_cleaned.csv", index=False)
test_df.to_csv("test_cleaned.csv", index=False)


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer



tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(ds["train"]["review"])

print("Total unique words in training set:", len(tokenizer.word_index))


Total unique words in training set: 156526


In [14]:
# Collect all the training reviews
train_texts = ds["train"]["review"]

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=30000, oov_token="<OOV>")  
# num_words: how many most frequent words to keep
# oov_token: for unseen words (Out-Of-Vocabulary)

# Fit the tokenizer on the training texts
tokenizer.fit_on_texts(train_texts)


In [15]:
train_sequences = tokenizer.texts_to_sequences(ds["train"]["review"])
test_sequences = tokenizer.texts_to_sequences(ds["test"]["review"])


In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 200
x_train = pad_sequences(train_sequences, maxlen=maxlen, padding="post", truncating="post")
x_test = pad_sequences(test_sequences, maxlen=maxlen, padding="post", truncating="post")


In [17]:
for i in range(5):
    print(f"Clean review #{i+1}:")
    print(ds["train"][i]["review"])   # still a string
    print("Tokenized vector (first 20 tokens):")
    print(x_train[i][:20])            # numpy array of integers
    print("="*80)


Clean review #1:
ms aparna sen, the maker of mr mrs iyer, directs this movie about a young girl's struggle to cope with her debilitating condition. meethi konkona sen has been an aloof kid ever since childhood and has shown signs of delusion, no one knows why. the dormant tendency however slips out of control, when the job assignment takes her to neighboring bihar where she's raped by some political goons. the resulting trauma also leads to episodes of manicdepressive psychosis in addition to her schizophrenia. she careens out of control over the years, progressively getting worse and sinking deeper into her private 'world'. the juxtaposition of an 'unsettled' divorced elder sister and how her domineering ways make an already bad situation worse, is indicative of what a fine line there is between abnormal and seemingly normal. ms sen also makes an excellent commentary on the social alienation of such individuals. social rehab is standard therapy along with all the deadly mindaltering d