In [1]:
import pickle
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# CELL 2 - Load Dataset (again, from raw)
# Location of the raw CSV, relative to the notebook's working directory.
raw_csv_path = "../data/raw/IMDB Dataset.csv"
df = pd.read_csv(raw_csv_path)
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# CELL 3 - Encode Sentiment Labels
# negative -> 0, positive -> 1
sentiment_to_label = {'negative': 0, 'positive': 1}
encoded = df['sentiment'].map(sentiment_to_label)

# Series.map yields NaN for any value that is not a key of the dict (and
# turns the column into floats), so fail loudly instead of carrying
# missing labels into the train/validation/test split.
unmapped = df.loc[encoded.isna(), 'sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected sentiment values: {list(unmapped)!r}")

df['label'] = encoded.astype('int64')

df[['sentiment', 'label']].head()


Unnamed: 0,sentiment,label
0,positive,1
1,positive,1
2,positive,1
3,negative,0
4,positive,1


In [7]:
# CELL 4 - Basic Text Cleaning
def clean_text(text):
    """Lowercase a review and strip HTML tags and non-letter characters.

    Steps, in order:
      1. Lowercase the text.
      2. Replace each HTML tag with a space. A tag must start with a letter
         (optionally preceded by ``/``), so plain text such as ``<3`` is not
         mistaken for the start of a tag.
      3. Delete every character that is not a lowercase ASCII letter or
         whitespace.
      4. Collapse runs of whitespace to one space and trim both ends.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Substitute a space, not "", so "end.<br /><br />the" does not fuse
    # into "endthe" once the tag is gone.
    text = re.sub(r"</?[a-z][^>]*>", " ", text)
    # Only lowercase letters can remain after .lower(), so a-z is sufficient.
    text = re.sub(r"[^a-z\s]", "", text)
    # Normalize the extra spaces left behind by removed tags and symbols.
    return " ".join(text.split())

# Run the cleaner over every review and store the result in a new column,
# then preview the raw and cleaned text side by side.
cleaned_reviews = df['review'].map(clean_text)
df['clean_review'] = cleaned_reviews
df.loc[:, ['review', 'clean_review']].head()


Unnamed: 0,review,clean_review
0,One of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love in the time of money is a ...


In [9]:
# CELL 5 - Train / Validation / Test Split
X, y = df['clean_review'], df['label']

# Stage 1: hold out 30% of the rows, keeping the label balance (stratify).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Stage 2: cut the held-out 30% in half -> 15% validation, 15% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Report the three split sizes on one line.
print(*map(len, (X_train, X_val, X_test)))


35000 7500 7500


In [11]:
# CELL 6 - Load BERT Tokenizer
# Checkpoint identifier handed to from_pretrained.
pretrained_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_name)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
# CELL 7 - Tokenize Text for BERT (CORE STEP)
def tokenize_text(texts, max_length=512):
    """Tokenize a batch of texts with the notebook-level ``tokenizer``.

    Args:
        texts: An iterable of strings (for example a pandas Series), or a
            single string, which is treated as a one-element batch.
        max_length: Value forwarded to the tokenizer as ``max_length``;
            together with ``truncation=True`` it caps the encoded length.
            Defaults to 512, the value previously hard-coded here.

    Returns:
        The object returned by the tokenizer call, requested with
        ``padding=True`` and ``return_tensors="pt"``.
    """
    # list("some text") would explode a lone string into single characters,
    # so wrap it instead of iterating over it.
    batch = [texts] if isinstance(texts, str) else list(texts)
    return tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Encode the three splits in order: train, validation, test.
train_encodings, val_encodings, test_encodings = (
    tokenize_text(split) for split in (X_train, X_val, X_test)
)


In [14]:
# CELL 8 - Save Preprocessed Data (VERY IMPORTANT)
# Each split is written to its own pickle file as an (encodings, labels) tuple.
processed_dir = Path("../data/processed")
# open(..., "wb") raises FileNotFoundError when the folder is missing,
# so create it (and any missing parents) before writing.
processed_dir.mkdir(parents=True, exist_ok=True)

splits = {
    "train": (train_encodings, y_train),
    "val": (val_encodings, y_val),
    "test": (test_encodings, y_test),
}

# NOTE: pickle files can execute code when loaded; only load these back
# from a location you trust.
for split_name, payload in splits.items():
    with open(processed_dir / f"{split_name}.pkl", "wb") as f:
        pickle.dump(payload, f)
