## Load needed packages

In [1]:
import polars as pl
from sklearn.model_selection import train_test_split
import re
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [2]:
# Read the IMDB reviews CSV into a polars DataFrame. The path is relative,
# so it is resolved against the notebook's working directory.
df = pl.read_csv("../data/imdb.csv")

In [3]:
# Class-balance check: count the rows carrying each sentiment label.
# value_counts() produces a struct column; unnest() expands it into the
# plain `sentiment` and `count` columns shown in the output.
(
    df.select(pl.col("sentiment").value_counts())
    .unnest("sentiment")
)

sentiment,count
str,u32
"""negative""",25000
"""positive""",25000


In [4]:
# Encode the sentiment labels as integers: "positive" -> 1, "negative" -> 0.
# Both labels are handled by one mapping, and the result is stored as Int8.
df = df.with_columns(
    pl.col("sentiment")
    .replace({"positive": 1, "negative": 0})
    .cast(pl.Int8)
)

In [5]:
# Preview the first rows to confirm the sentiment column is now integer-coded.
df.head()

review,sentiment
str,i8
"""One of the other reviewers has…",1
"""A wonderful little production.…",1
"""I thought this was a wonderful…",1
"""Basically there's a family whe…",0
"""Petter Mattei's ""Love in the T…",1


In [6]:
# Hold out 20% of the reviews for testing and train on the remaining 80%.
# Stratifying on the sentiment column keeps the class proportions the same in
# both splits; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.get_column("review"),
    df.get_column("sentiment"),
    test_size=0.2,
    random_state=42,
    stratify=df.get_column("sentiment"),
)

In [7]:
# Check that the stratified split kept the two classes balanced in the
# training labels.
y_train.value_counts()

sentiment,count
i8,u32
1,20000
0,20000


In [8]:
# Convert each split from a polars Series into a plain Python list.
# All four conversions are evaluated before any name is rebound.
X_train, X_test, y_train, y_test = [
    split.to_list() for split in (X_train, X_test, y_train, y_test)
]

## Preprocessing

### Text cleaning (remove HTML tags, special characters)

In [9]:
def remove_html_tags(text):
    """Remove HTML tags from text, leaving a single space in place of each tag.

    Substituting a space (rather than nothing) keeps the words on either side
    of a tag apart: ``"good.<br /><br />bad"`` becomes ``"good.  bad"``
    instead of ``"good.bad"``.

    Args:
        text: String that may contain HTML tags.

    Returns:
        The input with every ``<...>`` span replaced by one space.
    """
    # `.*?` is non-greedy, so each match stops at the first closing `>` and
    # the text between two separate tags is preserved. `.` does not match a
    # newline here, so a tag split across lines is left untouched.
    tag_pattern = re.compile("<.*?>")
    return tag_pattern.sub(" ", text)

In [10]:
def remove_special_characters(text):
    """Replace special characters with spaces and lowercase the text.

    ASCII letters, digits, whitespace and the punctuation marks
    ``. , ! ? " '`` are kept. Every other character is replaced by a single
    space, so the length of the text does not change.
    """
    # Step 1: blank out anything outside the allowed character set.
    blanked = re.sub(r"[^a-zA-Z0-9\s.,!?\"']", " ", text)
    # Step 2: normalise the casing.
    return blanked.lower()

In [11]:
def clean_text(text):
    """Run the full cleaning pipeline on one piece of text.

    HTML tags are removed first, then special characters are replaced and
    the text is lowercased by ``remove_special_characters``.
    """
    # Tags must be handled first: the special-character step replaces `<`
    # and `>` with spaces, after which tags could no longer be recognised.
    return remove_special_characters(remove_html_tags(text))

In [12]:
# Apply the cleaning pipeline to every review in both splits.
X_train = list(map(clean_text, X_train))
X_test = list(map(clean_text, X_test))

### Tokenisation

In [None]:
# Load the pretrained "bert-base-uncased" tokenizer.
# The reviews were already lowercased during cleaning, so the tokenizer's own
# lowercasing step is switched off with `do_lower_case=False`.
# NOTE(review): with lowercasing disabled, text that bypasses clean_text would
# keep its original casing when tokenized. Confirm every input is cleaned first.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=False,
)

In [None]:
# Ids of the tokenizer's special tokens, kept as constants for later steps.
PAD_TOKEN_ID = tokenizer.pad_token_id  # padding token
UNK_TOKEN_ID = tokenizer.unk_token_id  # unknown token
CLS_TOKEN_ID = tokenizer.cls_token_id  # [CLS] token for classification tasks
SEP_TOKEN_ID = tokenizer.sep_token_id  # [SEP] token to separate sequences

# Size of the tokenizer's vocabulary.
VOCAB_SIZE = tokenizer.vocab_size

In [15]:
# Display the vocabulary size and the special-token ids as a quick sanity check.
VOCAB_SIZE, UNK_TOKEN_ID, PAD_TOKEN_ID, CLS_TOKEN_ID, SEP_TOKEN_ID

(30522, 100, 0, 101, 102)

### Vocabulary building (most frequent words)

### Sequence padding/truncation

### Converting to tensors