1. Tokenization Model - Word Tokenizer

In [1]:
import pandas as pd
import re
from collections import Counter

# Load dataset
# Replace 'path_to_dataset.csv' with the actual path to the IMDb dataset on your system.
df = pd.read_csv("/kaggle/input/imdb-movie-reviews/IMDB Dataset.csv")

# Inspect the dataset
print(df.head())

# Assume the dataset has a column named 'review' with textual data.
# Preprocessing function
def preprocess_text(text):
    """
    Clean and preprocess text.
    - Lowercase conversion
    - Remove special characters and punctuation
    - Remove extra whitespace
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespace
    return text

# Apply preprocessing to the dataset
df["cleaned_review"] = df["review"].apply(preprocess_text)

# Tokenization function
def tokenize_text(text):
    """
    Split text into tokens (words).
    """
    return text.split()

# Tokenize the cleaned reviews
df["tokens"] = df["cleaned_review"].apply(tokenize_text)

# Build Vocabulary
def build_vocabulary(tokenized_texts):
    """
    Build a vocabulary from tokenized texts.
    """
    # Flatten the list of tokens
    all_tokens = [token for tokens in tokenized_texts for token in tokens]
    # Count frequency of each token
    token_counts = Counter(all_tokens)
    # Assign a unique ID to each token (starting from 1, reserve 0 for padding)
    vocabulary = {token: idx for idx, (token, _) in enumerate(token_counts.items(), start=1)}
    return vocabulary

# Create vocabulary from all tokenized reviews
vocabulary = build_vocabulary(df["tokens"])

# Display vocabulary size
print(f"Vocabulary size: {len(vocabulary)}")

# Token-to-ID mapping function
def tokens_to_ids(tokens, vocab):
    """
    Convert tokens to their corresponding IDs based on the vocabulary.
    """
    return [vocab[token] for token in tokens if token in vocab]

# Map tokens to IDs
df["token_ids"] = df["tokens"].apply(lambda tokens: tokens_to_ids(tokens, vocabulary))

# Inspect the processed data
print(df[["review", "cleaned_review", "tokens", "token_ids"]].head())

# Save the tokenizer and vocabulary for reuse
import pickle

with open("tokenizer_vocab.pkl", "wb") as f:
    pickle.dump(vocabulary, f)

print("Tokenizer and vocabulary saved!")


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Vocabulary size: 181066
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production br br the filmin...   
2  i thought this was a wonderful way to spend ti...   
3  basically theres a family where a 

In [2]:
import pandas as pd

# Convert vocabulary to a DataFrame
vocab_df = pd.DataFrame(list(vocabulary.items()), columns=["Word", "ID"])

# Save to CSV
vocab_df.to_csv("vocabulary.csv", index=False)

print("Vocabulary saved to 'vocabulary.csv'.")


Vocabulary saved to 'vocabulary.csv'.
