Install Dependencies

In [1]:
pip install transformers datasets torch pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


Load Dataset

In [2]:
import pandas as pd
from transformers import BertTokenizer

# Load dataset
df = pd.read_csv("D:\\data science QT\\notes\\DataSets\\IMDB Dataset.csv\\IMDB Dataset.csv")

# Tokenize text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer(df['review'][0], truncation=True, padding=True, max_length=512)
print(tokens)


  from .autonotebook import tqdm as notebook_tqdm


{'input_ids': [101, 2028, 1997, 1996, 2060, 15814, 2038, 3855, 2008, 2044, 3666, 2074, 1015, 11472, 2792, 2017, 1005, 2222, 2022, 13322, 1012, 2027, 2024, 2157, 1010, 2004, 2023, 2003, 3599, 2054, 3047, 2007, 2033, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 2034, 2518, 2008, 4930, 2033, 2055, 11472, 2001, 2049, 24083, 1998, 4895, 10258, 2378, 8450, 5019, 1997, 4808, 1010, 2029, 2275, 1999, 2157, 2013, 1996, 2773, 2175, 1012, 3404, 2033, 1010, 2023, 2003, 2025, 1037, 2265, 2005, 1996, 8143, 18627, 2030, 5199, 3593, 1012, 2023, 2265, 8005, 2053, 17957, 2007, 12362, 2000, 5850, 1010, 3348, 2030, 4808, 1012, 2049, 2003, 13076, 1010, 1999, 1996, 4438, 2224, 1997, 1996, 2773, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2009, 2003, 2170, 11472, 2004, 2008, 2003, 1996, 8367, 2445, 2000, 1996, 17411, 4555, 3036, 2110, 7279, 4221, 12380, 2854, 1012, 2009, 7679, 3701, 2006, 14110, 2103, 1010, 2019, 6388, 2930, 1997, 1996, 3827, 2073, 2035, 1996, 4442, 2031, 3221, 21430,

Train BERT Sentiment Classifier

In [3]:
pip install --upgrade transformers[torch] accelerate





In [4]:
print(df.columns)


Index(['review', 'sentiment'], dtype='object')


In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load dataset (Assuming df is already defined)
print("Original DataFrame columns:", df.columns)  # Debugging step

# Convert Pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Check dataset column names
print("Dataset column names:", dataset.column_names)  # Debugging step

# Rename 'sentiment' to 'labels' for training
if "sentiment" in dataset.column_names:
    dataset = dataset.rename_column("sentiment", "labels")

# Convert labels to integers if they are categorical (e.g., "positive", "neutral", "negative")
if isinstance(dataset["labels"][0], str):
    label_mapping = {"negative": 0, "neutral": 1, "positive": 2}
    dataset = dataset.map(lambda x: {"labels": label_mapping[x["labels"]]})

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True)

# Apply tokenization
dataset = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns (keep only tokenized data and labels)
columns_to_remove = ["review"]  # Only remove 'review' since '__index_level_0__' doesn't exist
for col in columns_to_remove:
    if col in dataset.column_names:
        dataset = dataset.remove_columns(col)

# Ensure dataset has enough rows before splitting
num_samples = len(dataset)
if num_samples < 40000:
    raise ValueError(f"Dataset has only {num_samples} samples. Ensure it has at least 40,000 for splitting.")

# Split dataset into train and eval sets
train_dataset = dataset.select(range(40000))
eval_dataset = dataset.select(range(40000, num_samples))

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train model
trainer.train()



Original DataFrame columns: Index(['review', 'sentiment'], dtype='object')
Dataset column names: ['review', 'sentiment']


Map: 100%|██████████| 50000/50000 [00:09<00:00, 5529.94 examples/s]
Map: 100%|██████████| 50000/50000 [08:52<00:00, 93.93 examples/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 