# 1. Import Libraries

In [19]:
import os
import glob
from typing import List, Tuple, Dict, Any
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from datasets import Dataset
import evaluate
from sklearn.metrics import classification_report, accuracy_score

# 2. Large Dataset Prep and Tokenization

### Data Loading

In [4]:
def load_and_structure_imdb(base_path: str) -> pd.DataFrame:
    data: List[Dict[str, Any]] = []
    for polarity in ['pos', 'neg']:
        label = 1 if polarity == 'pos' else 0
        folder_path = os.path.join(base_path, polarity)
        for file_path in glob.glob(os.path.join(folder_path, '*.txt')):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    review_text = f.read()
                    data.append({'review': review_text, 'sentiment': label})
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")
    df = pd.DataFrame(data)
    df = df.sample(frac=1).reset_index(drop=True)

    return df

TRAIN_PATH = '/content/aclImdb/train'
TEST_PATH = '/content/aclImdb/test'

print("Loading and structuring training data...")
train_df = load_and_structure_imdb(TRAIN_PATH)

print("Loading and structuring test data...")
test_df = load_and_structure_imdb(TEST_PATH)

print("\n--- Training Data Check ---")
print(train_df.head())
print(f"\nTraining set size: {len(train_df)} reviews.")
print(f"Training Sentiment Counts:\n{train_df['sentiment'].value_counts()}")

print("\n--- Test Data Check ---")
print(f"\nTest set size: {len(test_df)} reviews.")
print(f"Test Sentiment Counts:\n{test_df['sentiment'].value_counts()}")

Loading and structuring training data...
Loading and structuring test data...

--- Training Data Check ---
                                              review  sentiment
0  I had high expectations following "My Beautifu...          0
1  In 1942 a film TALES OF MANHATTAN told a set o...          1
2  Having the opportunity to watch some of the fi...          1
3  **** = A masterpiece to be recorded in the boo...          1
4  Election is a Chinese mob movie, or triads in ...          1

Training set size: 25000 reviews.
Training Sentiment Counts:
sentiment
0    12500
1    12500
Name: count, dtype: int64

--- Test Data Check ---

Test set size: 25000 reviews.
Test Sentiment Counts:
sentiment
0    12500
1    12500
Name: count, dtype: int64


### Define Model

In [5]:
MODEL_NAME = "distilbert-base-uncased"

###  Convert df to hugging face dataset format and objects

In [6]:
train_df_hf = train_df.rename(columns={'review': 'text', 'sentiment': 'label'})
test_df_hf = test_df.rename(columns={'review': 'text', 'sentiment': 'label'})

In [7]:
train_dataset = Dataset.from_pandas(train_df_hf)
test_dataset = Dataset.from_pandas(test_df_hf)

###  Load and apply Tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

### Final Preparation

In [10]:
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

print("--- Final Dataset Preparation Complete ---")
print(f"Tokenized Training Data Columns: {tokenized_train.column_names}")
print(f"Tokenized Test Data Columns: {tokenized_test.column_names}")

--- Final Dataset Preparation Complete ---
Tokenized Training Data Columns: ['label', 'input_ids', 'attention_mask']
Tokenized Test Data Columns: ['label', 'input_ids', 'attention_mask']


# 3. Small Dataset Training

**This block sets up and trains the model on the 1250 subset.**

In [23]:

SMALL_SIZE = 1250

print(f"Selecting {SMALL_SIZE} samples for the Small Dataset Scenario...")
tokenized_train_small = tokenized_train.select(range(SMALL_SIZE))
tokenized_test_small = tokenized_test.select(range(SMALL_SIZE))

# Load a model instance
model_small = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Training arguments for small set (more epochs)
training_args_small = TrainingArguments(
    output_dir="./results_small",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_dir='./logs_small',
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)

trainer_small = Trainer(
    model=model_small,
    args=training_args_small,
    train_dataset=tokenized_train_small,
    eval_dataset=tokenized_test_small,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("\n--- Starting Small Dataset Fine-Tuning ---")
trainer_small.train()

final_results_small = trainer_small.evaluate(tokenized_test_small)

print("\n--- Small Model Evaluation Results ---")
print(final_results_small)

Selecting 1250 samples for the Small Dataset Scenario...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_small = Trainer(



--- Starting Small Dataset Fine-Tuning ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5754,0.34693,0.8528
2,0.2282,0.32241,0.8776
3,0.1257,0.39736,0.872
4,0.0552,0.425934,0.8848
5,0.0678,0.435396,0.8872



--- Small Model Evaluation Results ---
{'eval_loss': 0.43539613485336304, 'eval_accuracy': 0.8872, 'eval_runtime': 18.1748, 'eval_samples_per_second': 68.777, 'eval_steps_per_second': 4.347, 'epoch': 5.0}


# 4. Large Dataset Model Training

**This block trains the model on the full dataset.**

### Load the model

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Define metrics

In [12]:
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)


Downloading builder script: 0.00B [00:00, ?B/s]

### Set Training Arguments

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

### Initialize the trainer

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


### Train the model

In [15]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcharmaine-mawande[0m ([33mcharmaine-mawande-iu-international-university-of-applied[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2239,0.194609,0.92596
2,0.1627,0.238055,0.9312
3,0.0837,0.272803,0.93304


TrainOutput(global_step=4689, training_loss=0.1685686695166401, metrics={'train_runtime': 5096.198, 'train_samples_per_second': 14.717, 'train_steps_per_second': 0.92, 'total_flos': 9935054899200000.0, 'train_loss': 0.1685686695166401, 'epoch': 3.0})

#5. Final Evaluation and Inference

In [17]:
# 1. Get predictions from the trainer on the test set
predictions = trainer.predict(tokenized_test)
logits = predictions.predictions
true_labels = predictions.label_ids

# 2. Convert raw logits to the final class prediction (0 or 1)
predicted_labels = np.argmax(logits, axis=1)

# 3. Generate Classification Report
# Target names are defined by the labels (0: Negative, 1: Positive)
target_names = ['Negative (0)', 'Positive (1)']
report = classification_report(true_labels, predicted_labels, target_names=target_names, digits=4)

print("\n--- Model Evaluation: Detailed Classification Report ---")
print(report)
print(f"Overall Accuracy: {accuracy_score(true_labels, predicted_labels):.4f}")


--- Model Evaluation: Detailed Classification Report ---
              precision    recall  f1-score   support

Negative (0)     0.9391    0.9110    0.9248     12500
Positive (1)     0.9136    0.9410    0.9271     12500

    accuracy                         0.9260     25000
   macro avg     0.9263    0.9260    0.9259     25000
weighted avg     0.9263    0.9260    0.9259     25000

Overall Accuracy: 0.9260


In [22]:
# FINAL INFERENCE DEMO
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    function_to_apply="none"
)

new_reviews = [
    "The cinematography was breathtaking, the performances were deeply moving, and the film deserves every award.",
    "I walked out halfway through. The plot was slow, the dialogue was cringeworthy, and the ending was predictable garbage.",
    "It's a decent watch on a rainy Sunday, nothing innovative, but certainly not a waste of two hours.",
]

print("\n--- Final Model Inference on Unseen Reviews ---")
results = sentiment_pipeline(new_reviews)

for review, result in zip(new_reviews, results):
    # CORRECTED: Parse the integer index from the string 'LABEL_X'
    predicted_index = int(result['label'].split('_')[-1])

    # Map the index back to the required sentiment classification
    final_sentiment = "Positive" if predicted_index == 1 else "Negative"

    print(f"\nReview: '{review[:70]}...'")
    print(f"-> Classified Sentiment: {final_sentiment}")
    print(f"   Raw Logit Scores: {result['score']}")

Device set to use cuda:0



--- Final Model Inference on Unseen Reviews ---

Review: 'The cinematography was breathtaking, the performances were deeply movi...'
-> Classified Sentiment: Positive
   Raw Logit Scores: 2.4924569129943848

Review: 'I walked out halfway through. The plot was slow, the dialogue was crin...'
-> Classified Sentiment: Negative
   Raw Logit Scores: 2.648979425430298

Review: 'It's a decent watch on a rainy Sunday, nothing innovative, but certain...'
-> Classified Sentiment: Positive
   Raw Logit Scores: 0.29300954937934875
