# Loading Dataset

In [2]:
import kagglehub
import pandas as pd

# Fetch the latest version of the IMDB 50k reviews dataset through kagglehub
# and report the directory the files were placed in.
DATASET_HANDLE = "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

In [4]:
# Read the CSV from the directory kagglehub returned in `path` instead of a
# hard-coded Kaggle mount point, so the notebook also runs outside Kaggle.
df = pd.read_csv(f"{path}/IMDB Dataset.csv")

# Encode sentiment (positive -> 1, negative -> 0)
df['label'] = (df['sentiment'] == 'positive').astype(int)

# Keep only the review text and the encoded label; the raw sentiment column
# is redundant once it has been encoded.
df = df[['review', 'label']]

# Split the data into training, validation, and testing sets.
# First hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Halve the held-out rows into validation and test (10% / 10% of the original data)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Sanity check: the three splits together must account for every row
assert len(train_df) + len(val_df) + len(test_df) == len(df)

print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)

# Display a few examples from the training set
print("\nSample from training set:")
print(train_df.head())

Training set shape: (40000, 2)
Validation set shape: (5000, 2)
Test set shape: (5000, 2)

Sample from training set:
                                                  review  label
39087  That's what I kept asking myself during the ma...      0
30893  I did not watch the entire movie. I could not ...      0
45278  A touching love story reminiscent of In the M...      1
16398  This latter-day Fulci schlocker is a totally a...      0
13653  First of all, I firmly believe that Norwegian ...      0


In [5]:
# Load the tokenizer published with the 'distilbert-base-uncased' checkpoint;
# the same checkpoint name is used for the model further down.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
class IMDBDataset(Dataset):
    """Pre-tokenized review dataset that yields dict-style examples.

    Every review is tokenized once, at construction time, with the
    notebook-level ``tokenizer``; ``__getitem__`` then only indexes into the
    stored tensors.

    Args:
        reviews: Review texts as a pandas Series, NumPy array, list or any
            other iterable of strings.
        labels: Integer class labels aligned one-to-one with ``reviews``
            (pandas Series, NumPy array or list).
        max_length: Maximum number of tokens kept per review; longer reviews
            are truncated.

    Raises:
        ValueError: If ``reviews`` and ``labels`` do not have the same length.
    """

    def __init__(self, reviews, labels, max_length=256):
        # Accept pandas/NumPy containers as well as plain Python sequences
        texts = reviews.tolist() if hasattr(reviews, "tolist") else list(reviews)
        label_values = labels.to_numpy() if hasattr(labels, "to_numpy") else labels

        # Fail fast, before the expensive tokenization, if the inputs are misaligned
        if len(texts) != len(label_values):
            raise ValueError(
                f"reviews and labels must have the same length, "
                f"got {len(texts)} and {len(label_values)}"
            )

        # Tokenize all reviews at once
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )
        # Convert labels to a tensor (ensuring they are of type long)
        self.labels = torch.tensor(label_values, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the encoded fields for example ``idx`` plus its label under 'labels'."""
        # For each key in the encodings dictionary, get the corresponding element for idx
        item = {key: val[idx] for key, val in self.encodings.items()}
        # Add the label for the current index
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)

# Build one IMDBDataset per split, in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    IMDBDataset(split_df['review'], split_df['label'])
    for split_df in (train_df, val_df, test_df)
)

In [9]:
# Show every field the dataset yields for the first training review
first_example = train_dataset[0]
print("\nFirst tokenized training example:")
for field, tensor in first_example.items():
    print(f"{field}: {tensor}")

# Map the token ids back to text to get a human-readable view of the input
print("\nDecoded tokens from the first training example:")
decoded_text = tokenizer.decode(first_example['input_ids'])
print(decoded_text)


First tokenized training example:
input_ids: tensor([  101,  2008,  1005,  1055,  2054,  1045,  2921,  4851,  2870,  2076,
         1996,  2116,  9590,  1010,  7491,  3503,  1010, 25082,  1998,  2236,
        26865,  2008,  2566,  4168,  3686,  1996,  6391,  2781,  1012,  1996,
        18539,  2036,  3233,  2039,  2043,  2017,  2228,  1997,  1996,  2028,
         1011,  8789,  3494,  1010,  2040,  2031,  2061,  2210,  5995,  2008,
         2009,  2003,  8990,  5263,  2000,  2729,  2054,  6433,  2000,  2068,
         1012,  2027,  2024,  2074,  6649,  2517, 22330, 27921,  2015,  2005,
         1996,  2472,  2000,  6865,  2010, 27135,  9029,  2006,  1010,  1037,
         8476,  2008,  2038,  2042,  2589,  2172,  2488,  1999,  2060, 16547,
         2119,  2006,  2694,  1998,  1996,  5988,  1012,  1026,  7987,  1013,
         1028,  1026,  7987,  1013,  1028,  1045,  2442, 18766,  1010,  1045,
         1005,  1049,  2025,  2428,  2028,  2005, 27963,  2919,  4616,  2076,
         1037,  21

In [10]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [12]:
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# ---------------------------
# Step 1: Load the Pre-Trained Model
# ---------------------------
# Human-readable class names, matching the encoding of the `label` column
# (negative -> 0, positive -> 1). Storing them in the model config lets
# inference report "negative"/"positive" instead of "LABEL_0"/"LABEL_1".
id2label = {0: "negative", 1: "positive"}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,  # Because we have two classes (positive and negative)
    id2label=id2label,
    label2id=label2id,
)

# ---------------------------
# Step 2: Define the Evaluation Metrics
# ---------------------------
# Fetch the four metric objects from the evaluate library in one pass;
# the unpacking order matches the order of the names below.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [13]:
def compute_metrics(eval_pred):
    """Score a batch of predictions with accuracy, precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict: The keys "accuracy", "precision", "recall" and "f1", each mapped
        to the value reported by the corresponding metric object.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    predicted_classes = np.argmax(logits, axis=-1)

    # (result key, metric object) pairs, evaluated in this order
    scorers = (
        ("accuracy", accuracy_metric),
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    return {
        key: scorer.compute(predictions=predicted_classes, references=labels)[key]
        for key, scorer in scorers
    }

# ---------------------------
# Step 3: Set Up Training Arguments
# ---------------------------
# NOTE(review): some transformers releases spell `evaluation_strategy` as
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    # Optimisation schedule
    learning_rate=5e-5,
    num_train_epochs=2,
    # Batch sizes per device (training can go to 32 if the hardware allows)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate, log and checkpoint once at the end of every epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Checkpoints are written here; no external tracker (e.g. wandb) is used
    output_dir='./results',
    report_to="none",
)



In [14]:
# ---------------------------
# Step 4: Build the Trainer
# ---------------------------
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,  # metrics reported at each evaluation
    eval_dataset=val_dataset,         # validation split
    train_dataset=train_dataset,      # training split
)

# ---------------------------
# Step 5: Fine-Tune the Model
# ---------------------------
print("Starting training...")
trainer.train()
print("Training complete!")

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2796,0.251944,0.907,0.929899,0.880448,0.904498
2,0.1406,0.269572,0.92,0.917031,0.923631,0.920319


Training complete!


In [15]:
!pip install transformers datasets huggingface_hub
from huggingface_hub import notebook_login



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
# Login to Hugging Face
# Renders the Hub login widget shown in the cell output; presumably required
# before the uploads to the Hub further down - confirm for your environment.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import notebook_login

# Directory that receives the local copy of the fine-tuned artifacts
model_save_path = ".my-finetuned-model"

# Write the model (with safe_serialization disabled) and the tokenizer files
# into the same directory.
trainer.model.save_pretrained(model_save_path, safe_serialization=False)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved locally in '{model_save_path}'.")

# Publish through the trainer; the Hub login is expected to have been done
# in an earlier cell.
trainer.push_to_hub(
    commit_message="Initial commit: Fine-tuned DistilBERT on IMDB dataset",
)

print("Model has been pushed to the Hugging Face Hub.")


Model and tokenizer saved locally in '.my-finetuned-model'.


No files have been modified since last commit. Skipping to prevent empty commit.


Model has been pushed to the Hugging Face Hub.


In [36]:
from huggingface_hub import HfApi

# Target model repository on the Hub (update with your actual model repo)
model_id = "arashghsz/results"

api = HfApi()
# Upload the whole local save directory to that repository
api.upload_folder(
    repo_type="model",
    repo_id=model_id,
    folder_path=".my-finetuned-model",
)

print("Tokenizer has been pushed to the Hugging Face Hub.")

No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer has been pushed to the Hugging Face Hub.


In [37]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Hub repository to load for inference (update with your model repo ID)
model_id = "arashghsz/my-finetuned-model"

# Pull the fine-tuned model and its tokenizer from the Hub
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Wrap both in a text-classification pipeline
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

# Smoke-test the pipeline on a single sentence
test_text = "This movie was fantastic!"
result = classifier(test_text)

print(result)  # A list holding one dict with the predicted 'label' and its 'score'


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9971451163291931}]


In [3]:
import os

import requests

API_URL = "https://api-inference.huggingface.co/models/arashghsz/my-finetuned-model"
# Never paste the token into the notebook: read it from the HF_TOKEN
# environment variable so it cannot leak through the saved file.
API_TOKEN = os.environ.get("HF_TOKEN", "")

# Only send an Authorization header when a token is actually available
headers = {"Authorization": f"Bearer {API_TOKEN}"} if API_TOKEN else {}
data = {"inputs": "This movie was amazing!"}

# The timeout keeps the cell from hanging indefinitely if the endpoint stalls
response = requests.post(API_URL, headers=headers, json=data, timeout=30)
print(response.json())  # Should return model predictions


[[{'label': 'LABEL_1', 'score': 0.9972766041755676}, {'label': 'LABEL_0', 'score': 0.0027233967557549477}]]


### Fine-Tuned DistilBERT Model

The model is available on the Hugging Face Hub, together with a YouTube demo and the source repository:
- 🔗 [Hugging Face Model Link](https://huggingface.co/arashghsz/my-finetuned-model)
- [Demo on YouTube](https://www.youtube.com/watch?v=uJmQum-qjiQ)
- [GitHub repo](https://github.com/Arashghsz/Sentiment-Analysis-System)
