                                                   TEXT CLASSIFICATION USING DISTILL - BERT MODEL

 Load and Preprocess Excel Dataset

In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split

# Load the Excel file
df = pd.read_excel("/content/output_dataset.xlsx")
df.columns = ["label", "text"]  # Rename the two columns properly

# Preview
print(df.head())


# Preview
print(df.head())

# Replace 'text' and 'label' below with actual column names if different
texts = df['text'].astype(str).tolist()
labels = df['label'].tolist()

# Train/validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


   label                                               text
0      0  apparently reassembled from the cutting-room f...
1      0  they presume their audience wo n't sit still f...
2      1  this is a visually stunning rumination on love...
3      1  jonathan parker 's bartleby should have been t...
4      1  campanella gets the tone just right -- funny i...
   label                                               text
0      0  apparently reassembled from the cutting-room f...
1      0  they presume their audience wo n't sit still f...
2      1  this is a visually stunning rumination on love...
3      1  jonathan parker 's bartleby should have been t...
4      1  campanella gets the tone just right -- funny i...


 Dataset & Dataloader (PyTorch)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Custom dataset class
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            key: torch.tensor(val[idx]) for key, val in self.encodings.items()
        } | {"labels": torch.tensor(self.labels[idx])}

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


Model and Training Loop

In [None]:
from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW # Import AdamW from torch.optim
from tqdm import tqdm

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2  # Binary classification
)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=loss.item())

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 346/346 [00:39<00:00,  8.71it/s, loss=0.123]
Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 346/346 [00:41<00:00,  8.35it/s, loss=0.0714]
Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 346/346 [00:41<00:00,  8.40it/s, loss=0.0246]


 Evaluation (Accuracy + Report)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

print("Accuracy:", accuracy_score(true_labels, predictions))
print("\nClassification Report:\n", classification_report(true_labels, predictions))


Accuracy: 0.8908959537572254

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.89      0.89       653
           1       0.90      0.89      0.90       731

    accuracy                           0.89      1384
   macro avg       0.89      0.89      0.89      1384
weighted avg       0.89      0.89      0.89      1384



Save Model

In [None]:
model.save_pretrained("distilbert_model")
tokenizer.save_pretrained("distilbert_model")


('distilbert_model/tokenizer_config.json',
 'distilbert_model/special_tokens_map.json',
 'distilbert_model/vocab.txt',
 'distilbert_model/added_tokens.json',
 'distilbert_model/tokenizer.json')

Load and preprocess the test dataset

In [None]:
# Load test data using pandas' Excel reader
test_df = pd.read_excel("/content/test_dataset.xlsx")
# Assign column names to test_df instead of df
test_df.columns = ["label", "text"]

# Preview
print(test_df.head())

# Tokenize test text
test_texts = test_df['text'].astype(str).tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# If test labels are available
test_labels = test_df['label'].tolist() if 'label' in test_df.columns else None

   label                                               text
0      0  a gob of drivel so sickly sweet , even the eag...
1      0  gangs of new york is an unapologetic mess , wh...
2      0  we never really feel involved with the story ,...
3      1            this is one of polanski 's best films .
4      1  take care of my cat offers a refreshingly diff...


Create Test Dataset and Loader

In [None]:
# Dataset class for test
class TestDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

# Create dataset and dataloader
test_dataset = TestDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)


Predict on Test Set

In [None]:
model.eval()
test_preds = []
true_test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch.get("labels")

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        test_preds.extend(preds.cpu().numpy())

        if labels is not None:
            true_test_labels.extend(labels.numpy())

# If labels exist, show performance
if test_labels is not None:
    print("Test Accuracy:", accuracy_score(true_test_labels, test_preds))
    print("\nTest Classification Report:\n", classification_report(true_test_labels, test_preds))
else:
    print("Predictions:", test_preds)


Test Accuracy: 0.8846153846153846

Test Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.88       911
           1       0.87      0.90      0.89       909

    accuracy                           0.88      1820
   macro avg       0.88      0.88      0.88      1820
weighted avg       0.88      0.88      0.88      1820



 Predict on a Single Custom Input

In [None]:
def predict_text(text):
    model.eval()

    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    sentiment = "Positive" if predicted_class == 1 else "Negative"

    print(f"Text: {text}")
    print(f"Predicted Label: {predicted_class} ({sentiment})")
    return predicted_class, sentiment

# ‚ûï Example: Positive
predict_text("This movie was absolutely amazing, I loved it!")

# ‚ûñ Example: Negative
predict_text("The movie was incredibly boring and a complete waste of time.")


Text: This movie was absolutely amazing, I loved it!
Predicted Label: 1 (Positive)
Text: The movie was incredibly boring and a complete waste of time.
Predicted Label: 0 (Negative)


(0, 'Negative')

Save the Trained Model & Tokenizer as pkl file

In [None]:
import joblib
import os

# Create a directory to save model
model_dir = "saved_model"
os.makedirs(model_dir, exist_ok=True)

# Save the model and tokenizer using Hugging Face's built-in methods
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Optional: Save path in a .pkl for Streamlit loading if needed
joblib.dump(model_dir, "model_path.pkl")


['model_path.pkl']

In [None]:
!pip install transformers
!pip install streamlit
!pip install pyngrok


Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.3/44.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.8/9.8 MB[0m [31m122.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [

#app.py

In [56]:
%%writefile app.py
import streamlit as st
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch

# Load model and tokenizer
model_path = "saved_model"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Prediction function
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1).item()
    return "Positive üòä" if prediction == 1 else "Negative üòû"

# Streamlit UI
st.title("Text Sentiment Classifier")
user_input = st.text_area("Enter a movie review:")
if st.button("Predict Sentiment"):
    if user_input.strip() == "":
        st.warning("Please enter a review.")
    else:
        result = predict(user_input)
        st.success(f"Sentiment: {result}")


Writing app.py


In [57]:
!ngrok authtoken '2vtncKmOpdsym2Q5gliptuAZdv3_6Ta2ZboA3czHfLeEMp6vq' # Replace <your_authtoken> with your actual token

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [58]:
from pyngrok import ngrok

# Kill existing tunnels if rerunning
!kill -9 $(lsof -t -i:8501)

# Disconnect all existing ngrok tunnels
ngrok.kill() # This will terminate all active ngrok tunnels

# Run Streamlit
!streamlit run app.py &>/content/logs.txt &

# Start a new tunnel
public_url = ngrok.connect(addr=8501, proto="http", bind_tls=True)
print("Streamlit app URL:", public_url)

Streamlit app URL: NgrokTunnel: "https://36fb-34-124-175-82.ngrok-free.app" -> "http://localhost:8501"
