In [None]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [None]:
import pandas as pd
import torch
import numpy as np
import os
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from sklearn.metrics import classification_report

In [None]:
# Disable Weights & Biases (W&B) logging
os.environ["WANDB_DISABLED"] = "true"

# Loading Dataset
file_path = "railway_complaints.csv"
df = pd.read_csv(file_path)

# Text and Labels
texts = df["Customer Complaint"].astype(str).tolist()
labels = df["Complaint Category"].astype(str).tolist()

# Encoding Labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(texts, labels_encoded, test_size=0.2, random_state=42)

# Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

#Batch Tokenization for Faster Processing
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=256, return_tensors="pt")

# Define Custom Dataset
class ComplaintDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenize_function(texts)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

#Datasets
train_dataset = ComplaintDataset(X_train, y_train)
test_dataset = ComplaintDataset(X_test, y_test)

# Loading BERT Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes).to(device)

#Accuracy function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training Args for BERT
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    gradient_accumulation_steps=2,
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Evaluate
eval_results = trainer.evaluate()
print(f"Test Accuracy: {eval_results['eval_accuracy'] * 100:.2f}%")



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.055697,0.875776
2,No log,1.114984,0.869565
3,No log,1.090391,0.872671
4,No log,0.945831,0.881988
5,No log,0.961159,0.891304
6,No log,0.999854,0.881988
7,0.016300,0.926416,0.872671
8,0.016300,0.938536,0.881988
9,0.016300,0.956397,0.878882


Test Accuracy: 89.13%


In [None]:
#Prediction on dataset
y_pred = np.argmax(predictions.predictions, axis=1)  # Get highest probability class
y_true = test_dataset.labels

# Classification report
category_report = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print("Our BERT Model Classification Report:\n", category_report)

Our BERT Model Classification Report:
                                      precision    recall  f1-score   support

        Coach Cleanliness & Hygiene       0.97      0.83      0.89        35
Customer Service & Staff Complaints       0.92      0.96      0.94        24
             Food & Catering Issues       0.87      0.81      0.84        16
        Luggage & Belongings Issues       0.91      0.91      0.91        11
                   Other Complaints       0.55      0.60      0.57        30
             Seat Allocation Issues       1.00      0.96      0.98        24
         Security & Safety Concerns       0.93      0.87      0.90        30
     Ticketing & Reservation Issues       0.93      0.93      0.93        45
     Train & Station Infrastructure       0.89      0.94      0.91        62
       Train Delays & Cancellations       0.96      1.00      0.98        45

                           accuracy                           0.89       322
                          macro avg

In [None]:
# Saving Model and Label Encoder
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")
print("Saved successfully!")

Saved successfully!


In [None]:
import pickle
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

In [None]:
import shutil

zip_file = "trained_model.zip"

save_directory = "./trained_model"

shutil.make_archive(save_directory, 'zip', save_directory)

print(f"Zipped model stored as {zip_file}")

Zipped model stored as trained_model.zip
