<a href="https://colab.research.google.com/github/Ani1211999/DocumentClassificationusingLLM/blob/main/document_classification_using_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the required libraries

In [None]:
!pip3 install transformers
!pip3 install accelerate -U
!pip3 install datasets

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The training cell later writes its output under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Configure the BERT checkpoint and select the GPU when one is available

In [None]:
import torch

# Name of the pretrained checkpoint and the token length used when tokenizing.
model_name, max_length = "bert-base-uncased", 512

# Use CUDA when it is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device: ", device)

device:  cuda


In [None]:
from transformers import BertForSequenceClassification, BertTokenizerFast

# Load the tokenizer that belongs to the checkpoint named in `model_name`,
# requesting lower-casing of the input text.
tokenizer = BertTokenizerFast.from_pretrained(
    model_name,
    do_lower_case=True,
)

# Import dataset and create training and testing datasets

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

# Fetch the whole corpus (subset="all"), shuffled, with headers, footers and
# quoted replies removed from every post.
dataset = fetch_20newsgroups(
    subset="all",
    shuffle=True,
    remove=("headers", "footers", "quotes"),
)
news_text, labels = dataset.data, dataset.target
target_names = dataset.target_names

# Show the class names and one example with its newsgroup.
print("The Twenty newsgroups",target_names)
print("Sample Text in the dataset\n", news_text[0])
print("The NewsGroup for the sample text is", target_names[labels[0]])

# 70/30 split with a fixed random_state so the partition is reproducible.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    news_text,
    labels,
    test_size=0.3,
    random_state=42,
)

The Twenty newsgroups ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Sample Text in the dataset
 

I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens a

# Tokenize the text for BERT and build the training and test datasets

In [None]:
import torch
from datasets import Dataset

# Tokenize both splits. Each text is truncated and padded to exactly
# `max_length` tokens, and the encodings come back as PyTorch tensors.
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
test_encodings = tokenizer(valid_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")

# Convert the class ids to int64 tensors once; the datasets below reuse these
# tensors instead of converting the labels a second time.
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)
test_labels_tensor = torch.tensor(valid_labels, dtype=torch.long)

# Wrap each split in a Hugging Face Dataset with the three columns passed on to the Trainer.
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels_tensor,
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels_tensor,
})

# Sanity check: every text must have produced exactly one dataset row.
assert len(train_dataset) == len(train_texts), "train rows do not match train texts"
assert len(test_dataset) == len(valid_texts), "test rows do not match test texts"

# Define the evaluation metrics used to monitor training and to score the final test run

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (the reference class ids) and
            `predictions` (scores; the arg-max over the last axis is taken
            as the predicted class id).

    Returns:
        dict: ``{"accuracy": <float>, "f1": <float>}`` where F1 uses
        ``average="weighted"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

# Loading the pretrained BERT Model and setting the training arguments for classification training

In [None]:
import inspect

from transformers import Trainer, TrainingArguments

# Pretrained BERT encoder with a new classification head, one output per newsgroup.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install cell does not pin a version, so use whichever keyword the
# installed TrainingArguments accepts instead of failing on a fresh runtime.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/NLP/Model_Outputs/',  # checkpoints are written to the mounted Drive
    num_train_epochs=5,
    per_device_train_batch_size=16,   # batch size per device during training
    weight_decay=0.01,                # strength of weight decay
    # Restore the best checkpoint when training finishes. No
    # `metric_for_best_model` is given, so the library default applies
    # (evaluation loss, per the TrainingArguments documentation).
    load_best_model_at_end=True,
    # Made explicit: `load_best_model_at_end` needs checkpoints saved on the
    # same schedule as evaluations ("steps" is already the default).
    save_strategy="steps",
    **{_eval_strategy_kwarg: "steps"},
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-04-29 11:34:24.916366: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 11:34:24.916423: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-29 11:34:24.917833: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Start the training process by providing training and evaluation datasets and the necessary training arguments and metrics

In [None]:
# Assemble the Trainer from the model, its arguments, the two datasets and the metric callback.
# NOTE(review): `test_dataset` is also the in-training evaluation set, so the
# final test score is not from unseen data — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33maniketshinde12[0m ([33mrptu-de[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Accuracy,F1
500,1.2491,0.961582,0.707817,0.703942
1000,0.5926,0.920411,0.732756,0.73476
1500,0.3294,0.961818,0.749027,0.751798
2000,0.191,0.986947,0.758755,0.760031




TrainOutput(global_step=2065, training_loss=0.5771530179076853, metrics={'train_runtime': 3594.9577, 'train_samples_per_second': 18.348, 'train_steps_per_second': 0.574, 'total_flos': 1.735761000382464e+16, 'train_loss': 0.5771530179076853, 'epoch': 5.0})

# Perform evaluation on the test dataset

In [None]:
# Score the trained model on the test split.
eval_results = trainer.evaluate(test_dataset)

# Pull the accuracy and F1 values out of the returned metrics dict.
accuracy, f1 = eval_results['eval_accuracy'], eval_results['eval_f1']

# Report the final numbers for the test data.
print("Final Evaluation Loss", eval_results['eval_loss'])
print("Accuracy", accuracy)
print("F1 Score:", f1)

Final Evaluation Loss 0.9204107522964478
Accuracy 0.7327555712769721
F1 Score: 0.7347600011164772
