In [1]:
# from IPython.display import clear_output
# !pip install transformers datasets torch evaluate
# clear_output()

In [2]:
# importing libraries 
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Plots
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

In [3]:
# The AG News data comes as two separate CSV files, one for training and one
# for testing. Neither file is read with a header row, so the column names
# are assigned explicitly after loading.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS
# alike (backslash-separated paths only resolve on Windows) and match the
# forward-slash style used for the `logs/...` paths later in this notebook.
AG_NEWS_DIR = 'data/TextClassification/AGNews'
AG_NEWS_COLUMNS = ['label', 'Title', 'Description']

df_train = pd.read_csv(f'{AG_NEWS_DIR}/train.csv', header=None)
df_train.columns = AG_NEWS_COLUMNS
df_test = pd.read_csv(f'{AG_NEWS_DIR}/test.csv', header=None)
df_test.columns = AG_NEWS_COLUMNS

In [4]:
def combine_title_and_description(df):
    """Return a new frame with 'Title' and 'Description' merged into 'text'.

    The two fields are concatenated as ``"<Title>. <Description>"`` and stored
    in a new ``text`` column appended after the remaining columns; the
    ``Title`` and ``Description`` columns are dropped from the result.

    The input frame is left untouched: no ``text`` column is added to the
    caller's DataFrame, so calling this twice on the same raw frame gives the
    same result both times.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing string columns ``Title`` and ``Description``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``Title``/``Description`` and with ``text``.
    """
    # Vectorised concatenation instead of a row-wise ``agg(join, axis=1)``.
    combined_text = df['Title'] + '. ' + df['Description']
    return df.drop(columns=['Title', 'Description']).assign(text=combined_text)

In [5]:
# Collapse the headline and description of each split into one 'text' column,
# then preview the first rows of the training frame.
df_train = df_train.pipe(combine_title_and_description)
df_test = df_test.pipe(combine_title_and_description)
df_train.head()

Unnamed: 0,label,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


In [6]:
# Stopword-removal setup: download NLTK's stopword corpus and keep the English
# list as a set for fast membership tests.
# Only stopwords are filtered by the helper defined in this cell; punctuation
# and other characters are not stripped here.
# NOTE(review): `word_tokenize` and `mp` are not used in the visible cells —
# confirm whether they are still needed.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with every whitespace-separated stopword removed.

    A token is dropped when its lower-cased form is in the module-level
    `stop_words` set. The surviving tokens keep their original casing and
    are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token.lower() not in stop_words, text.split())
    return " ".join(kept_tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fardi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Strip English stopwords from every training text.
df_train['text'] = df_train['text'].apply(remove_stopwords)

# Shift the class ids down by one so they start at 0 (the classifier is built
# with num_labels=4; the raw labels are presumably 1..4 — confirm against the CSV).
# The guard makes this cell safe to re-run: once the labels already start at 0
# they are left alone instead of being shifted a second time to -1..2.
if df_train['label'].min() > 0:
    df_train['label'] = df_train['label'] - 1

In [8]:
# Strip English stopwords from every test text.
df_test['text'] = df_test['text'].apply(remove_stopwords)

# Shift the class ids down by one so they start at 0 (the classifier is built
# with num_labels=4; the raw labels are presumably 1..4 — confirm against the CSV).
# The guard makes this cell safe to re-run: once the labels already start at 0
# they are left alone instead of being shifted a second time to -1..2.
if df_test['label'].min() > 0:
    df_test['label'] = df_test['label'] - 1

In [9]:
# Wrap both pandas frames as Hugging Face `Dataset` objects (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test)
)

In [10]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint;
# the classification head is sized for four classes.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the module-level `tokenizer`.

    Sequences are truncated and padded to the tokenizer's maximum length, so
    every example in the result has the same length.
    """
    batch_texts = examples['text']
    return tokenizer(batch_texts, truncation=True, padding='max_length')

# Tokenize both splits in batches; the tokenizer outputs are added as new columns.
train_dataset = train_dataset.map(function=tokenize_function, batched=True)
test_dataset = test_dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [12]:
# Prepare both splits for PyTorch: rename 'label' to 'labels' and expose only
# the listed columns as torch tensors.
TENSOR_COLUMNS = ['input_ids', 'attention_mask', 'labels']

train_dataset = train_dataset.rename_column('label', 'labels')
train_dataset.set_format('torch', columns=TENSOR_COLUMNS)

test_dataset = test_dataset.rename_column('label', 'labels')
test_dataset.set_format('torch', columns=TENSOR_COLUMNS)

In [None]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- output location ---
    output_dir=r'logs/OtherModels/bert_ag_results',
    overwrite_output_dir=True,
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    # --- evaluation / checkpointing ---
    eval_strategy='epoch',
    save_strategy="no",
    # NOTE(review): with save_strategy="no" this limit presumably never
    # applies — confirm whether automatic checkpointing was intended.
    save_total_limit=3,
)

# Load the three `evaluate` metric modules (accuracy, precision, recall) that
# the metric callback below relies on.
import evaluate

accuracy_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name, trust_remote_code=True)
    for metric_name in ('accuracy', 'precision', 'recall')
)

def compute_metrics(p):
    """Turn a prediction object into accuracy / precision / recall scores.

    `p.predictions` is reduced to class ids with an argmax over axis 1 and
    compared against `p.label_ids`. Precision and recall are macro-averaged.

    Returns a dict with the keys 'accuracy', 'precision' and 'recall'.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    shared_kwargs = {'predictions': predicted_classes, 'references': p.label_ids}

    accuracy_score = accuracy_metric.compute(**shared_kwargs)['accuracy']
    precision_score = precision_metric.compute(average="macro", **shared_kwargs)['precision']
    recall_score = recall_metric.compute(average="macro", **shared_kwargs)['recall']

    return {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
    }

# Bundle model, hyper-parameters, data and metric callback into one Trainer.
# Note that the test split doubles as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat May 18 18:36:55 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--precision\4e7f439a346715f68500ce6f2be82bf3272abd3f20bdafd203a2c4f85b61dd5f (last modified on Sat May 18 18:37:00 2024) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--recall\e40e6e98d18ff3f210f4d0b26fa721bfaa80704b1fdf890fa551cfabf94fc185 (last modified on Sat May 18 18:37:06 2024) since it couldn't be found locally

In [None]:
import gc

# Release cached CUDA memory, then run a garbage-collection pass; the cell
# displays the value returned by gc.collect().
torch.cuda.empty_cache()
gc.collect()

0

In [None]:
import os

# Directory written by the `trainer.save_model(...)` cell further down.
LAST_EPOCH_DIR = r"logs/OtherModels/bert_ag_results/last_epoch"

# Train the model.
# Resume from the previously saved run only when that directory exists. On a
# fresh run (e.g. Restart & Run All before anything was saved) there is
# nothing to resume from, and pointing `resume_from_checkpoint` at a missing
# path would make `train` fail instead of starting from scratch.
# NOTE(review): `save_model` presumably stores weights/config only — confirm
# whether optimizer and scheduler state are expected to be restored as well.
resume_from = LAST_EPOCH_DIR if os.path.isdir(LAST_EPOCH_DIR) else None
trainer.train(resume_from_checkpoint=resume_from)

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  0%|          | 0/60000 [00:00<?, ?it/s]

{'loss': 0.0783, 'learning_rate': 1.9833333333333335e-05, 'epoch': 3.04}
{'loss': 0.0981, 'learning_rate': 1.9666666666666666e-05, 'epoch': 3.08}
{'loss': 0.1018, 'learning_rate': 1.95e-05, 'epoch': 3.12}
{'loss': 0.1081, 'learning_rate': 1.9333333333333333e-05, 'epoch': 3.17}
{'loss': 0.0962, 'learning_rate': 1.916666666666667e-05, 'epoch': 3.21}
{'loss': 0.1299, 'learning_rate': 1.9e-05, 'epoch': 3.25}
{'loss': 0.1446, 'learning_rate': 1.8833333333333335e-05, 'epoch': 3.29}
{'loss': 0.1502, 'learning_rate': 1.866666666666667e-05, 'epoch': 3.33}
{'loss': 0.1424, 'learning_rate': 1.8500000000000002e-05, 'epoch': 3.38}
{'loss': 0.1447, 'learning_rate': 1.8333333333333333e-05, 'epoch': 3.42}
{'loss': 0.1395, 'learning_rate': 1.8166666666666667e-05, 'epoch': 3.46}
{'loss': 0.1446, 'learning_rate': 1.8e-05, 'epoch': 3.5}
{'loss': 0.1424, 'learning_rate': 1.7833333333333334e-05, 'epoch': 3.54}
{'loss': 0.143, 'learning_rate': 1.7666666666666668e-05, 'epoch': 3.58}
{'loss': 0.1505, 'learning

  0%|          | 0/760 [00:00<?, ?it/s]

{'eval_loss': 0.27091148495674133, 'eval_accuracy': 0.9425, 'eval_precision': 0.943124051263504, 'eval_recall': 0.9425, 'eval_runtime': 79.415, 'eval_samples_per_second': 95.7, 'eval_steps_per_second': 9.57, 'epoch': 4.0}
{'loss': 0.0916, 'learning_rate': 1.5833333333333333e-05, 'epoch': 4.04}
{'loss': 0.0826, 'learning_rate': 1.5666666666666667e-05, 'epoch': 4.08}
{'loss': 0.0728, 'learning_rate': 1.55e-05, 'epoch': 4.12}
{'loss': 0.082, 'learning_rate': 1.5333333333333334e-05, 'epoch': 4.17}
{'loss': 0.0937, 'learning_rate': 1.5166666666666667e-05, 'epoch': 4.21}
{'loss': 0.0829, 'learning_rate': 1.5000000000000002e-05, 'epoch': 4.25}
{'loss': 0.094, 'learning_rate': 1.4833333333333336e-05, 'epoch': 4.29}
{'loss': 0.0885, 'learning_rate': 1.4666666666666666e-05, 'epoch': 4.33}
{'loss': 0.0826, 'learning_rate': 1.45e-05, 'epoch': 4.38}
{'loss': 0.0956, 'learning_rate': 1.4333333333333334e-05, 'epoch': 4.42}
{'loss': 0.0739, 'learning_rate': 1.416666666666667e-05, 'epoch': 4.46}
{'loss

  0%|          | 0/760 [00:00<?, ?it/s]

{'eval_loss': 0.30075618624687195, 'eval_accuracy': 0.9432894736842106, 'eval_precision': 0.943577994956708, 'eval_recall': 0.9432894736842106, 'eval_runtime': 75.42, 'eval_samples_per_second': 100.769, 'eval_steps_per_second': 10.077, 'epoch': 5.0}
{'train_runtime': 7461.525, 'train_samples_per_second': 80.413, 'train_steps_per_second': 8.041, 'train_loss': 0.04610097599029541, 'epoch': 5.0}


  0%|          | 0/760 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.30075618624687195, 'eval_accuracy': 0.9432894736842106, 'eval_precision': 0.943577994956708, 'eval_recall': 0.9432894736842106, 'eval_runtime': 75.204, 'eval_samples_per_second': 101.058, 'eval_steps_per_second': 10.106, 'epoch': 5.0}


In [None]:
# Persist the current model to the directory that the training cell passes as
# `resume_from_checkpoint`.
last_epoch_dir = r"logs/OtherModels/bert_ag_results/last_epoch"
trainer.save_model(last_epoch_dir)

In [None]:
Evaluation results: {'eval_loss': 0.24346782267093658, 'eval_accuracy': 0.9488157894736842, 'eval_precision': 0.9489294182155664, 'eval_recall': 0.9488157894736842, 'eval_runtime': 78.778, 'eval_samples_per_second': 96.474, 'eval_steps_per_second': 9.647, 'epoch': 3.0}