In [1]:
# from IPython.display import clear_output
# !pip install transformers datasets torch evaluate
# clear_output()

In [1]:
# importing libraries 
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Plots
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

In [3]:
# The AG News data comes as two separate CSV files, one for training and one for
# testing. Both are read with header=None, so the column names are passed explicitly.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS
# (backslash separators only resolve on Windows).
AG_NEWS_COLUMNS = ['label', 'Title', 'Description']
df_train = pd.read_csv('data/TextClassification/AGNews/train.csv', header=None, names=AG_NEWS_COLUMNS)
df_test = pd.read_csv('data/TextClassification/AGNews/test.csv', header=None, names=AG_NEWS_COLUMNS)

In [4]:
def combine_title_and_description(df):
    """Return a new frame in which 'Title' and 'Description' are merged into 'text'.

    Each row's title and description are joined with '. ' and stored in a new
    'text' column; the two source columns are dropped from the result.

    The input frame is left untouched (no 'text' column is added to it), so the
    caller's original data stays intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns 'Title' and 'Description'.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the combined 'text' column.
    """
    combined_text = df[['Title', 'Description']].agg('. '.join, axis=1)
    # drop() returns a new frame and assign() appends to that copy, so `df`
    # itself is never modified.
    return df.drop(columns=['Title', 'Description']).assign(text=combined_text)

In [5]:
# Merge Title/Description into a single 'text' column for both splits,
# then preview the first rows of the training frame.
df_train, df_test = (
    combine_title_and_description(df_train),
    combine_title_and_description(df_test),
)
df_train.head()

Unnamed: 0,label,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


In [6]:
# Split the text on whitespace and remove English stopwords (punctuation and other characters are left untouched)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp

# Fetch the NLTK stopword corpus and keep the English list
# as a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with every English stopword removed.

    The text is split on whitespace; a token is dropped when its lower-cased
    form is in the module-level `stop_words` set. Surviving tokens keep their
    original casing and are re-joined with single spaces.
    """
    return " ".join(token for token in text.split() if token.lower() not in stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fardin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Training split: shift the class ids down by one (the model is later built with
# num_labels=4, which presumably expects ids 0-3) and strip stopwords from the text.
df_train['label'] -= 1
df_train['text'] = df_train['text'].map(remove_stopwords)

In [8]:
# Test split: same preprocessing as the training split, with labels shifted
# down by one and stopwords stripped from the text.
df_test['label'] -= 1
df_test['text'] = df_test['text'].map(remove_stopwords)

In [2]:
# Load the pretrained DistilBERT tokenizer and a sequence-classification model
# (4 output labels) from the same checkpoint name.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Wrap both pandas frames as Hugging Face Dataset objects (train first, then test).
train_dataset, test_dataset = map(Dataset.from_pandas, (df_train, df_test))

In [11]:
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Truncation is enabled and padding='max_length' is requested. No explicit
    max_length is passed, so the length limit is whatever the tokenizer uses
    by default.
    NOTE(review): padding every row to a fixed maximum is presumably much
    longer than typical news snippets; dynamic padding may be cheaper.
    """
    batch_texts = examples['text']
    return tokenizer(batch_texts, truncation=True, padding='max_length')


# batched=True passes many rows to tokenize_function per call.
train_dataset, test_dataset = (
    train_dataset.map(tokenize_function, batched=True),
    test_dataset.map(tokenize_function, batched=True),
)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [12]:
# Set the format for PyTorch
train_dataset = train_dataset.rename_column('label', 'labels')
test_dataset = test_dataset.rename_column('label', 'labels')
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [13]:
# Fine-tuning configuration handed to the Hugging Face Trainer.
training_args = TrainingArguments(
    # Where run artefacts go; an existing directory is overwritten.
    output_dir=r'logs/OtherModels/bert_ag_results',
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    # Evaluate once per epoch; checkpoint saving is switched off.
    eval_strategy='epoch',
    save_strategy="no",
    save_total_limit=3,  # NOTE(review): presumably has no effect while save_strategy="no"
)

# Load the three evaluation metrics used by compute_metrics.
import evaluate

accuracy_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name, trust_remote_code=True)
    for metric_name in ('accuracy', 'precision', 'recall')
)

def compute_metrics(p):
    """Turn a Trainer evaluation result into accuracy / precision / recall.

    Parameters
    ----------
    p : object exposing `.predictions` and `.label_ids`
        `.predictions` is reduced with argmax over axis 1 to obtain the
        predicted class per row (assumes a 2-D score array of samples by
        classes; TODO confirm).

    Returns
    -------
    dict
        Keys 'accuracy', 'precision' and 'recall'; precision and recall are
        macro-averaged.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    shared_inputs = dict(predictions=predicted_classes, references=p.label_ids)

    accuracy_score = accuracy_metric.compute(**shared_inputs)['accuracy']
    precision_score = precision_metric.compute(**shared_inputs, average="macro")['precision']
    recall_score = recall_metric.compute(**shared_inputs, average="macro")['recall']

    return {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
    }

# Wire the model, training configuration, datasets and metric callback together.
# NOTE(review): the test split is also used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# A `dataloader_config = DataLoaderConfiguration(...)` statement used to follow here.
# It was removed because `DataLoaderConfiguration` is never imported in this notebook
# (NameError on a fresh kernel) and the resulting object was never passed to anything.


In [15]:
# Free memory before training starts.
# Run the Python garbage collector first so that tensors kept alive only by
# unreachable reference cycles are released, and only then ask PyTorch to
# return its now-unused cached CUDA blocks to the driver.
import gc

gc.collect()
torch.cuda.empty_cache()

0

In [16]:
# Train the model.
# To continue from the weights saved under "last_epoch" instead of starting over,
# swap the call below for the commented-out variant:
# trainer.train(resume_from_checkpoint=r"logs/OtherModels/bert_ag_results/last_epoch")
trainer.train()

# Evaluate the model on the Trainer's eval_dataset and print the metric dict.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  0%|          | 0/30000 [00:00<?, ?it/s]

{'loss': 0.4025, 'learning_rate': 1.9666666666666666e-05, 'epoch': 0.08}
{'loss': 0.2708, 'learning_rate': 1.9333333333333333e-05, 'epoch': 0.17}
{'loss': 0.2411, 'learning_rate': 1.9e-05, 'epoch': 0.25}
{'loss': 0.223, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.33}
{'loss': 0.2246, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.42}
{'loss': 0.2065, 'learning_rate': 1.8e-05, 'epoch': 0.5}
{'loss': 0.2057, 'learning_rate': 1.7666666666666668e-05, 'epoch': 0.58}
{'loss': 0.2105, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.67}
{'loss': 0.206, 'learning_rate': 1.7e-05, 'epoch': 0.75}
{'loss': 0.1943, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.83}
{'loss': 0.2022, 'learning_rate': 1.6333333333333335e-05, 'epoch': 0.92}
{'loss': 0.1784, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


  0%|          | 0/380 [00:00<?, ?it/s]

{'eval_loss': 0.18419606983661652, 'eval_accuracy': 0.9421052631578948, 'eval_precision': 0.94236472615576, 'eval_recall': 0.9421052631578948, 'eval_runtime': 43.32, 'eval_samples_per_second': 175.439, 'eval_steps_per_second': 8.772, 'epoch': 1.0}
{'loss': 0.1419, 'learning_rate': 1.5666666666666667e-05, 'epoch': 1.08}
{'loss': 0.1491, 'learning_rate': 1.5333333333333334e-05, 'epoch': 1.17}
{'loss': 0.1426, 'learning_rate': 1.5000000000000002e-05, 'epoch': 1.25}
{'loss': 0.1479, 'learning_rate': 1.4666666666666666e-05, 'epoch': 1.33}
{'loss': 0.1436, 'learning_rate': 1.4333333333333334e-05, 'epoch': 1.42}
{'loss': 0.1371, 'learning_rate': 1.4e-05, 'epoch': 1.5}
{'loss': 0.1372, 'learning_rate': 1.3666666666666667e-05, 'epoch': 1.58}
{'loss': 0.1483, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.67}
{'loss': 0.1459, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.75}
{'loss': 0.1452, 'learning_rate': 1.2666666666666667e-05, 'epoch': 1.83}
{'loss': 0.1425, 'learning_rate': 1.23

  0%|          | 0/380 [00:00<?, ?it/s]

{'eval_loss': 0.18927596509456635, 'eval_accuracy': 0.9438157894736842, 'eval_precision': 0.9440456695811128, 'eval_recall': 0.9438157894736842, 'eval_runtime': 43.707, 'eval_samples_per_second': 173.885, 'eval_steps_per_second': 8.694, 'epoch': 2.0}
{'loss': 0.0968, 'learning_rate': 1.1666666666666668e-05, 'epoch': 2.08}
{'loss': 0.0922, 'learning_rate': 1.1333333333333334e-05, 'epoch': 2.17}
{'loss': 0.1038, 'learning_rate': 1.1000000000000001e-05, 'epoch': 2.25}
{'loss': 0.0977, 'learning_rate': 1.0666666666666667e-05, 'epoch': 2.33}
{'loss': 0.0954, 'learning_rate': 1.0333333333333335e-05, 'epoch': 2.42}
{'loss': 0.092, 'learning_rate': 1e-05, 'epoch': 2.5}
{'loss': 0.1046, 'learning_rate': 9.666666666666667e-06, 'epoch': 2.58}
{'loss': 0.0994, 'learning_rate': 9.333333333333334e-06, 'epoch': 2.67}
{'loss': 0.1, 'learning_rate': 9e-06, 'epoch': 2.75}
{'loss': 0.1006, 'learning_rate': 8.666666666666668e-06, 'epoch': 2.83}
{'loss': 0.1026, 'learning_rate': 8.333333333333334e-06, 'epo

  0%|          | 0/380 [00:00<?, ?it/s]

{'eval_loss': 0.20638681948184967, 'eval_accuracy': 0.9478947368421052, 'eval_precision': 0.9479190574041219, 'eval_recall': 0.9478947368421052, 'eval_runtime': 43.422, 'eval_samples_per_second': 175.026, 'eval_steps_per_second': 8.751, 'epoch': 3.0}
{'loss': 0.0569, 'learning_rate': 7.666666666666667e-06, 'epoch': 3.08}
{'loss': 0.0681, 'learning_rate': 7.333333333333333e-06, 'epoch': 3.17}
{'loss': 0.0598, 'learning_rate': 7e-06, 'epoch': 3.25}
{'loss': 0.0681, 'learning_rate': 6.666666666666667e-06, 'epoch': 3.33}
{'loss': 0.0726, 'learning_rate': 6.333333333333333e-06, 'epoch': 3.42}
{'loss': 0.0665, 'learning_rate': 6e-06, 'epoch': 3.5}
{'loss': 0.0719, 'learning_rate': 5.666666666666667e-06, 'epoch': 3.58}
{'loss': 0.0622, 'learning_rate': 5.333333333333334e-06, 'epoch': 3.67}
{'loss': 0.0681, 'learning_rate': 5e-06, 'epoch': 3.75}
{'loss': 0.0669, 'learning_rate': 4.666666666666667e-06, 'epoch': 3.83}
{'loss': 0.0676, 'learning_rate': 4.333333333333334e-06, 'epoch': 3.92}
{'loss

  0%|          | 0/380 [00:00<?, ?it/s]

{'eval_loss': 0.25739216804504395, 'eval_accuracy': 0.9455263157894737, 'eval_precision': 0.9455351814280115, 'eval_recall': 0.9455263157894737, 'eval_runtime': 43.005, 'eval_samples_per_second': 176.724, 'eval_steps_per_second': 8.836, 'epoch': 4.0}


KeyboardInterrupt: 

In [19]:
# Persist the model currently held by the trainer into the "last_epoch" folder
# under the run's output directory.
# NOTE(review): the tokenizer is not passed to the Trainer, so it is presumably
# not written here; confirm before reloading from this folder.
trainer.save_model(r"logs/OtherModels/bert_ag_results/last_epoch")

In [None]:
# Evaluation metrics logged at epoch 3.0, copied from the training output for reference.
# This is the highest eval_accuracy among the four epochs that were logged.
{'eval_loss': 0.20638681948184967, 'eval_accuracy': 0.9478947368421052, 'eval_precision': 0.9479190574041219, 'eval_recall': 0.9478947368421052, 'eval_runtime': 43.422, 'eval_samples_per_second': 175.026, 'eval_steps_per_second': 8.751, 'epoch': 3.0}
