In [1]:
# from IPython.display import clear_output
# !pip install transformers datasets torch evaluate
# clear_output()

In [2]:
# importing libraries 
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

In [3]:
# The Yelp data comes as two separate CSV files: one for training, one for testing.
keep_ratio = 0.4   # fraction of each shuffled split that is kept
SAMPLE_SEED = 42   # fixed seed so the same rows are selected on every run


def _load_yelp_split(csv_path, keep_ratio=keep_ratio, seed=SAMPLE_SEED):
    """Read one header-less Yelp CSV and return a shuffled, sub-sampled frame.

    Rows with missing values are dropped, the remaining rows are shuffled
    with ``seed``, and only the first ``keep_ratio`` fraction is kept.

    Args:
        csv_path: Path of the CSV file to read (no header row).
        keep_ratio: Fraction of the cleaned rows to keep.
        seed: Seed handed to ``DataFrame.sample`` for the shuffle.

    Returns:
        A DataFrame with the columns ``label`` and ``text`` and a fresh
        0..n-1 index; every label is one less than the value in the file.
    """
    frame = pd.read_csv(csv_path, header=None).dropna()
    frame = frame.sample(frac=1, random_state=seed).reset_index(drop=True)
    # .copy() detaches the slice so the assignments below write to an
    # independent frame instead of a view of the shuffled data.
    frame = frame.iloc[:int(keep_ratio * frame.shape[0])].copy()
    frame.columns = ['label', 'text']
    # Shift labels down by one (presumably the file is 1-based and the
    # 2-class head expects 0/1 -- confirm against the dataset card).
    frame['label'] = frame['label'] - 1
    return frame


# Forward slashes resolve on both Windows and POSIX, matching the other paths in this notebook.
df_train = _load_yelp_split('data/TextClassification/Yelp/train.csv')
df_test = _load_yelp_split('data/TextClassification/Yelp/test.csv')

In [4]:
def combine_title_and_description(df):
    """Return a copy of ``df`` with ``Title`` and ``Description`` merged into ``text``.

    The two columns are concatenated element-wise as ``"<Title>. <Description>"``
    and then removed from the result. The frame passed in is left untouched
    (the previous version added a ``text`` column to the caller's frame as a
    side effect).

    Args:
        df: DataFrame containing string columns ``Title`` and ``Description``.

    Returns:
        A new DataFrame without ``Title``/``Description`` and with a ``text`` column.
    """
    combined = df['Title'] + '. ' + df['Description']
    return df.drop(['Title', 'Description'], axis=1).assign(text=combined)

In [5]:
# Stopword-removal setup. Only English stopwords are stripped in this notebook;
# no punctuation or other character filtering is applied here.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp

# Fetch the stopword corpus only when it is not already available locally, so
# an offline run does not attempt (and fail) a network download every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with every stopword token removed.

    The text is split on whitespace; a token is dropped when its lower-cased
    form is in the module-level ``stop_words`` set. Surviving tokens keep
    their original casing and are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [6]:
# Strip stopwords from every training review, writing the result back into the same column.
train_reviews = df_train['text']
df_train['text'] = train_reviews.apply(remove_stopwords)

In [7]:
# Apply the same stopword filtering to the test reviews.
test_reviews = df_test['text']
df_test['text'] = test_reviews.apply(remove_stopwords)

In [8]:
# Wrap both pandas frames as Hugging Face datasets (train first, then test).
train_dataset, test_dataset = map(Dataset.from_pandas, (df_train, df_test))

In [9]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# the classification head is created with two output labels.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_name)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level ``tokenizer``.

    Truncation is enabled and padding uses the ``'max_length'`` strategy, so
    the tokenizer is asked to bring every example to the same length.
    """
    batch_texts = examples['text']
    return tokenizer(batch_texts, truncation=True, padding='max_length')

# Run the tokenizer over both splits in batched mode.
map_options = {'batched': True}
train_dataset = train_dataset.map(tokenize_function, **map_options)
test_dataset = test_dataset.map(tokenize_function, **map_options)

Map:   0%|          | 0/224000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15200 [00:00<?, ? examples/s]

In [11]:
# Set the format for PyTorch
def _prepare_for_torch(split):
    """Rename ``label`` to ``labels`` (if still needed) and expose torch tensors.

    The rename is guarded so that re-running this cell does not fail on a
    split whose column has already been renamed.

    Args:
        split: A dataset exposing ``column_names``, ``rename_column`` and ``set_format``.

    Returns:
        The split with a ``labels`` column, formatted to return torch tensors
        for ``input_ids``, ``attention_mask`` and ``labels``.
    """
    if 'label' in split.column_names:
        split = split.rename_column('label', 'labels')
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return split


train_dataset = _prepare_for_torch(train_dataset)
test_dataset = _prepare_for_torch(test_dataset)

In [16]:
# Hyper-parameters for the fine-tuning run, gathered in one mapping.
# NOTE(review): save_strategy="no" is combined with save_total_limit=3; the
# limit presumably has no effect when no checkpoints are written -- confirm.
training_config = dict(
    output_dir=r'logs/OtherModels/bert_ag_results',
    overwrite_output_dir=True,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    # evaluation / checkpointing
    evaluation_strategy='epoch',
    save_strategy="no",
    save_total_limit=3,
)
training_args = TrainingArguments(**training_config)

# Load the three evaluation metrics (accuracy, precision, recall) in that order.
import evaluate
accuracy_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name, trust_remote_code=True)
    for metric_name in ('accuracy', 'precision', 'recall')
)

def compute_metrics(p):
    """Turn a prediction object into accuracy, macro precision and macro recall.

    Args:
        p: Object exposing ``predictions`` (per-class scores; argmax is taken
            over axis 1) and ``label_ids`` (reference labels).

    Returns:
        Dict with the keys ``accuracy``, ``precision`` and ``recall``.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    references = p.label_ids

    def _score(metric, key, **extra):
        # Each metric returns a dict; pull out the single value of interest.
        result = metric.compute(predictions=predicted_classes, references=references, **extra)
        return result[key]

    return {
        'accuracy': _score(accuracy_metric, 'accuracy'),
        'precision': _score(precision_metric, 'precision', average="macro"),
        'recall': _score(recall_metric, 'recall', average="macro"),
    }

# Wire the model, arguments, data splits and metric callback into one Trainer.
# NOTE(review): the test split is passed as the evaluation set, so it is also
# what gets scored during training -- confirm no separate validation split is wanted.
trainer_inputs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_inputs)


Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat May 18 18:36:55 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--precision\4e7f439a346715f68500ce6f2be82bf3272abd3f20bdafd203a2c4f85b61dd5f (last modified on Sat May 18 18:37:00 2024) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--recall\e40e6e98d18ff3f210f4d0b26fa721bfaa80704b1fdf890fa551cfabf94fc185 (last modified on Sat May 18 18:37:06 2024) since it couldn't be found locally

In [18]:
# Free memory before training. Garbage-collect first so that blocks owned by
# no-longer-reachable Python objects become unoccupied, then ask the CUDA
# caching allocator to release its unoccupied blocks.
import gc
gc.collect()
torch.cuda.empty_cache()

0

In [19]:
# Fine-tune the model from its current weights.
# To continue from the saved model instead, use:
# trainer.train(resume_from_checkpoint=r"logs/OtherModels/bert_ag_results/last_epoch")
trainer.train()

# Score the trained model on the evaluation split and show the metrics.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

  0%|          | 0/56000 [00:00<?, ?it/s]

{'loss': 0.1342, 'learning_rate': 1.9821428571428575e-05, 'epoch': 0.04}
{'loss': 0.1261, 'learning_rate': 1.9642857142857145e-05, 'epoch': 0.09}
{'loss': 0.114, 'learning_rate': 1.9464285714285715e-05, 'epoch': 0.13}
{'loss': 0.1056, 'learning_rate': 1.928571428571429e-05, 'epoch': 0.18}
{'loss': 0.1136, 'learning_rate': 1.910714285714286e-05, 'epoch': 0.22}
{'loss': 0.1125, 'learning_rate': 1.892857142857143e-05, 'epoch': 0.27}
{'loss': 0.1003, 'learning_rate': 1.8750000000000002e-05, 'epoch': 0.31}
{'loss': 0.1018, 'learning_rate': 1.8571428571428575e-05, 'epoch': 0.36}
{'loss': 0.1021, 'learning_rate': 1.8392857142857142e-05, 'epoch': 0.4}
{'loss': 0.0988, 'learning_rate': 1.8214285714285715e-05, 'epoch': 0.45}
{'loss': 0.1016, 'learning_rate': 1.803571428571429e-05, 'epoch': 0.49}
{'loss': 0.0914, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.54}
{'loss': 0.1024, 'learning_rate': 1.767857142857143e-05, 'epoch': 0.58}
{'loss': 0.0957, 'learning_rate': 1.7500000000000002e-05, '

  0%|          | 0/760 [00:00<?, ?it/s]

{'eval_loss': 0.12155628949403763, 'eval_accuracy': 0.9582894736842106, 'eval_precision': 0.9583962409862237, 'eval_recall': 0.9582806894171718, 'eval_runtime': 85.096, 'eval_samples_per_second': 178.622, 'eval_steps_per_second': 8.931, 'epoch': 1.0}
{'loss': 0.0955, 'learning_rate': 1.5892857142857142e-05, 'epoch': 1.03}
{'loss': 0.0619, 'learning_rate': 1.5714285714285715e-05, 'epoch': 1.07}
{'loss': 0.0622, 'learning_rate': 1.553571428571429e-05, 'epoch': 1.12}
{'loss': 0.0653, 'learning_rate': 1.535714285714286e-05, 'epoch': 1.16}
{'loss': 0.0676, 'learning_rate': 1.5178571428571429e-05, 'epoch': 1.21}
{'loss': 0.0771, 'learning_rate': 1.5000000000000002e-05, 'epoch': 1.25}
{'loss': 0.0635, 'learning_rate': 1.4821428571428574e-05, 'epoch': 1.29}
{'loss': 0.0698, 'learning_rate': 1.4642857142857144e-05, 'epoch': 1.34}
{'loss': 0.0673, 'learning_rate': 1.4464285714285715e-05, 'epoch': 1.38}
{'loss': 0.0645, 'learning_rate': 1.4285714285714287e-05, 'epoch': 1.43}
{'loss': 0.0726, 'lea

  0%|          | 0/760 [00:00<?, ?it/s]

{'eval_loss': 0.15063881874084473, 'eval_accuracy': 0.9580263157894737, 'eval_precision': 0.9580292686389447, 'eval_recall': 0.9580281267361196, 'eval_runtime': 81.399, 'eval_samples_per_second': 186.734, 'eval_steps_per_second': 9.337, 'epoch': 2.0}
{'loss': 0.0658, 'learning_rate': 1.1964285714285716e-05, 'epoch': 2.01}
{'loss': 0.0399, 'learning_rate': 1.1785714285714287e-05, 'epoch': 2.05}
{'loss': 0.0368, 'learning_rate': 1.1607142857142859e-05, 'epoch': 2.1}
{'loss': 0.031, 'learning_rate': 1.1428571428571429e-05, 'epoch': 2.14}
{'loss': 0.0394, 'learning_rate': 1.125e-05, 'epoch': 2.19}
{'loss': 0.0373, 'learning_rate': 1.1071428571428572e-05, 'epoch': 2.23}
{'loss': 0.0412, 'learning_rate': 1.0892857142857142e-05, 'epoch': 2.28}
{'loss': 0.0362, 'learning_rate': 1.0714285714285714e-05, 'epoch': 2.32}
{'loss': 0.0288, 'learning_rate': 1.0535714285714287e-05, 'epoch': 2.37}
{'loss': 0.0371, 'learning_rate': 1.0357142857142859e-05, 'epoch': 2.41}
{'loss': 0.0457, 'learning_rate': 

  0%|          | 0/760 [00:00<?, ?it/s]

{'eval_loss': 0.22479350864887238, 'eval_accuracy': 0.9582236842105263, 'eval_precision': 0.9582258099254155, 'eval_recall': 0.958222612812182, 'eval_runtime': 83.046, 'eval_samples_per_second': 183.031, 'eval_steps_per_second': 9.152, 'epoch': 3.0}
{'loss': 0.0187, 'learning_rate': 7.857142857142858e-06, 'epoch': 3.04}
{'loss': 0.0148, 'learning_rate': 7.67857142857143e-06, 'epoch': 3.08}
{'loss': 0.0145, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.12}
{'loss': 0.0228, 'learning_rate': 7.321428571428572e-06, 'epoch': 3.17}
{'loss': 0.0276, 'learning_rate': 7.1428571428571436e-06, 'epoch': 3.21}
{'loss': 0.0202, 'learning_rate': 6.964285714285714e-06, 'epoch': 3.26}
{'loss': 0.0185, 'learning_rate': 6.785714285714287e-06, 'epoch': 3.3}
{'loss': 0.0229, 'learning_rate': 6.607142857142858e-06, 'epoch': 3.35}
{'loss': 0.0227, 'learning_rate': 6.4285714285714295e-06, 'epoch': 3.39}
{'loss': 0.0208, 'learning_rate': 6.25e-06, 'epoch': 3.44}
{'loss': 0.0191, 'learning_rate': 6.0714285

  0%|          | 0/760 [00:00<?, ?it/s]

{'eval_loss': 0.27348482608795166, 'eval_accuracy': 0.9578947368421052, 'eval_precision': 0.9579230041046991, 'eval_recall': 0.9578996639520911, 'eval_runtime': 83.254, 'eval_samples_per_second': 182.574, 'eval_steps_per_second': 9.129, 'epoch': 4.0}
{'loss': 0.0103, 'learning_rate': 3.928571428571429e-06, 'epoch': 4.02}
{'loss': 0.008, 'learning_rate': 3.7500000000000005e-06, 'epoch': 4.06}
{'loss': 0.0102, 'learning_rate': 3.5714285714285718e-06, 'epoch': 4.11}
{'loss': 0.0115, 'learning_rate': 3.3928571428571435e-06, 'epoch': 4.15}
{'loss': 0.0094, 'learning_rate': 3.2142857142857147e-06, 'epoch': 4.2}
{'loss': 0.0066, 'learning_rate': 3.0357142857142856e-06, 'epoch': 4.24}
{'loss': 0.0065, 'learning_rate': 2.8571428571428573e-06, 'epoch': 4.29}
{'loss': 0.0086, 'learning_rate': 2.6785714285714285e-06, 'epoch': 4.33}
{'loss': 0.0098, 'learning_rate': 2.5e-06, 'epoch': 4.38}
{'loss': 0.0093, 'learning_rate': 2.321428571428572e-06, 'epoch': 4.42}
{'loss': 0.0119, 'learning_rate': 2.14

  0%|          | 0/760 [00:00<?, ?it/s]

{'eval_loss': 0.2977316975593567, 'eval_accuracy': 0.9589473684210527, 'eval_precision': 0.9589483031244261, 'eval_recall': 0.9589485573897706, 'eval_runtime': 82.588, 'eval_samples_per_second': 184.046, 'eval_steps_per_second': 9.202, 'epoch': 5.0}
{'train_runtime': 15961.08, 'train_samples_per_second': 70.171, 'train_steps_per_second': 3.509, 'train_loss': 0.048435334111963, 'epoch': 5.0}


  0%|          | 0/760 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.2977316975593567, 'eval_accuracy': 0.9589473684210527, 'eval_precision': 0.9589483031244261, 'eval_recall': 0.9589485573897706, 'eval_runtime': 87.27, 'eval_samples_per_second': 174.172, 'eval_steps_per_second': 8.709, 'epoch': 5.0}


In [19]:
# Persist the model held by the trainer so it can be reloaded later.
final_model_dir = r"logs/OtherModels/bert_ag_results/last_epoch"
trainer.save_model(final_model_dir)

In [None]:
# Evaluation results: {'eval_loss': 0.2977316975593567, 'eval_accuracy': 0.9589473684210527, 'eval_precision': 0.9589483031244261, 'eval_recall': 0.9589485573897706, 'eval_runtime': 87.27, 'eval_samples_per_second': 174.172, 'eval_steps_per_second': 8.709, 'epoch': 5.0}