In [1]:
# from IPython.display import clear_output
# !pip install transformers datasets torch evaluate
# clear_output()

In [2]:
# importing libraries 
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

In [3]:
# The data comes as two separate CSV files: one for training and one for testing.
keep_ratio = 0.4  # fraction of each split that is kept after shuffling
SEED = 42         # fixed shuffle seed so the kept subset is identical on every run


def load_yelp_split(csv_path, keep_ratio, seed=SEED):
    """Read one headerless two-column CSV and return a shuffled, down-sampled frame.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file. Column 0 becomes ``label`` and column 1 becomes ``text``.
    keep_ratio : float
        Fraction of the NaN-free rows to keep after shuffling.
    seed : int
        ``random_state`` passed to the shuffle.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` (raw label minus 1) and ``text`` with a fresh 0..n-1 index.
    """
    frame = pd.read_csv(csv_path, header=None)
    # Fails loudly if the file does not have exactly two columns.
    frame.columns = ['label', 'text']
    frame = frame.dropna()
    frame = frame.sample(frac=1, random_state=seed).reset_index(drop=True)
    n_keep = int(keep_ratio * frame.shape[0])
    # Shift labels down by one so they start at 0 (presumably 1-based in the raw
    # file -- TODO confirm). .assign builds a new frame, so the iloc slice is
    # never written to in place.
    return frame.iloc[:n_keep].assign(label=lambda kept: kept['label'] - 1)


# Forward slashes work on Windows and POSIX alike and match the other paths in this notebook.
df_train = load_yelp_split('data/TextClassification/Yelp/train.csv', keep_ratio)
df_test = load_yelp_split('data/TextClassification/Yelp/test.csv', keep_ratio)

In [4]:
def combine_title_and_description(df):
    """Return a new frame with ``Title`` and ``Description`` merged into ``text``.

    Each row's ``text`` is ``"<Title>. <Description>"``. The ``Title`` and
    ``Description`` columns are removed from the result; all other columns are
    kept. The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``Title`` and ``Description``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two source columns replaced by ``text``.

    NOTE(review): no visible cell calls this helper -- confirm it is still needed.
    """
    merged = df[['Title', 'Description']].agg('. '.join, axis=1)
    # assign + drop both return new frames, so the caller's frame is not touched.
    return df.assign(text=merged).drop(columns=['Title', 'Description'])

In [5]:
# Remove English stopwords from the text (tokens are split on whitespace; punctuation and other characters are left as they are)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp

# Build the English stopword set. Try the locally installed corpus first and
# only download when it is missing, so the cell does not need network access
# on every run.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token whose lowercase form is in ``stop_words``.

    Surviving tokens keep their original casing and order and are re-joined
    with single spaces.
    """
    kept_tokens = (token for token in text.split() if token.lower() not in stop_words)
    return " ".join(kept_tokens)


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [6]:
# Strip stopwords from every training text.
df_train['text'] = df_train['text'].map(remove_stopwords)

In [7]:
# Strip stopwords from every test text.
df_test['text'] = df_test['text'].map(remove_stopwords)

In [8]:
# Wrap both pandas frames as `datasets.Dataset` objects (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test)
)

In [9]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
# num_labels=2 requests a two-class classification head.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level ``tokenizer``.

    Sequences are truncated and padded to the tokenizer's maximum length.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

# Tokenize both splits; batched=True hands tokenize_function a batch of rows per call.
map_options = {'batched': True}
train_dataset = train_dataset.map(tokenize_function, **map_options)
test_dataset = test_dataset.map(tokenize_function, **map_options)

Map:   0%|          | 0/224000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15200 [00:00<?, ? examples/s]

In [11]:
# Prepare both splits for training: expose the target column as 'labels' and
# return the listed columns as PyTorch tensors.
model_columns = ['input_ids', 'attention_mask', 'labels']

# Guard the rename so the cell can be re-run: once 'label' has been renamed, a
# second rename_column('label', ...) call would fail because the column is gone.
if 'label' in train_dataset.column_names:
    train_dataset = train_dataset.rename_column('label', 'labels')
if 'label' in test_dataset.column_names:
    test_dataset = test_dataset.rename_column('label', 'labels')

train_dataset.set_format('torch', columns=model_columns)
test_dataset.set_format('torch', columns=model_columns)

In [12]:
# Configuration of the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- output location ---
    output_dir=r'logs/OtherModels/bert_ag_results',
    overwrite_output_dir=True,
    # --- schedule: 5 epochs, evaluate after each one ---
    num_train_epochs=5,
    eval_strategy='epoch',
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- batching ---
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    # --- checkpointing ---
    # NOTE(review): save_strategy="no" presumably makes save_total_limit moot -- confirm.
    save_strategy="no",
    save_total_limit=3,
)

# Load the three evaluation metrics used by compute_metrics.
import evaluate

metric_load_options = {'trust_remote_code': True}
accuracy_metric = evaluate.load('accuracy', **metric_load_options)
precision_metric = evaluate.load('precision', **metric_load_options)
recall_metric = evaluate.load('recall', **metric_load_options)

def compute_metrics(p):
    """Turn raw model outputs into accuracy, macro precision and macro recall.

    Parameters
    ----------
    p : object with ``predictions`` and ``label_ids`` attributes
        ``predictions`` holds per-class scores with classes along axis 1;
        ``label_ids`` holds the reference class ids.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision`` and ``recall``.
    """
    scores, references = p.predictions, p.label_ids
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.argmax(scores, axis=1)
    shared_inputs = {'predictions': predicted_classes, 'references': references}

    accuracy_result = accuracy_metric.compute(**shared_inputs)
    precision_result = precision_metric.compute(average="macro", **shared_inputs)
    recall_result = recall_metric.compute(average="macro", **shared_inputs)

    return {
        'accuracy': accuracy_result['accuracy'],
        'precision': precision_result['precision'],
        'recall': recall_result['recall'],
    }

# Assemble the Trainer: model + run configuration, the two data splits, and the
# metric callback. The test split is used as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat May 18 18:36:55 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--precision\4e7f439a346715f68500ce6f2be82bf3272abd3f20bdafd203a2c4f85b61dd5f (last modified on Sat May 18 18:37:00 2024) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--recall\e40e6e98d18ff3f210f4d0b26fa721bfaa80704b1fdf890fa551cfabf94fc185 (last modified on Sat May 18 18:37:06 2024) since it couldn't be found locally

In [13]:
# Free memory before training. Collect unreachable Python objects first so any
# tensors they still hold are released, then return the unused cached blocks
# to the GPU. (Emptying the cache before collecting would leave those blocks
# cached.)
import gc

gc.collect()
torch.cuda.empty_cache()

114

In [14]:
# Fine-tune the model with the Trainer built in the previous cells.
# Alternative kept for reference (resumes from a previously saved directory):
# trainer.train(resume_from_checkpoint=r"logs/OtherModels/bert_ag_results/last_epoch")
trainer.train()

# Run a final evaluation pass with the Trainer's evaluation dataset and print the returned metrics.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  0%|          | 0/112000 [00:00<?, ?it/s]

{'loss': 0.3471, 'learning_rate': 1.9910714285714287e-05, 'epoch': 0.02}
{'loss': 0.2848, 'learning_rate': 1.9821428571428575e-05, 'epoch': 0.04}
{'loss': 0.2605, 'learning_rate': 1.973214285714286e-05, 'epoch': 0.07}
{'loss': 0.2468, 'learning_rate': 1.9642857142857145e-05, 'epoch': 0.09}
{'loss': 0.2211, 'learning_rate': 1.955357142857143e-05, 'epoch': 0.11}
{'loss': 0.2309, 'learning_rate': 1.9464285714285715e-05, 'epoch': 0.13}
{'loss': 0.2372, 'learning_rate': 1.9375e-05, 'epoch': 0.16}
{'loss': 0.2203, 'learning_rate': 1.928571428571429e-05, 'epoch': 0.18}
{'loss': 0.2177, 'learning_rate': 1.9196428571428573e-05, 'epoch': 0.2}
{'loss': 0.2173, 'learning_rate': 1.910714285714286e-05, 'epoch': 0.22}
{'loss': 0.2085, 'learning_rate': 1.9017857142857143e-05, 'epoch': 0.25}
{'loss': 0.1976, 'learning_rate': 1.892857142857143e-05, 'epoch': 0.27}
{'loss': 0.1993, 'learning_rate': 1.8839285714285717e-05, 'epoch': 0.29}
{'loss': 0.2131, 'learning_rate': 1.8750000000000002e-05, 'epoch': 0.

  0%|          | 0/1520 [00:00<?, ?it/s]

{'eval_loss': 0.16312259435653687, 'eval_accuracy': 0.954078947368421, 'eval_precision': 0.9541284324240311, 'eval_recall': 0.954064589684348, 'eval_runtime': 168.773, 'eval_samples_per_second': 90.062, 'eval_steps_per_second': 9.006, 'epoch': 1.0}
{'loss': 0.1708, 'learning_rate': 1.598214285714286e-05, 'epoch': 1.0}
{'loss': 0.1071, 'learning_rate': 1.5892857142857142e-05, 'epoch': 1.03}
{'loss': 0.1185, 'learning_rate': 1.580357142857143e-05, 'epoch': 1.05}
{'loss': 0.1324, 'learning_rate': 1.5714285714285715e-05, 'epoch': 1.07}
{'loss': 0.1224, 'learning_rate': 1.5625e-05, 'epoch': 1.09}
{'loss': 0.1237, 'learning_rate': 1.553571428571429e-05, 'epoch': 1.12}
{'loss': 0.1346, 'learning_rate': 1.5446428571428574e-05, 'epoch': 1.14}
{'loss': 0.1254, 'learning_rate': 1.535714285714286e-05, 'epoch': 1.16}
{'loss': 0.129, 'learning_rate': 1.5267857142857144e-05, 'epoch': 1.18}
{'loss': 0.1141, 'learning_rate': 1.5178571428571429e-05, 'epoch': 1.21}
{'loss': 0.1196, 'learning_rate': 1.508

  0%|          | 0/1520 [00:00<?, ?it/s]

{'eval_loss': 0.1704050749540329, 'eval_accuracy': 0.9571052631578948, 'eval_precision': 0.9572977385857298, 'eval_recall': 0.9570758014733203, 'eval_runtime': 168.164, 'eval_samples_per_second': 90.388, 'eval_steps_per_second': 9.039, 'epoch': 2.0}
{'loss': 0.0968, 'learning_rate': 1.1964285714285716e-05, 'epoch': 2.01}
{'loss': 0.0653, 'learning_rate': 1.1875e-05, 'epoch': 2.03}
{'loss': 0.0612, 'learning_rate': 1.1785714285714287e-05, 'epoch': 2.05}
{'loss': 0.0612, 'learning_rate': 1.1696428571428572e-05, 'epoch': 2.08}
{'loss': 0.0708, 'learning_rate': 1.1607142857142859e-05, 'epoch': 2.1}
{'loss': 0.0667, 'learning_rate': 1.1517857142857142e-05, 'epoch': 2.12}
{'loss': 0.0523, 'learning_rate': 1.1428571428571429e-05, 'epoch': 2.14}
{'loss': 0.072, 'learning_rate': 1.1339285714285716e-05, 'epoch': 2.17}
{'loss': 0.0614, 'learning_rate': 1.125e-05, 'epoch': 2.19}
{'loss': 0.0597, 'learning_rate': 1.1160714285714287e-05, 'epoch': 2.21}
{'loss': 0.0639, 'learning_rate': 1.10714285714

  0%|          | 0/1520 [00:00<?, ?it/s]

{'eval_loss': 0.22522953152656555, 'eval_accuracy': 0.9588815789473685, 'eval_precision': 0.9588813766563877, 'eval_recall': 0.9588810032557258, 'eval_runtime': 163.023, 'eval_samples_per_second': 93.238, 'eval_steps_per_second': 9.324, 'epoch': 3.0}
{'loss': 0.0386, 'learning_rate': 7.946428571428571e-06, 'epoch': 3.01}
{'loss': 0.0269, 'learning_rate': 7.857142857142858e-06, 'epoch': 3.04}
{'loss': 0.0207, 'learning_rate': 7.767857142857144e-06, 'epoch': 3.06}
{'loss': 0.0284, 'learning_rate': 7.67857142857143e-06, 'epoch': 3.08}
{'loss': 0.0252, 'learning_rate': 7.589285714285714e-06, 'epoch': 3.1}
{'loss': 0.023, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.12}
{'loss': 0.0251, 'learning_rate': 7.410714285714287e-06, 'epoch': 3.15}
{'loss': 0.0236, 'learning_rate': 7.321428571428572e-06, 'epoch': 3.17}
{'loss': 0.0279, 'learning_rate': 7.232142857142858e-06, 'epoch': 3.19}
{'loss': 0.0263, 'learning_rate': 7.1428571428571436e-06, 'epoch': 3.21}
{'loss': 0.0238, 'learning_rate

  0%|          | 0/1520 [00:00<?, ?it/s]

{'eval_loss': 0.2933151125907898, 'eval_accuracy': 0.9589473684210527, 'eval_precision': 0.9589512995631861, 'eval_recall': 0.958954160089174, 'eval_runtime': 165.224, 'eval_samples_per_second': 91.996, 'eval_steps_per_second': 9.2, 'epoch': 4.0}
{'loss': 0.0097, 'learning_rate': 3.928571428571429e-06, 'epoch': 4.02}
{'loss': 0.0073, 'learning_rate': 3.839285714285715e-06, 'epoch': 4.04}
{'loss': 0.0108, 'learning_rate': 3.7500000000000005e-06, 'epoch': 4.06}
{'loss': 0.0105, 'learning_rate': 3.660714285714286e-06, 'epoch': 4.08}
{'loss': 0.0062, 'learning_rate': 3.5714285714285718e-06, 'epoch': 4.11}
{'loss': 0.0138, 'learning_rate': 3.482142857142857e-06, 'epoch': 4.13}
{'loss': 0.007, 'learning_rate': 3.3928571428571435e-06, 'epoch': 4.15}
{'loss': 0.0058, 'learning_rate': 3.303571428571429e-06, 'epoch': 4.17}
{'loss': 0.0174, 'learning_rate': 3.2142857142857147e-06, 'epoch': 4.2}
{'loss': 0.0134, 'learning_rate': 3.125e-06, 'epoch': 4.22}
{'loss': 0.005, 'learning_rate': 3.03571428

  0%|          | 0/1520 [00:00<?, ?it/s]

{'eval_loss': 0.32755202054977417, 'eval_accuracy': 0.9585526315789473, 'eval_precision': 0.9585608113024959, 'eval_recall': 0.9585474735390149, 'eval_runtime': 164.228, 'eval_samples_per_second': 92.554, 'eval_steps_per_second': 9.255, 'epoch': 5.0}
{'train_runtime': 33984.779, 'train_samples_per_second': 32.956, 'train_steps_per_second': 3.296, 'train_loss': 0.08445435856602021, 'epoch': 5.0}


  0%|          | 0/1520 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.32755202054977417, 'eval_accuracy': 0.9585526315789473, 'eval_precision': 0.9585608113024959, 'eval_recall': 0.9585474735390149, 'eval_runtime': 166.704, 'eval_samples_per_second': 91.18, 'eval_steps_per_second': 9.118, 'epoch': 5.0}


In [19]:
# Persist the model as it stands after the last epoch.
final_model_dir = r"logs/OtherModels/bert_ag_results/last_epoch"
trainer.save_model(final_model_dir)

In [None]:
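# Result pasted from a separate run, kept for reference (its figures differ from the run above; which run it came from is not recorded here):
# Evaluation results: {'eval_loss': 0.527237594127655, 'eval_accuracy': 0.92084, 'eval_precision': 0.9209715883094056, 'eval_recall': 0.92084, 'eval_runtime': 239.3086, 'eval_samples_per_second': 104.468, 'eval_steps_per_second': 10.447, 'epoch': 5.0}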
