In [1]:
# from IPython.display import clear_output
# !pip install transformers datasets torch evaluate
# clear_output()

In [2]:
# importing libraries 
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

In [6]:
# The dataset ships as two separate CSV files: one for training and one for testing.
keep_ratio = 0.1  # fraction of each split that is kept
SEED = 42         # fixed seed so the same sub-sample is drawn on every run


def load_review_split(csv_path, keep_ratio, seed):
    """Load one headerless review CSV and return a reproducible random sub-sample.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file without a header row whose three columns are
        label, title and description (in that order).
    keep_ratio : float
        Fraction of the non-null rows to keep; the row count is truncated
        with ``int`` exactly as before.
    seed : int
        Seed passed to ``DataFrame.sample`` so the selection is repeatable.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``Title``, ``Description`` with a fresh 0..n-1
        index; ``label`` is the value from the file minus one.
    """
    reviews = pd.read_csv(csv_path, header=None).dropna()
    reviews.columns = ['label', 'Title', 'Description']
    n_keep = int(keep_ratio * reviews.shape[0])
    # Sampling n rows without replacement is the same as "shuffle, then take the
    # first n", but it returns a new frame (no chained-assignment warnings).
    reviews = reviews.sample(n=n_keep, random_state=seed).reset_index(drop=True)
    reviews['label'] = reviews['label'] - 1  # shift labels down by one (model uses num_labels=2)
    return reviews


# Forward slashes work on Windows as well as on Linux/macOS.
df_train = load_review_split('data/TextClassification/AmazonReview/train.csv', keep_ratio, SEED)
df_test = load_review_split('data/TextClassification/AmazonReview/test.csv', keep_ratio, SEED)

In [7]:
def combine_title_and_description(df):
    """Return a copy of ``df`` with ``Title`` and ``Description`` merged into ``text``.

    The new ``text`` column is ``"<Title>. <Description>"``; both source
    columns are removed from the result. The input frame is left untouched,
    so the function can be called on the same frame more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``Title`` and ``Description``.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns followed by ``text``.
    """
    # Vectorised concatenation; astype(str) keeps non-string cells (e.g. a purely
    # numeric title) from raising a TypeError.
    combined = df['Title'].astype(str) + '. ' + df['Description'].astype(str)
    return df.assign(text=combined).drop(columns=['Title', 'Description'])

In [12]:
# Merge title and description into a single `text` column for both splits,
# then preview the training frame.
df_train, df_test = (
    combine_title_and_description(frame) for frame in (df_train, df_test)
)
df_train.head()

Unnamed: 0,label,text
0,1,If you're Celiac and miss convenient meals.......
1,0,Not Recommened; Too broad and general. Kind of...
2,0,Price. Perhaps more people would read this boo...
3,0,This item was not for me..But the seller had g...
4,0,Super Overpriced. It's a good product because ...


In [13]:
# Split the text on whitespace and remove English stopwords (punctuation and other characters are left untouched)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp

# Only hit the network when the stopword corpus is not already installed locally;
# this keeps the cell working (and quiet) when running offline.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# NOTE(review): NLTK's English list contains negations such as "not" and "no";
# dropping them may hurt a sentiment classifier — TODO evaluate.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the tokens found in the module-level ``stop_words``.

    The text is split on whitespace and each token is compared in lowercase,
    so a token with attached punctuation (e.g. ``"the,"``) is kept. Surviving
    tokens are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token.lower() not in stop_words, text.split())
    return " ".join(kept_tokens)


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [14]:
# Strip stopwords from every training text.
df_train['text'] = df_train['text'].map(remove_stopwords)

In [15]:
# Strip stopwords from every test text.
df_test['text'] = df_test['text'].map(remove_stopwords)

In [16]:
# Wrap both pandas frames as Hugging Face `Dataset` objects.
train_dataset, test_dataset = Dataset.from_pandas(df_train), Dataset.from_pandas(df_test)

In [17]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# the classification head is created for two labels.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def tokenize_function(examples):
    """Tokenize the ``text`` entries of ``examples`` with the module-level tokenizer.

    Sequences are truncated and padded with ``padding='max_length'``.
    NOTE(review): fixed-length padding makes every example as long as the
    longest allowed sequence; dynamic padding would likely train faster, but
    it needs a matching data collator on the Trainer — TODO confirm.
    """
    batch_text = examples['text']
    encoded = tokenizer(batch_text, truncation=True, padding='max_length')
    return encoded

# Tokenize both splits, passing batches of examples to `tokenize_function`.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/359979 [00:00<?, ? examples/s]

Map:   0%|          | 0/39997 [00:00<?, ? examples/s]

In [19]:
# Rename the target column to `labels` and expose only the listed columns
# as PyTorch tensors.
model_columns = ['input_ids', 'attention_mask', 'labels']

train_dataset = train_dataset.rename_column('label', 'labels')
test_dataset = test_dataset.rename_column('label', 'labels')

train_dataset.set_format('torch', columns=list(model_columns))
test_dataset.set_format('torch', columns=list(model_columns))

In [21]:
# Fine-tuning configuration for the Trainer.
training_args = TrainingArguments(
    # Where run artefacts go
    output_dir=r'logs/OtherModels/bert_amazon_review',
    overwrite_output_dir=True,
    # Optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    # Evaluation and checkpointing
    eval_strategy='epoch',
    save_strategy="no",
    save_total_limit=3,  # NOTE(review): presumably irrelevant while save_strategy is "no"
)

# Load the accuracy, precision and recall metrics used by `compute_metrics`.
import evaluate

accuracy_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name, trust_remote_code=True)
    for metric_name in ('accuracy', 'precision', 'recall')
)

def compute_metrics(p):
    """Compute accuracy, macro precision and macro recall for one evaluation pass.

    ``p.predictions`` is reduced with ``argmax`` over axis 1 to obtain the
    predicted class ids, which are compared against ``p.label_ids``.

    Returns a dict with the keys ``accuracy``, ``precision`` and ``recall``.
    """
    references = p.label_ids
    predicted = np.argmax(p.predictions, axis=1)
    shared = dict(predictions=predicted, references=references)

    accuracy_value = accuracy_metric.compute(**shared)['accuracy']
    precision_value = precision_metric.compute(average="macro", **shared)['precision']
    recall_value = recall_metric.compute(average="macro", **shared)['recall']

    return {
        'accuracy': accuracy_value,
        'precision': precision_value,
        'recall': recall_value,
    }

# Assemble the Trainer.
# NOTE(review): the test split doubles as the per-epoch evaluation set, so
# there is no separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat May 18 18:36:55 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--precision\4e7f439a346715f68500ce6f2be82bf3272abd3f20bdafd203a2c4f85b61dd5f (last modified on Sat May 18 18:37:00 2024) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--recall\e40e6e98d18ff3f210f4d0b26fa721bfaa80704b1fdf890fa551cfabf94fc185 (last modified on Sat May 18 18:37:06 2024) since it couldn't be found locally

In [22]:
# Collect unreachable Python objects first so that any GPU tensors they still
# hold are freed, and only then ask PyTorch to release its unused cached CUDA
# memory. In the opposite order the memory freed by the collection stays cached.
import gc
gc.collect()
torch.cuda.empty_cache()

86

In [23]:
# Train the model (fine-tune from the freshly loaded checkpoint).
# To resume instead, use the line below.
# NOTE(review): the path points at `bert_ag_results`, not at this run's output_dir — confirm before enabling.
# trainer.train(resume_from_checkpoint=r"logs/OtherModels/bert_ag_results/last_epoch")
trainer.train()

# Evaluate the model on the Trainer's eval_dataset and print the metrics dict.
# With eval_strategy='epoch' an evaluation already runs at the end of the last
# epoch, so this repeats that pass (the logged numbers are identical).
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  0%|          | 0/179990 [00:00<?, ?it/s]

{'loss': 0.38, 'learning_rate': 1.9944441357853216e-05, 'epoch': 0.01}
{'loss': 0.3149, 'learning_rate': 1.988888271570643e-05, 'epoch': 0.03}
{'loss': 0.2922, 'learning_rate': 1.9833324073559642e-05, 'epoch': 0.04}
{'loss': 0.2923, 'learning_rate': 1.977776543141286e-05, 'epoch': 0.06}
{'loss': 0.2839, 'learning_rate': 1.972220678926607e-05, 'epoch': 0.07}
{'loss': 0.2648, 'learning_rate': 1.9666648147119286e-05, 'epoch': 0.08}
{'loss': 0.2542, 'learning_rate': 1.96110895049725e-05, 'epoch': 0.1}
{'loss': 0.2561, 'learning_rate': 1.9555530862825712e-05, 'epoch': 0.11}
{'loss': 0.2462, 'learning_rate': 1.949997222067893e-05, 'epoch': 0.13}
{'loss': 0.2539, 'learning_rate': 1.944441357853214e-05, 'epoch': 0.14}
{'loss': 0.2558, 'learning_rate': 1.9388854936385356e-05, 'epoch': 0.15}
{'loss': 0.2494, 'learning_rate': 1.933329629423857e-05, 'epoch': 0.17}
{'loss': 0.2476, 'learning_rate': 1.9277737652091785e-05, 'epoch': 0.18}
{'loss': 0.2501, 'learning_rate': 1.9222179009945e-05, 'epoch'

  0%|          | 0/4000 [00:00<?, ?it/s]

{'eval_loss': 0.2267047017812729, 'eval_accuracy': 0.9364702352676451, 'eval_precision': 0.9369255373770944, 'eval_recall': 0.9364578146145919, 'eval_runtime': 440.96, 'eval_samples_per_second': 90.704, 'eval_steps_per_second': 9.071, 'epoch': 1.0}
{'loss': 0.1935, 'learning_rate': 1.5999777765431415e-05, 'epoch': 1.0}
{'loss': 0.1577, 'learning_rate': 1.5944219123284627e-05, 'epoch': 1.01}
{'loss': 0.1638, 'learning_rate': 1.5888660481137845e-05, 'epoch': 1.03}
{'loss': 0.1819, 'learning_rate': 1.5833101838991056e-05, 'epoch': 1.04}
{'loss': 0.1484, 'learning_rate': 1.577754319684427e-05, 'epoch': 1.06}
{'loss': 0.167, 'learning_rate': 1.5721984554697485e-05, 'epoch': 1.07}
{'loss': 0.1582, 'learning_rate': 1.5666425912550696e-05, 'epoch': 1.08}
{'loss': 0.1533, 'learning_rate': 1.5610867270403914e-05, 'epoch': 1.1}
{'loss': 0.1593, 'learning_rate': 1.5555308628257126e-05, 'epoch': 1.11}
{'loss': 0.1635, 'learning_rate': 1.549974998611034e-05, 'epoch': 1.13}
{'loss': 0.1571, 'learning

  0%|          | 0/4000 [00:00<?, ?it/s]

{'eval_loss': 0.23643724620342255, 'eval_accuracy': 0.9423456759256944, 'eval_precision': 0.9424980897609315, 'eval_recall': 0.9423529661449366, 'eval_runtime': 435.445, 'eval_samples_per_second': 91.853, 'eval_steps_per_second': 9.186, 'epoch': 2.0}
{'loss': 0.181, 'learning_rate': 1.1999555530862827e-05, 'epoch': 2.0}
{'loss': 0.1123, 'learning_rate': 1.194399688871604e-05, 'epoch': 2.01}
{'loss': 0.0986, 'learning_rate': 1.1888438246569253e-05, 'epoch': 2.03}
{'loss': 0.1012, 'learning_rate': 1.183287960442247e-05, 'epoch': 2.04}
{'loss': 0.089, 'learning_rate': 1.1777320962275682e-05, 'epoch': 2.06}
{'loss': 0.0984, 'learning_rate': 1.1721762320128897e-05, 'epoch': 2.07}
{'loss': 0.1164, 'learning_rate': 1.166620367798211e-05, 'epoch': 2.08}
{'loss': 0.102, 'learning_rate': 1.1610645035835326e-05, 'epoch': 2.1}
{'loss': 0.1223, 'learning_rate': 1.155508639368854e-05, 'epoch': 2.11}
{'loss': 0.0987, 'learning_rate': 1.1499527751541752e-05, 'epoch': 2.13}
{'loss': 0.1071, 'learning_r

  0%|          | 0/4000 [00:00<?, ?it/s]

{'eval_loss': 0.24625465273857117, 'eval_accuracy': 0.942445683426257, 'eval_precision': 0.9424456568485273, 'eval_recall': 0.9424459201433439, 'eval_runtime': 474.8781, 'eval_samples_per_second': 84.226, 'eval_steps_per_second': 8.423, 'epoch': 3.0}
{'loss': 0.1187, 'learning_rate': 7.99933329629424e-06, 'epoch': 3.0}
{'loss': 0.0585, 'learning_rate': 7.943774654147454e-06, 'epoch': 3.01}
{'loss': 0.055, 'learning_rate': 7.888216012000669e-06, 'epoch': 3.03}
{'loss': 0.0543, 'learning_rate': 7.832657369853881e-06, 'epoch': 3.04}
{'loss': 0.0623, 'learning_rate': 7.777098727707096e-06, 'epoch': 3.06}
{'loss': 0.0439, 'learning_rate': 7.721540085560309e-06, 'epoch': 3.07}
{'loss': 0.0618, 'learning_rate': 7.665981443413524e-06, 'epoch': 3.08}
{'loss': 0.055, 'learning_rate': 7.610422801266738e-06, 'epoch': 3.1}
{'loss': 0.0629, 'learning_rate': 7.554864159119951e-06, 'epoch': 3.11}
{'loss': 0.0675, 'learning_rate': 7.499305516973165e-06, 'epoch': 3.13}
{'loss': 0.0576, 'learning_rate': 

  0%|          | 0/4000 [00:00<?, ?it/s]

{'eval_loss': 0.3449901342391968, 'eval_accuracy': 0.9420206515488662, 'eval_precision': 0.9420749126267891, 'eval_recall': 0.942016450467358, 'eval_runtime': 433.228, 'eval_samples_per_second': 92.323, 'eval_steps_per_second': 9.233, 'epoch': 4.0}
{'loss': 0.06, 'learning_rate': 3.999111061725652e-06, 'epoch': 4.0}
{'loss': 0.0266, 'learning_rate': 3.943552419578866e-06, 'epoch': 4.01}
{'loss': 0.0311, 'learning_rate': 3.88799377743208e-06, 'epoch': 4.03}
{'loss': 0.0367, 'learning_rate': 3.832435135285294e-06, 'epoch': 4.04}
{'loss': 0.0231, 'learning_rate': 3.776876493138508e-06, 'epoch': 4.06}
{'loss': 0.0254, 'learning_rate': 3.721317850991722e-06, 'epoch': 4.07}
{'loss': 0.0303, 'learning_rate': 3.665759208844936e-06, 'epoch': 4.08}
{'loss': 0.0253, 'learning_rate': 3.6102005666981503e-06, 'epoch': 4.1}
{'loss': 0.0258, 'learning_rate': 3.554641924551364e-06, 'epoch': 4.11}
{'loss': 0.0332, 'learning_rate': 3.4990832824045783e-06, 'epoch': 4.13}
{'loss': 0.0251, 'learning_rate': 

  0%|          | 0/4000 [00:00<?, ?it/s]

{'eval_loss': 0.3865019679069519, 'eval_accuracy': 0.9425956946771008, 'eval_precision': 0.9425958042874543, 'eval_recall': 0.9425960477518112, 'eval_runtime': 442.543, 'eval_samples_per_second': 90.38, 'eval_steps_per_second': 9.039, 'epoch': 5.0}
{'train_runtime': 56025.9744, 'train_samples_per_second': 32.126, 'train_steps_per_second': 3.213, 'train_loss': 0.11859427891649718, 'epoch': 5.0}


  0%|          | 0/4000 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.3865019679069519, 'eval_accuracy': 0.9425956946771008, 'eval_precision': 0.9425958042874543, 'eval_recall': 0.9425960477518112, 'eval_runtime': 443.4421, 'eval_samples_per_second': 90.197, 'eval_steps_per_second': 9.02, 'epoch': 5.0}


In [19]:
# Save the fine-tuned model under this run's own output directory. The previous
# hard-coded path pointed at `bert_ag_results`, i.e. a different experiment's
# folder, and would have overwritten whatever was stored there.
trainer.save_model(f"{training_args.output_dir}/last_epoch")

In [None]:
# Final evaluation results recorded from the training run above (kept as a comment so this cell stays runnable):
# Evaluation results: {'eval_loss': 0.3865019679069519, 'eval_accuracy': 0.9425956946771008, 'eval_precision': 0.9425958042874543, 'eval_recall': 0.9425960477518112, 'eval_runtime': 443.4421, 'eval_samples_per_second': 90.197, 'eval_steps_per_second': 9.02, 'epoch': 5.0}
