In [1]:
# from IPython.display import clear_output
# !pip install transformers datasets torch evaluate
# clear_output()

In [2]:
# importing libraries 
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

In [3]:
# The dataset ships as two separate CSV files (no header row): one for
# training and one for testing. Only a fraction of each file is kept.
keep_ratio = 0.1   # fraction of every split that is kept
sample_seed = 42   # fixed seed so the same subsample is drawn on every run


def load_review_split(csv_path, keep_ratio=keep_ratio, seed=sample_seed):
    """Read one review CSV and return a reproducible random subsample of it.

    Parameters
    ----------
    csv_path : str
        Path of a header-less CSV with three columns: label, title, description.
    keep_ratio : float
        Fraction of the non-null rows to keep.
    seed : int
        Seed for the random subsample.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``label``, ``Title`` and ``Description``, a fresh
        0..n-1 index, and ``label`` decreased by one.
    """
    frame = pd.read_csv(csv_path, header=None)
    frame.columns = ['label', 'Title', 'Description']
    frame = frame.dropna()
    n_keep = int(keep_ratio * frame.shape[0])
    # Drawing n_keep rows at random is the same as shuffling everything and
    # keeping the first n_keep rows, without shuffling the whole file.
    frame = frame.sample(n=n_keep, random_state=seed).reset_index(drop=True)
    # Shift the labels down by one so they start at 0
    # (the classifier further down is created with num_labels=2).
    frame['label'] = frame['label'] - 1
    return frame


# Forward slashes work on Windows as well as on Linux/macOS.
df_train = load_review_split('data/TextClassification/AmazonReview/train.csv')
df_test = load_review_split('data/TextClassification/AmazonReview/test.csv')

In [4]:
def combine_title_and_description(df):
    """Return a new frame whose title and description are merged into ``text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Title`` and ``Description`` columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``Title``/``Description`` and with a ``text``
        column holding ``"<Title>. <Description>"``.

    Notes
    -----
    Values are cast to ``str`` before joining, so a cell that pandas parsed as
    a number does not raise. Missing values would be rendered as ``"nan"``;
    drop them beforehand.
    """
    # Vectorised concatenation instead of a Python-level join per row
    text = df['Title'].astype(str) + '. ' + df['Description'].astype(str)
    # drop() and assign() both return new frames, leaving the caller's df intact
    return df.drop(columns=['Title', 'Description']).assign(text=text)

In [5]:
# Merge title and description into one 'text' column for both splits
df_train = df_train.pipe(combine_title_and_description)
df_test = df_test.pipe(combine_title_and_description)

# Preview the first rows of the training split
df_train.head()

Unnamed: 0,label,text
0,1,FANTASTIC!!. This remote does exactly what it ...
1,0,second rate book. this book is second rate gar...
2,0,Horrible toy. My daughter recieved this from m...
3,1,Wonderful product!. This product really works!...
4,0,Broken Welds. I must say that this companies q...


In [6]:
# Build the set of English stop words that is filtered out of the review text.
# (Only stop words are removed; punctuation and other characters are kept.)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp

try:
    # Prefer the locally installed corpus: no network access is needed and
    # the cell still works offline.
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The corpus is not installed yet: download it once, then load it.
    # If the download fails, the second load raises LookupError again.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stop_words``.

    The text is split on whitespace, each token is compared in lower case
    against the module-level ``stop_words`` set, and the remaining tokens are
    joined back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [7]:
# Strip stop words from every training text
df_train['text'] = df_train['text'].map(remove_stopwords)

In [8]:
# Strip stop words from every test text
df_test['text'] = df_test['text'].map(remove_stopwords)

In [9]:
# Wrap both pandas splits as Hugging Face datasets
train_dataset, test_dataset = Dataset.from_pandas(df_train), Dataset.from_pandas(df_test)

In [10]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# the classifier is created with two output labels.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_name)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    The tokenizer is called with ``padding='max_length'`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

# Tokenize both splits batch-wise (training split first, then the test split)
train_dataset, test_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
]

Map:   0%|          | 0/359979 [00:00<?, ? examples/s]

Map:   0%|          | 0/39997 [00:00<?, ? examples/s]

In [12]:
# Rename 'label' to 'labels' and expose only the listed columns in PyTorch format
tensor_columns = ['input_ids', 'attention_mask', 'labels']

train_dataset = train_dataset.rename_column('label', 'labels')
train_dataset.set_format('torch', columns=tensor_columns)

test_dataset = test_dataset.rename_column('label', 'labels')
test_dataset.set_format('torch', columns=tensor_columns)

In [13]:
# Fine-tuning hyper-parameters, collected in one mapping and passed on as keywords
training_config = dict(
    # where outputs are written
    output_dir=r'logs/OtherModels/bert_amazon_review',
    overwrite_output_dir=True,
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    # evaluate once per epoch; no checkpoints are written during training
    evaluation_strategy='epoch',
    save_strategy="no",
    save_total_limit=3,
)
training_args = TrainingArguments(**training_config)

# Load the accuracy, precision and recall metrics (in that order)
import evaluate

accuracy_metric, precision_metric, recall_metric = [
    evaluate.load(metric_name, trust_remote_code=True)
    for metric_name in ('accuracy', 'precision', 'recall')
]

def compute_metrics(p):
    """Compute accuracy, macro precision and macro recall for one evaluation.

    Parameters
    ----------
    p : object with ``predictions`` and ``label_ids`` attributes
        ``predictions`` holds per-class scores (arg-maxed over axis 1 to get
        the predicted class); ``label_ids`` holds the reference labels.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision`` and ``recall``.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    references = p.label_ids

    def _macro(metric):
        # Precision and recall are macro-averaged over the classes
        return metric.compute(
            predictions=predicted_classes, references=references, average="macro"
        )

    accuracy_value = accuracy_metric.compute(
        predictions=predicted_classes, references=references
    )['accuracy']
    precision_value = _macro(precision_metric)['precision']
    recall_value = _macro(recall_metric)['recall']

    return {
        'accuracy': accuracy_value,
        'precision': precision_value,
        'recall': recall_value,
    }

# Assemble the Trainer: the test split doubles as the evaluation set and
# compute_metrics supplies accuracy / precision / recall for each evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)


Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat May 18 18:36:55 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--precision\4e7f439a346715f68500ce6f2be82bf3272abd3f20bdafd203a2c4f85b61dd5f (last modified on Sat May 18 18:37:00 2024) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--recall\e40e6e98d18ff3f210f4d0b26fa721bfaa80704b1fdf890fa551cfabf94fc185 (last modified on Sat May 18 18:37:06 2024) since it couldn't be found locally

In [14]:
# Free memory before training. Garbage-collect first so that tensors held only
# by unreachable Python objects are released, then let PyTorch return its
# now-unused cached GPU blocks.
import gc
gc.collect()
torch.cuda.empty_cache()

61

In [15]:
# Train the model (training_args requests an evaluation at the end of every epoch).
# Alternative kept for reference: resume from a previously saved model instead.
# trainer.train(resume_from_checkpoint=r"logs/OtherModels/bert_ag_results/last_epoch")
trainer.train()

# Evaluate the model as it stands after the final epoch and print the metrics
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  0%|          | 0/89995 [00:00<?, ?it/s]

{'loss': 0.3345, 'learning_rate': 1.988888271570643e-05, 'epoch': 0.03}
{'loss': 0.2625, 'learning_rate': 1.977776543141286e-05, 'epoch': 0.06}
{'loss': 0.2492, 'learning_rate': 1.9666648147119286e-05, 'epoch': 0.08}
{'loss': 0.2395, 'learning_rate': 1.9555530862825712e-05, 'epoch': 0.11}
{'loss': 0.2293, 'learning_rate': 1.944441357853214e-05, 'epoch': 0.14}
{'loss': 0.2181, 'learning_rate': 1.933329629423857e-05, 'epoch': 0.17}
{'loss': 0.2199, 'learning_rate': 1.9222179009945e-05, 'epoch': 0.19}
{'loss': 0.2103, 'learning_rate': 1.911106172565143e-05, 'epoch': 0.22}
{'loss': 0.2099, 'learning_rate': 1.8999944441357855e-05, 'epoch': 0.25}
{'loss': 0.2007, 'learning_rate': 1.888882715706428e-05, 'epoch': 0.28}
{'loss': 0.2118, 'learning_rate': 1.877770987277071e-05, 'epoch': 0.31}
{'loss': 0.2034, 'learning_rate': 1.866659258847714e-05, 'epoch': 0.33}
{'loss': 0.2023, 'learning_rate': 1.855547530418357e-05, 'epoch': 0.36}
{'loss': 0.2008, 'learning_rate': 1.8444358019889995e-05, 'epoc

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.16372115910053253, 'eval_accuracy': 0.9394204565342401, 'eval_precision': 0.939449156565473, 'eval_recall': 0.9394058987727356, 'eval_runtime': 241.3142, 'eval_samples_per_second': 165.747, 'eval_steps_per_second': 8.288, 'epoch': 1.0}
{'loss': 0.1738, 'learning_rate': 1.5999777765431415e-05, 'epoch': 1.0}
{'loss': 0.128, 'learning_rate': 1.5888660481137845e-05, 'epoch': 1.03}
{'loss': 0.1321, 'learning_rate': 1.577754319684427e-05, 'epoch': 1.06}
{'loss': 0.1316, 'learning_rate': 1.5666425912550696e-05, 'epoch': 1.08}
{'loss': 0.1363, 'learning_rate': 1.5555308628257126e-05, 'epoch': 1.11}
{'loss': 0.1271, 'learning_rate': 1.5444191343963555e-05, 'epoch': 1.14}
{'loss': 0.1342, 'learning_rate': 1.5333074059669984e-05, 'epoch': 1.17}
{'loss': 0.1294, 'learning_rate': 1.5221956775376412e-05, 'epoch': 1.19}
{'loss': 0.1406, 'learning_rate': 1.5110839491082838e-05, 'epoch': 1.22}
{'loss': 0.1345, 'learning_rate': 1.4999722206789267e-05, 'epoch': 1.25}
{'loss': 0.1404, 'lea

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.16300739347934723, 'eval_accuracy': 0.9428207115533666, 'eval_precision': 0.9428185696123728, 'eval_recall': 0.9428247546041171, 'eval_runtime': 226.383, 'eval_samples_per_second': 176.678, 'eval_steps_per_second': 8.835, 'epoch': 2.0}
{'loss': 0.1296, 'learning_rate': 1.1999555530862827e-05, 'epoch': 2.0}
{'loss': 0.0825, 'learning_rate': 1.1888438246569253e-05, 'epoch': 2.03}
{'loss': 0.0934, 'learning_rate': 1.1777320962275682e-05, 'epoch': 2.06}
{'loss': 0.0961, 'learning_rate': 1.166620367798211e-05, 'epoch': 2.08}
{'loss': 0.0924, 'learning_rate': 1.155508639368854e-05, 'epoch': 2.11}
{'loss': 0.0945, 'learning_rate': 1.1443969109394969e-05, 'epoch': 2.14}
{'loss': 0.0847, 'learning_rate': 1.1332851825101396e-05, 'epoch': 2.17}
{'loss': 0.0883, 'learning_rate': 1.1221734540807822e-05, 'epoch': 2.19}
{'loss': 0.0925, 'learning_rate': 1.1110617256514251e-05, 'epoch': 2.22}
{'loss': 0.0953, 'learning_rate': 1.0999499972220679e-05, 'epoch': 2.25}
{'loss': 0.0905, 'lea

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.27354946732521057, 'eval_accuracy': 0.9395954696602246, 'eval_precision': 0.9395961783863758, 'eval_recall': 0.9396034329319265, 'eval_runtime': 226.103, 'eval_samples_per_second': 176.897, 'eval_steps_per_second': 8.846, 'epoch': 3.0}
{'loss': 0.0788, 'learning_rate': 7.99933329629424e-06, 'epoch': 3.0}
{'loss': 0.0618, 'learning_rate': 7.888216012000669e-06, 'epoch': 3.03}
{'loss': 0.0576, 'learning_rate': 7.777098727707096e-06, 'epoch': 3.06}
{'loss': 0.0556, 'learning_rate': 7.665981443413524e-06, 'epoch': 3.08}
{'loss': 0.0593, 'learning_rate': 7.554864159119951e-06, 'epoch': 3.11}
{'loss': 0.0631, 'learning_rate': 7.44374687482638e-06, 'epoch': 3.14}
{'loss': 0.0603, 'learning_rate': 7.332629590532808e-06, 'epoch': 3.17}
{'loss': 0.0546, 'learning_rate': 7.221512306239236e-06, 'epoch': 3.19}
{'loss': 0.0605, 'learning_rate': 7.110395021945664e-06, 'epoch': 3.22}
{'loss': 0.0579, 'learning_rate': 6.999277737652093e-06, 'epoch': 3.25}
{'loss': 0.0552, 'learning_rate

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.2911435663700104, 'eval_accuracy': 0.9399704977873341, 'eval_precision': 0.9401511525544226, 'eval_recall': 0.9400146389388473, 'eval_runtime': 222.931, 'eval_samples_per_second': 179.414, 'eval_steps_per_second': 8.971, 'epoch': 4.0}
{'loss': 0.0643, 'learning_rate': 3.999111061725652e-06, 'epoch': 4.0}
{'loss': 0.0334, 'learning_rate': 3.88799377743208e-06, 'epoch': 4.03}
{'loss': 0.0391, 'learning_rate': 3.776876493138508e-06, 'epoch': 4.06}
{'loss': 0.0383, 'learning_rate': 3.665759208844936e-06, 'epoch': 4.08}
{'loss': 0.0446, 'learning_rate': 3.554641924551364e-06, 'epoch': 4.11}
{'loss': 0.034, 'learning_rate': 3.4435246402577925e-06, 'epoch': 4.14}
{'loss': 0.0326, 'learning_rate': 3.3324073559642205e-06, 'epoch': 4.17}
{'loss': 0.0351, 'learning_rate': 3.2212900716706486e-06, 'epoch': 4.19}
{'loss': 0.0401, 'learning_rate': 3.1101727873770766e-06, 'epoch': 4.22}
{'loss': 0.0387, 'learning_rate': 2.999055503083505e-06, 'epoch': 4.25}
{'loss': 0.0315, 'learning_r

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.34596166014671326, 'eval_accuracy': 0.9401955146635997, 'eval_precision': 0.9401937280755874, 'eval_recall': 0.9402004311418282, 'eval_runtime': 221.32, 'eval_samples_per_second': 180.72, 'eval_steps_per_second': 9.037, 'epoch': 5.0}
{'train_runtime': 26779.0688, 'train_samples_per_second': 67.213, 'train_steps_per_second': 3.361, 'train_loss': 0.1038591234092812, 'epoch': 5.0}


  0%|          | 0/2000 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.34596166014671326, 'eval_accuracy': 0.9401955146635997, 'eval_precision': 0.9401937280755874, 'eval_recall': 0.9402004311418282, 'eval_runtime': 222.039, 'eval_samples_per_second': 180.135, 'eval_steps_per_second': 9.007, 'epoch': 5.0}


In [16]:
# Save the final weights inside this run's own output directory
# (training_args.output_dir) rather than a directory belonging to another
# experiment, so no other saved model is overwritten.
trainer.save_model(f"{trainer.args.output_dir}/last_epoch")

In [1]:
# Reference copy of the epoch-2 evaluation log entry from the training run
# above: the lowest eval_loss and highest eval_accuracy of the five epochs.
{'eval_loss': 0.16300739347934723, 'eval_accuracy': 0.9428207115533666, 'eval_precision': 0.9428185696123728, 'eval_recall': 0.9428247546041171, 'eval_runtime': 226.383, 'eval_samples_per_second': 176.678, 'eval_steps_per_second': 8.835, 'epoch': 2.0}
