In [1]:
# from IPython.display import clear_output
# !pip install transformers datasets torch evaluate
# clear_output()

In [2]:
# importing libraries 
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import os
from os import path

In [3]:
def load_dataset(dataset_path):
    pos_path = os.path.join(dataset_path, 'pos')
    neg_path = os.path.join(dataset_path, 'neg')
    pos_names = os.listdir(pos_path)
    neg_names = os.listdir(neg_path)
    df = []
    for pname in pos_names:
        file_path = os.path.join(pos_path, pname)
        with open(file_path, 'rt', encoding='utf8') as f:
            df.append([1, f.read()])
    
    for nname in neg_names:
        file_path = os.path.join(neg_path, nname)
        with open(file_path, 'rt', encoding='utf8') as f:
            df.append([0, f.read()])
    df = pd.DataFrame(data=df, columns=['label', 'text'])
    df = df.sample(frac=1).reset_index(drop=True)
    return df

In [4]:
keep_ratio = 1.0
df = load_dataset(r'data\TextClassification\review_polarity')
# df.text = df.text.apply(lambda d: d[:50])
df.dropna(inplace=True)
df = df.iloc[:int(keep_ratio*df.shape[0])]
target_classes = ["Negative", "Positive"]
df.shape

(2000, 2)

In [5]:
# here in this dataset I found two datasets for training and testing purpose seprately

df_train = df.iloc[:int(0.9*df.shape[0])]
df_test = df.iloc[int(0.9*df.shape[0]):]

In [6]:
def combine_title_and_description(df):
    # Returns a dataset with the title and description fields combined
    df['text'] = df[['Title', 'Description']].agg('. '.join, axis=1)
    df = df.drop(['Title', 'Description'], axis=1)
    return df

In [7]:
#Tockenize and removing stopwords,punctuations and other irrelevant characters  
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word.lower() not in stop_words]
    return " ".join(words)


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [8]:
df_train['text'] = df_train['text'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['text'] = df_train['text'].apply(remove_stopwords)


In [9]:
df_test['text']=df_test['text'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['text']=df_test['text'].apply(remove_stopwords)


In [10]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [11]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [13]:
# Set the format for PyTorch
train_dataset = train_dataset.rename_column('label', 'labels')
test_dataset = test_dataset.rename_column('label', 'labels')
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [14]:
training_args = TrainingArguments(
    output_dir=r'logs/OtherModels/bert_ag_results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    overwrite_output_dir=True,
    num_train_epochs=20,
    weight_decay=0.01,
    save_total_limit=20,
    save_strategy="no"
)

# Load the metrics
import evaluate;
accuracy_metric = evaluate.load('accuracy', trust_remote_code=True)
precision_metric = evaluate.load('precision', trust_remote_code=True)
recall_metric = evaluate.load('recall', trust_remote_code=True)

def compute_metrics(p):
    predictions, labels = p.predictions, p.label_ids
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="macro")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="macro")
    
    return {
        'accuracy': accuracy['accuracy'],
        'precision': precision['precision'],
        'recall': recall['recall']
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat May 18 18:36:55 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--precision\4e7f439a346715f68500ce6f2be82bf3272abd3f20bdafd203a2c4f85b61dd5f (last modified on Sat May 18 18:37:00 2024) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--recall\e40e6e98d18ff3f210f4d0b26fa721bfaa80704b1fdf890fa551cfabf94fc185 (last modified on Sat May 18 18:37:06 2024) since it couldn't be found locally

In [15]:
torch.cuda.empty_cache()
import gc
gc.collect()

62

In [16]:
# Train the model
# trainer.train(resume_from_checkpoint=r"logs/OtherModels/bert_ag_results/last_epoch")
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.4836529493331909, 'eval_accuracy': 0.785, 'eval_precision': 0.8198294243070363, 'eval_recall': 0.7849999999999999, 'eval_runtime': 1.128, 'eval_samples_per_second': 177.31, 'eval_steps_per_second': 8.866, 'epoch': 1.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.29184597730636597, 'eval_accuracy': 0.89, 'eval_precision': 0.890625, 'eval_recall': 0.89, 'eval_runtime': 1.09, 'eval_samples_per_second': 183.483, 'eval_steps_per_second': 9.174, 'epoch': 2.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3148438334465027, 'eval_accuracy': 0.875, 'eval_precision': 0.8795930762222897, 'eval_recall': 0.875, 'eval_runtime': 1.055, 'eval_samples_per_second': 189.571, 'eval_steps_per_second': 9.479, 'epoch': 3.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3410910367965698, 'eval_accuracy': 0.9, 'eval_precision': 0.904040404040404, 'eval_recall': 0.8999999999999999, 'eval_runtime': 1.091, 'eval_samples_per_second': 183.312, 'eval_steps_per_second': 9.166, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.4322879910469055, 'eval_accuracy': 0.9, 'eval_precision': 0.9006410256410255, 'eval_recall': 0.9, 'eval_runtime': 1.057, 'eval_samples_per_second': 189.208, 'eval_steps_per_second': 9.46, 'epoch': 5.0}
{'loss': 0.2722, 'learning_rate': 1.4444444444444446e-05, 'epoch': 5.56}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.5127644538879395, 'eval_accuracy': 0.89, 'eval_precision': 0.89015606242497, 'eval_recall': 0.89, 'eval_runtime': 1.0589, 'eval_samples_per_second': 188.871, 'eval_steps_per_second': 9.444, 'epoch': 6.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.5681416988372803, 'eval_accuracy': 0.89, 'eval_precision': 0.89015606242497, 'eval_recall': 0.89, 'eval_runtime': 1.072, 'eval_samples_per_second': 186.559, 'eval_steps_per_second': 9.328, 'epoch': 7.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.5785142779350281, 'eval_accuracy': 0.89, 'eval_precision': 0.892512077294686, 'eval_recall': 0.89, 'eval_runtime': 1.101, 'eval_samples_per_second': 181.651, 'eval_steps_per_second': 9.083, 'epoch': 8.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.640349268913269, 'eval_accuracy': 0.885, 'eval_precision': 0.8853468121309178, 'eval_recall': 0.885, 'eval_runtime': 1.049, 'eval_samples_per_second': 190.658, 'eval_steps_per_second': 9.533, 'epoch': 9.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.6455232501029968, 'eval_accuracy': 0.88, 'eval_precision': 0.8801520608243297, 'eval_recall': 0.88, 'eval_runtime': 1.084, 'eval_samples_per_second': 184.498, 'eval_steps_per_second': 9.225, 'epoch': 10.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.6533029079437256, 'eval_accuracy': 0.895, 'eval_precision': 0.8950395039503951, 'eval_recall': 0.895, 'eval_runtime': 1.064, 'eval_samples_per_second': 187.965, 'eval_steps_per_second': 9.398, 'epoch': 11.0}
{'loss': 0.0077, 'learning_rate': 8.888888888888888e-06, 'epoch': 11.11}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.7211131453514099, 'eval_accuracy': 0.89, 'eval_precision': 0.892512077294686, 'eval_recall': 0.89, 'eval_runtime': 1.191, 'eval_samples_per_second': 167.926, 'eval_steps_per_second': 8.396, 'epoch': 12.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.713451623916626, 'eval_accuracy': 0.89, 'eval_precision': 0.89015606242497, 'eval_recall': 0.89, 'eval_runtime': 1.073, 'eval_samples_per_second': 186.393, 'eval_steps_per_second': 9.32, 'epoch': 13.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.6774479150772095, 'eval_accuracy': 0.885, 'eval_precision': 0.8853468121309178, 'eval_recall': 0.885, 'eval_runtime': 1.083, 'eval_samples_per_second': 184.67, 'eval_steps_per_second': 9.234, 'epoch': 14.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.6872986555099487, 'eval_accuracy': 0.895, 'eval_precision': 0.8950395039503951, 'eval_recall': 0.895, 'eval_runtime': 1.086, 'eval_samples_per_second': 184.167, 'eval_steps_per_second': 9.208, 'epoch': 15.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.6924662590026855, 'eval_accuracy': 0.895, 'eval_precision': 0.8950395039503951, 'eval_recall': 0.895, 'eval_runtime': 1.143, 'eval_samples_per_second': 174.978, 'eval_steps_per_second': 8.749, 'epoch': 16.0}
{'loss': 0.0009, 'learning_rate': 3.3333333333333333e-06, 'epoch': 16.67}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.7061649560928345, 'eval_accuracy': 0.895, 'eval_precision': 0.8950395039503951, 'eval_recall': 0.895, 'eval_runtime': 1.082, 'eval_samples_per_second': 184.839, 'eval_steps_per_second': 9.242, 'epoch': 17.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.7137009501457214, 'eval_accuracy': 0.895, 'eval_precision': 0.8950395039503951, 'eval_recall': 0.895, 'eval_runtime': 1.079, 'eval_samples_per_second': 185.356, 'eval_steps_per_second': 9.268, 'epoch': 18.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.704140305519104, 'eval_accuracy': 0.9, 'eval_precision': 0.9, 'eval_recall': 0.9, 'eval_runtime': 1.0771, 'eval_samples_per_second': 185.691, 'eval_steps_per_second': 9.285, 'epoch': 19.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.6951531171798706, 'eval_accuracy': 0.895, 'eval_precision': 0.8950395039503951, 'eval_recall': 0.895, 'eval_runtime': 1.178, 'eval_samples_per_second': 169.779, 'eval_steps_per_second': 8.489, 'epoch': 20.0}
{'train_runtime': 546.728, 'train_samples_per_second': 65.846, 'train_steps_per_second': 3.292, 'train_loss': 0.07803964269244008, 'epoch': 20.0}


  0%|          | 0/10 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.6951531171798706, 'eval_accuracy': 0.895, 'eval_precision': 0.8950395039503951, 'eval_recall': 0.895, 'eval_runtime': 1.715, 'eval_samples_per_second': 116.618, 'eval_steps_per_second': 5.831, 'epoch': 20.0}


In [19]:
trainer.save_model(r"logs/OtherModels/bert_ag_results/last_epoch")

In [None]:
{'eval_loss': 0.3410910367965698, 'eval_accuracy': 0.9, 'eval_precision': 0.904040404040404, 'eval_recall': 0.8999999999999999, 'eval_runtime': 1.091, 'eval_samples_per_second': 183.312, 'eval_steps_per_second': 9.166, 'epoch': 4.0}