In [1]:
# from IPython.display import clear_output
# !pip install transformers datasets torch evaluate
# clear_output()

In [2]:
# importing libraries 
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import os
from os import path

In [3]:
def load_dataset(dataset_path):
    pos_path = os.path.join(dataset_path, 'pos')
    neg_path = os.path.join(dataset_path, 'neg')
    pos_names = os.listdir(pos_path)
    neg_names = os.listdir(neg_path)
    df = []
    for pname in pos_names:
        file_path = os.path.join(pos_path, pname)
        with open(file_path, 'rt', encoding='utf8') as f:
            df.append([1, f.read()])
    
    for nname in neg_names:
        file_path = os.path.join(neg_path, nname)
        with open(file_path, 'rt', encoding='utf8') as f:
            df.append([0, f.read()])
    df = pd.DataFrame(data=df, columns=['label', 'text'])
    df = df.sample(frac=1).reset_index(drop=True)
    return df

In [4]:
keep_ratio = 1.0
df = load_dataset(r'data\TextClassification\review_polarity')
# df.text = df.text.apply(lambda d: d[:50])
df.dropna(inplace=True)
df = df.iloc[:int(keep_ratio*df.shape[0])]
target_classes = ["Negative", "Positive"]
df.shape

(2000, 2)

In [5]:
# here in this dataset I found two datasets for training and testing purpose seprately

df_train = df.iloc[:int(0.9*df.shape[0])]
df_test = df.iloc[int(0.9*df.shape[0]):]

In [6]:
def combine_title_and_description(df):
    # Returns a dataset with the title and description fields combined
    df['text'] = df[['Title', 'Description']].agg('. '.join, axis=1)
    df = df.drop(['Title', 'Description'], axis=1)
    return df

In [7]:
#Tockenize and removing stopwords,punctuations and other irrelevant characters  
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word.lower() not in stop_words]
    return " ".join(words)


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [8]:
df_train['text'] = df_train['text'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['text'] = df_train['text'].apply(remove_stopwords)


In [9]:
df_test['text']=df_test['text'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['text']=df_test['text'].apply(remove_stopwords)


In [10]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [13]:
# Set the format for PyTorch
train_dataset = train_dataset.rename_column('label', 'labels')
test_dataset = test_dataset.rename_column('label', 'labels')
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [14]:
training_args = TrainingArguments(
    output_dir=r'logs/OtherModels/bert_ag_results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    overwrite_output_dir=True,
    num_train_epochs=20,
    weight_decay=0.01,
    save_total_limit=20,
    save_strategy="no"
)

# Load the metrics
import evaluate;
accuracy_metric = evaluate.load('accuracy', trust_remote_code=True)
precision_metric = evaluate.load('precision', trust_remote_code=True)
recall_metric = evaluate.load('recall', trust_remote_code=True)

def compute_metrics(p):
    predictions, labels = p.predictions, p.label_ids
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="macro")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="macro")
    
    return {
        'accuracy': accuracy['accuracy'],
        'precision': precision['precision'],
        'recall': recall['recall']
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat May 18 18:36:55 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--precision\4e7f439a346715f68500ce6f2be82bf3272abd3f20bdafd203a2c4f85b61dd5f (last modified on Sat May 18 18:37:00 2024) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\fardin\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--recall\e40e6e98d18ff3f210f4d0b26fa721bfaa80704b1fdf890fa551cfabf94fc185 (last modified on Sat May 18 18:37:06 2024) since it couldn't be found locally

In [15]:
torch.cuda.empty_cache()
import gc
gc.collect()

61

In [16]:
# Train the model
# trainer.train(resume_from_checkpoint=r"logs/OtherModels/bert_ag_results/last_epoch")
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  0%|          | 0/3600 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.33075278997421265, 'eval_accuracy': 0.895, 'eval_precision': 0.8994949494949495, 'eval_recall': 0.8955395539553955, 'eval_runtime': 2.311, 'eval_samples_per_second': 86.541, 'eval_steps_per_second': 8.654, 'epoch': 1.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.3787928819656372, 'eval_accuracy': 0.84, 'eval_precision': 0.8492431263515601, 'eval_recall': 0.8391839183918393, 'eval_runtime': 2.151, 'eval_samples_per_second': 92.98, 'eval_steps_per_second': 9.298, 'epoch': 2.0}
{'loss': 0.4513, 'learning_rate': 1.7222222222222224e-05, 'epoch': 2.78}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.5747211575508118, 'eval_accuracy': 0.875, 'eval_precision': 0.875250100040016, 'eval_recall': 0.875137513751375, 'eval_runtime': 2.147, 'eval_samples_per_second': 93.151, 'eval_steps_per_second': 9.315, 'epoch': 3.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.8095542788505554, 'eval_accuracy': 0.855, 'eval_precision': 0.8708333333333333, 'eval_recall': 0.856035603560356, 'eval_runtime': 2.084, 'eval_samples_per_second': 95.969, 'eval_steps_per_second': 9.597, 'epoch': 4.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.6195687651634216, 'eval_accuracy': 0.875, 'eval_precision': 0.875250100040016, 'eval_recall': 0.875137513751375, 'eval_runtime': 2.1431, 'eval_samples_per_second': 93.324, 'eval_steps_per_second': 9.332, 'epoch': 5.0}
{'loss': 0.0944, 'learning_rate': 1.4444444444444446e-05, 'epoch': 5.56}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.7497654557228088, 'eval_accuracy': 0.87, 'eval_precision': 0.8699869986998701, 'eval_recall': 0.8699869986998701, 'eval_runtime': 2.069, 'eval_samples_per_second': 96.665, 'eval_steps_per_second': 9.666, 'epoch': 6.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.8025583624839783, 'eval_accuracy': 0.87, 'eval_precision': 0.8725677991733038, 'eval_recall': 0.8695869586958695, 'eval_runtime': 2.1251, 'eval_samples_per_second': 94.115, 'eval_steps_per_second': 9.411, 'epoch': 7.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.8990502953529358, 'eval_accuracy': 0.865, 'eval_precision': 0.8709415584415584, 'eval_recall': 0.8656365636563657, 'eval_runtime': 2.14, 'eval_samples_per_second': 93.458, 'eval_steps_per_second': 9.346, 'epoch': 8.0}
{'loss': 0.024, 'learning_rate': 1.1666666666666668e-05, 'epoch': 8.33}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.7842501997947693, 'eval_accuracy': 0.89, 'eval_precision': 0.8902011810629566, 'eval_recall': 0.8898889888988899, 'eval_runtime': 2.167, 'eval_samples_per_second': 92.292, 'eval_steps_per_second': 9.229, 'epoch': 9.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.059200644493103, 'eval_accuracy': 0.86, 'eval_precision': 0.8690537084398977, 'eval_recall': 0.8607860786078608, 'eval_runtime': 2.242, 'eval_samples_per_second': 89.206, 'eval_steps_per_second': 8.921, 'epoch': 10.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.8110437989234924, 'eval_accuracy': 0.89, 'eval_precision': 0.8900890089008902, 'eval_recall': 0.8900890089008902, 'eval_runtime': 2.056, 'eval_samples_per_second': 97.274, 'eval_steps_per_second': 9.727, 'epoch': 11.0}
{'loss': 0.0041, 'learning_rate': 8.888888888888888e-06, 'epoch': 11.11}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.9669346809387207, 'eval_accuracy': 0.88, 'eval_precision': 0.87998799879988, 'eval_recall': 0.87998799879988, 'eval_runtime': 2.072, 'eval_samples_per_second': 96.523, 'eval_steps_per_second': 9.652, 'epoch': 12.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.9848620891571045, 'eval_accuracy': 0.88, 'eval_precision': 0.8801921729556601, 'eval_recall': 0.87988798879888, 'eval_runtime': 2.15, 'eval_samples_per_second': 93.023, 'eval_steps_per_second': 9.302, 'epoch': 13.0}
{'loss': 0.0018, 'learning_rate': 6.111111111111112e-06, 'epoch': 13.89}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.1170169115066528, 'eval_accuracy': 0.855, 'eval_precision': 0.8607954545454546, 'eval_recall': 0.8556355635563557, 'eval_runtime': 2.244, 'eval_samples_per_second': 89.126, 'eval_steps_per_second': 8.913, 'epoch': 14.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.0294232368469238, 'eval_accuracy': 0.875, 'eval_precision': 0.8750500200080031, 'eval_recall': 0.874937493749375, 'eval_runtime': 2.056, 'eval_samples_per_second': 97.276, 'eval_steps_per_second': 9.728, 'epoch': 15.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.0867323875427246, 'eval_accuracy': 0.87, 'eval_precision': 0.8706766917293233, 'eval_recall': 0.8697869786978698, 'eval_runtime': 2.122, 'eval_samples_per_second': 94.25, 'eval_steps_per_second': 9.425, 'epoch': 16.0}
{'loss': 0.0041, 'learning_rate': 3.3333333333333333e-06, 'epoch': 16.67}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.0902645587921143, 'eval_accuracy': 0.875, 'eval_precision': 0.875400641025641, 'eval_recall': 0.8748374837483748, 'eval_runtime': 2.097, 'eval_samples_per_second': 95.377, 'eval_steps_per_second': 9.538, 'epoch': 17.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.0943711996078491, 'eval_accuracy': 0.875, 'eval_precision': 0.875400641025641, 'eval_recall': 0.8748374837483748, 'eval_runtime': 2.313, 'eval_samples_per_second': 86.467, 'eval_steps_per_second': 8.647, 'epoch': 18.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.0811272859573364, 'eval_accuracy': 0.87, 'eval_precision': 0.8714702039995981, 'eval_recall': 0.8696869686968697, 'eval_runtime': 2.062, 'eval_samples_per_second': 96.992, 'eval_steps_per_second': 9.699, 'epoch': 19.0}
{'loss': 0.0001, 'learning_rate': 5.555555555555555e-07, 'epoch': 19.44}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.0815297365188599, 'eval_accuracy': 0.87, 'eval_precision': 0.8714702039995981, 'eval_recall': 0.8696869686968697, 'eval_runtime': 2.07, 'eval_samples_per_second': 96.618, 'eval_steps_per_second': 9.662, 'epoch': 20.0}
{'train_runtime': 1136.523, 'train_samples_per_second': 31.676, 'train_steps_per_second': 3.168, 'train_loss': 0.08050686343262593, 'epoch': 20.0}


  0%|          | 0/20 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 1.0815297365188599, 'eval_accuracy': 0.87, 'eval_precision': 0.8714702039995981, 'eval_recall': 0.8696869686968697, 'eval_runtime': 2.7, 'eval_samples_per_second': 74.073, 'eval_steps_per_second': 7.407, 'epoch': 20.0}


In [17]:
trainer.save_model(r"logs/OtherModels/bert_ag_results/last_epoch")

In [18]:
{'eval_loss': 0.33075278997421265, 'eval_accuracy': 0.895, 'eval_precision': 0.8994949494949495, 'eval_recall': 0.8955395539553955, 'eval_runtime': 2.311, 'eval_samples_per_second': 86.541, 'eval_steps_per_second': 8.654, 'epoch': 1.0}


SyntaxError: invalid syntax (551125743.py, line 1)