In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

In [5]:
train_df = pd.read_excel("/content/drive/MyDrive/Books/train.xlsx")
eval_df = pd.read_excel("/content/drive/MyDrive/Books/evaluation.xlsx")

In [6]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.2
Looking in indexes: https://pypi.org/simple, https://u

In [9]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
from sklearn.metrics import precision_recall_fscore_support, classification_report
from datasets import Dataset
import random

# ... (the rest of the functions remain the same)

# Replace these lines with the actual paths to your train and eval CSV files
# train_df = pd.read_csv("train.csv")
# eval_df = pd.read_csv("eval.csv")
def generate_negatives(df, multiplier=1):
    negative_df = df.copy()
    for _ in range(multiplier):
        negative_df['reason'] = negative_df['reason'].apply(lambda x: ' '.join(random.sample(x.split(), len(x.split()))))
    negative_df['label'] = 0
    return pd.concat([df, negative_df], ignore_index=True)

def preprocess_dataset(df, tokenizer):
    def encode(example):
        inputs = tokenizer(example['text'], example['reason'], padding=True, truncation=True, max_length=512, return_tensors='pt')
        return {k: v.squeeze(0) for k, v in inputs.items()}
    
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(encode, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset



def compute_metrics(eval_pred: EvalPrediction):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'precision': precision, 'recall': recall, 'f1': f1}

train_df
eval_df

train_df = generate_negatives(train_df, multiplier=1)

models = ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base']
model_results = {}

for model_name in models:
    print(f"Training and evaluating {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    train_dataset = preprocess_dataset(train_df, tokenizer)
    eval_dataset = preprocess_dataset(eval_df, tokenizer)
    
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        logging_dir=f'./logs/{model_name}',
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()  # This line was missing in your code

    # Error Analysis
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    report = classification_report(eval_dataset['label'], preds, output_dict=True)
    model_results[model_name] = report

print("Error analysis:")
for model_name, report in model_results.items():
    print(f"Model: {model_name}")
    print(report)


Training and evaluating bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Map:   0%|          | 0/8244 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1765,2.444418,0.333746,0.989004,0.499075
2,0.1321,3.138164,0.332625,0.991669,0.498159
3,0.0944,3.081343,0.332584,0.986005,0.497395


Training and evaluating distilbert-base-uncased


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

Map:   0%|          | 0/8244 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1861,2.75972,0.333671,0.987338,0.49878
2,0.1324,2.542922,0.333221,0.989004,0.498488
3,0.0928,2.821664,0.333071,0.988004,0.498194


Training and evaluating roberta-base


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Map:   0%|          | 0/8244 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5797,0.690646,0.0,0.0,0.0
2,0.5745,0.660808,0.0,0.0,0.0
3,0.5681,0.673144,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Error analysis:
Model: bert-base-uncased
{'0': {'precision': 0.5922330097087378, 'recall': 0.010168361393565594, 'f1-score': 0.019993444772205833, 'support': 5999}, '1': {'precision': 0.3325840170844105, 'recall': 0.9860046651116294, 'f1-score': 0.4973945200874096, 'support': 3001}, 'accuracy': 0.33555555555555555, 'macro avg': {'precision': 0.4624085133965742, 'recall': 0.4980865132525975, 'f1-score': 0.25869398242980773, 'support': 9000}, 'weighted avg': {'precision': 0.5056544956125593, 'recall': 0.33555555555555555, 'f1-score': 0.17918018110786432, 'support': 9000}}
Model: distilbert-base-uncased
{'0': {'precision': 0.6326530612244898, 'recall': 0.01033505584264044, 'f1-score': 0.020337871084139737, 'support': 5999}, '1': {'precision': 0.3330712199505729, 'recall': 0.988003998667111, 'f1-score': 0.49819373267243555, 'support': 3001}, 'accuracy': 0.3363333333333333, 'macro avg': {'precision': 0.4828621405875314, 'recall': 0.49916952725487573, 'f1-score': 0.2592658018782876, 'support

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
from sklearn.metrics import precision_recall_fscore_support, classification_report
from datasets import Dataset
import random

# ... (the rest of the functions remain the same)

def generate_negatives(df, multiplier=1):
    positive_df = df[df['label'] == 1]
    negative_df = df[df['label'] == 0].sample(len(positive_df), replace=True)
    balanced_df = pd.concat([positive_df, negative_df], ignore_index=True)
    return balanced_df.sample(frac=1).reset_index(drop=True)

def preprocess_dataset(df, tokenizer):
    def encode(example):
        inputs = tokenizer(example['text'], example['reason'], padding=True, truncation=True, max_length=512, return_tensors='pt')
        return {k: v.squeeze(0) for k, v in inputs.items()}
    
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(encode, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset



def compute_metrics(eval_pred: EvalPrediction):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'precision': precision, 'recall': recall, 'f1': f1}

# ... (preprocess_dataset and compute_metrics functions remain the same)

train_df
eval_df

train_df = generate_negatives(train_df, multiplier=1)

models = ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base']
model_results = {}

for model_name in models:
    print(f"Training and evaluating {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    train_dataset = preprocess_dataset(train_df, tokenizer)
    eval_dataset = preprocess_dataset(eval_df, tokenizer)

    num_pos = len(train_df[train_df['label'] == 1])
    num_neg = len(train_df[train_df['label'] == 0])
    pos_weight = num_neg / num_pos

    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        logging_dir=f'./logs/{model_name}',
        report_to="none",
    )

    model.config.loss_function = 'CrossEntropyLoss'
    model.config.pos_weight = pos_weight

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Error Analysis
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    report = classification_report(eval_dataset['label'], preds, output_dict=True)
    model_results[model_name] = report

print("Error analysis:")
for model_name, report in model_results.items():
    print(f"Model: {model_name}")
    print(report)
#This is to address the issue of poor class balance

In [10]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
from sklearn.metrics import precision_recall_fscore_support, classification_report
from datasets import Dataset
import random

# ... (generate_negatives, preprocess_dataset, and compute_metrics functions remain the same)
def generate_negatives(df, multiplier=1):
    positive_df = df[df['label'] == 1]
    negative_df = df[df['label'] == 0].sample(len(positive_df), replace=True)
    balanced_df = pd.concat([positive_df, negative_df], ignore_index=True)
    return balanced_df.sample(frac=1).reset_index(drop=True)

def preprocess_dataset(df, tokenizer):
    def encode(example):
        inputs = tokenizer(example['text'], example['reason'], padding=True, truncation=True, max_length=512, return_tensors='pt')
        return {k: v.squeeze(0) for k, v in inputs.items()}
    
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(encode, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset



def compute_metrics(eval_pred: EvalPrediction):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'precision': precision, 'recall': recall, 'f1': f1}



train_df
eval_df

train_df = generate_negatives(train_df, multiplier=1)

models = ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base', 'sentence-transformers/bert-base-nli-mean-tokens']
model_results = {}

for model_name in models:
    print(f"Training and evaluating {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    train_dataset = preprocess_dataset(train_df, tokenizer)
    eval_dataset = preprocess_dataset(eval_df, tokenizer)

    num_pos = len(train_df[train_df['label'] == 1])
    num_neg = len(train_df[train_df['label'] == 0])
    pos_weight = num_neg / num_pos

    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=5,  # Increase the number of epochs
        learning_rate=2e-5,  # Fine-tune the learning rate
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        logging_dir=f'./logs/{model_name}',
        report_to="none",
        lr_scheduler_type='cosine',  # Use cosine learning rate scheduler
    )

    model.config.loss_function = 'CrossEntropyLoss'
    model.config.pos_weight = pos_weight

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Error Analysis
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    report = classification_report(eval_dataset['label'], preds, output_dict=True)
    model_results[model_name] = report

print("Error analysis:")
for model_name, report in model_results.items():
    print(f"Model: {model_name}")
    print(report)


Training and evaluating bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Map:   0%|          | 0/4122 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2551,2.892632,0.332924,0.993002,0.498661
2,0.1275,4.043421,0.333259,0.992336,0.498953
3,0.0838,4.13967,0.333371,0.994335,0.499331
4,0.0662,4.34534,0.333408,0.993002,0.499204
5,0.0579,4.527711,0.333333,0.994002,0.499247


Training and evaluating distilbert-base-uncased


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

Map:   0%|          | 0/4122 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2626,3.250613,0.333184,0.993669,0.499038
2,0.1167,3.734712,0.333221,0.992003,0.498869
3,0.0821,3.919907,0.333743,0.996335,0.5
4,0.0704,4.005634,0.333594,0.994668,0.499623
5,0.057,4.144705,0.333445,0.994668,0.499456


Training and evaluating roberta-base


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Map:   0%|          | 0/4122 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2622,2.874046,0.334414,0.996335,0.500754
2,0.1296,3.738412,0.333778,0.999667,0.500459
3,0.0946,4.016052,0.333593,1.0,0.500292
4,0.082,4.448793,0.333964,0.999334,0.500626
5,0.0719,4.670692,0.334002,0.999334,0.500668


Training and evaluating sentence-transformers/bert-base-nli-mean-tokens


Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/bert-base-nli-mean-tokens and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4122 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2571,3.158566,0.33404,0.997667,0.500502
2,0.127,3.908104,0.333445,0.996335,0.499666
3,0.0792,3.804852,0.334002,0.998334,0.500543
4,0.0753,4.205793,0.333817,0.996001,0.500042
5,0.0593,4.481509,0.333929,0.997001,0.500293


Error analysis:
Model: bert-base-uncased
{'0': {'precision': 0.6470588235294118, 'recall': 0.005500916819469912, 'f1-score': 0.010909090909090908, 'support': 5999}, '1': {'precision': 0.3333333333333333, 'recall': 0.9940019993335555, 'f1-score': 0.4992468619246862, 'support': 3001}, 'accuracy': 0.33511111111111114, 'macro avg': {'precision': 0.4901960784313726, 'recall': 0.4997514580765127, 'f1-score': 0.25507797641688856, 'support': 9000}, 'weighted avg': {'precision': 0.5424488017429194, 'recall': 0.33511111111111114, 'f1-score': 0.17374260766662442, 'support': 9000}}
Model: distilbert-base-uncased
{'0': {'precision': 0.6666666666666666, 'recall': 0.005334222370395066, 'f1-score': 0.010583760542417728, 'support': 5999}, '1': {'precision': 0.3334450402144772, 'recall': 0.9946684438520493, 'f1-score': 0.4994562034635656, 'support': 3001}, 'accuracy': 0.3352222222222222, 'macro avg': {'precision': 0.5000558534405719, 'recall': 0.5000013331112222, 'f1-score': 0.2550199820029917, 'support

In [11]:
import pandas as pd

# Prepare the data for the table
data = []
for model_name, report in model_results.items():
    precision = report['1']['precision']
    recall = report['1']['recall']
    f1_score = report['1']['f1-score']
    data.append([model_name, precision, recall, f1_score])

# Create the DataFrame and set the column names
results_df = pd.DataFrame(data, columns=['Model', 'Precision', 'Recall', 'F1 Score'])

# Display the DataFrame
print(results_df)


                                             Model  Precision    Recall  \
0                                bert-base-uncased   0.333333  0.994002   
1                          distilbert-base-uncased   0.333445  0.994668   
2                                     roberta-base   0.334002  0.999334   
3  sentence-transformers/bert-base-nli-mean-tokens   0.333929  0.997001   

   F1 Score  
0  0.499247  
1  0.499456  
2  0.500668  
3  0.500293  


To analyze the errors made by each model and discuss possible reasons for these errors, you can start by examining the misclassified examples. This can be done by comparing the ground truth labels with the model's predictions. You can use the following code snippet to get the misclassified examples for each model:

## Error Analysis

In [12]:
misclassified_examples = {}

for model_name in models:
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    misclassified_indices = np.where(eval_dataset['label'] != preds)[0]
    misclassified_examples[model_name] = eval_df.iloc[misclassified_indices]


In [13]:
for model_name, misclassified_df in misclassified_examples.items():
    print(f"\nMisclassified examples for {model_name}:")
    print(misclassified_df)



Misclassified examples for bert-base-uncased:
                                      text                       reason  label
0  the app is crashing when i play a vedio  app crashes during playback      1

Misclassified examples for distilbert-base-uncased:
                                      text                       reason  label
0  the app is crashing when i play a vedio  app crashes during playback      1

Misclassified examples for roberta-base:
                                      text                       reason  label
0  the app is crashing when i play a vedio  app crashes during playback      1

Misclassified examples for sentence-transformers/bert-base-nli-mean-tokens:
                                      text                       reason  label
0  the app is crashing when i play a vedio  app crashes during playback      1
