<a href="https://colab.research.google.com/github/AliEbadi110/Natural-Language-Processing-Text-Classification-Sample-Projects/blob/main/07_NLP_Transformers_Sentence_Similarity_Custom_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP - Transformers - Sentence Similarity - Custom Dataset**

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install evaluate

In [None]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_metric
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

## 1. Loading Data

In [None]:
# Mount Google Drive into the Colab runtime; the CSV files read below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the labelled sentence pairs (columns: SENTENCE A, SENTENCE B, label) and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/Colab Datasets/Hackathon/Problem 4/train.csv')
df.head()

Unnamed: 0,SENTENCE A,SENTENCE B,label
0,someone is dirtying an animal,a woman is cleaning a shrimp,0
1,a woman in a black dress is pulling a cart and...,a lady is dressed in black and is carrying a w...,0
2,a person is cutting garlic into pieces with a ...,someone is putting ingredients into a wok,0
3,a woman is rock climbing pausing and calculati...,a man is rock climbing and a city and a bay ar...,0
4,a rabbit is playing with a toy rabbit,there is no bunny playing with a stuffed bunny,1


In [None]:
# (rows, columns) of the labelled training frame.
df.shape

(6694, 3)

In [None]:
# Hold out 20% of the labelled pairs for validation.
# random_state pins the shuffle so the split (and every metric reported below) is
# reproducible after Restart & Run All; stratify keeps the label ratio identical in
# both parts, which matters because the two classes are not balanced.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# Wrap both pandas frames as Hugging Face datasets under the split names "train" and "val".
# preserve_index=False: after the random split each frame keeps its shuffled pandas index,
# which from_pandas would otherwise store as an extra `__index_level_0__` column.
dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train, preserve_index=False),
    "val": Dataset.from_pandas(df_val, preserve_index=False)
    })

In [None]:
# Show the two splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['SENTENCE A', 'SENTENCE B', 'label', '__index_level_0__'],
        num_rows: 5355
    })
    val: Dataset({
        features: ['SENTENCE A', 'SENTENCE B', 'label', '__index_level_0__'],
        num_rows: 1339
    })
})

In [None]:
# Inspect one raw training example before tokenization.
dataset['train'][0]

{'SENTENCE A': 'the young kids are posing with a green soccer ball in a park',
 'SENTENCE B': 'four boys are kneeling next to each other in front of a ball',
 'label': 0,
 '__index_level_0__': 1146}

## 2. Preprocessing

In [None]:
# Checkpoint name shared by the tokenizer and the classification model loaded below.
model_name = 'bert-base-uncased'

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Sanity check: encode the first training pair as one two-segment input
# (in the output below token_type_ids is 0 for sentence A and 1 for sentence B).
tokenizer(dataset['train'][0]['SENTENCE A'], dataset['train'][0]['SENTENCE B'], truncation=True)

{'input_ids': [101, 1996, 2402, 4268, 2024, 20540, 2007, 1037, 2665, 4715, 3608, 1999, 1037, 2380, 102, 2176, 3337, 2024, 16916, 2279, 2000, 2169, 2060, 1999, 2392, 1997, 1037, 3608, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
def tokenize_func(example):
  """Tokenize the 'SENTENCE A' / 'SENTENCE B' fields of `example` as a pair.

  Both fields are handed to the notebook-level `tokenizer` in a single call with
  truncation enabled; whatever the tokenizer returns is passed back unchanged.
  """
  first, second = example['SENTENCE A'], example['SENTENCE B']
  encoded = tokenizer(first, second, truncation=True)
  return encoded

In [None]:
# Apply tokenize_func to every split; as the output below shows, this adds the
# input_ids, token_type_ids and attention_mask columns next to the original ones.
tokenized_datasets = dataset.map(tokenize_func, batched=True)
tokenized_datasets

Map:   0%|          | 0/5355 [00:00<?, ? examples/s]

Map:   0%|          | 0/1339 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['SENTENCE A', 'SENTENCE B', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5355
    })
    val: Dataset({
        features: ['SENTENCE A', 'SENTENCE B', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1339
    })
})

## 3. Train and Evaluate Model

In [None]:
# Load the pretrained encoder with a sequence-classification head. The checkpoint has no
# classifier weights, so the head is newly initialised (see the warning below) and must be fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# `load_metric` from `datasets` is deprecated in favour of the standalone `evaluate`
# library (installed in the first cell). `evaluate.load` returns metric objects with the
# same `.compute(predictions=..., references=...)` interface, so the rest is unchanged.
metric1 = evaluate.load('accuracy')
metric2 = evaluate.load('f1')

def compute_metrics(eval_preds):
  """Turn the Trainer's evaluation output into accuracy and F1.

  Args:
    eval_preds: a (logits, labels) pair; the logits are reduced with an argmax over
      the last axis to obtain the predicted class ids.

  Returns:
    dict with the keys "accuracy" and "f1", which the Trainer reports per evaluation.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  accuracy = metric1.compute(predictions=predictions, references=labels)["accuracy"]
  f1 = metric2.compute(predictions=predictions, references=labels)["f1"]

  return {"accuracy": accuracy, "f1": f1}

  metric1 = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [None]:
# Five epochs at batch size 32. Evaluation and checkpointing both happen once per
# epoch so that the best checkpoint can be restored when training ends.
training_args = TrainingArguments(
    output_dir='my_trainer_dir',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

In [None]:
# Bundle model, hyper-parameters, tokenized splits and the metric callback into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
)

In [None]:
# Fine-tune the model; the table below reports validation loss, accuracy and F1 after each epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.210309,0.927558,0.847724
2,No log,0.196807,0.932786,0.857143
3,0.186500,0.249225,0.934279,0.869822
4,0.186500,0.276068,0.933532,0.869693
5,0.186500,0.34807,0.926811,0.859195


TrainOutput(global_step=840, training_loss=0.13225094250270297, metrics={'train_runtime': 277.4343, 'train_samples_per_second': 96.509, 'train_steps_per_second': 3.028, 'total_flos': 632211671595000.0, 'train_loss': 0.13225094250270297, 'epoch': 5.0})

## 4. Evaluate

In [None]:
# Score the model currently held by the trainer on its eval_dataset (the validation split).
trainer.evaluate()

{'eval_loss': 0.19680722057819366,
 'eval_accuracy': 0.9327856609410008,
 'eval_f1': 0.8571428571428571,
 'eval_runtime': 3.3711,
 'eval_samples_per_second': 397.197,
 'eval_steps_per_second': 49.835,
 'epoch': 5.0}

In [None]:
# Run inference on the validation split; `.predictions` holds the model outputs that the next cell feeds to softmax.
val_predictions = trainer.predict(tokenized_datasets['val'])

In [None]:
# Normalise the validation outputs into per-class probabilities along the last axis.
val_preds = torch.Tensor(val_predictions.predictions).softmax(dim=-1)

In [None]:
# Collapse the probabilities to the index of the most likely class per example.
val_preds = val_preds.argmax(dim=-1)

In [None]:
# Per-class precision / recall / F1 of the predicted classes against the held-out labels.
print(classification_report(y_true=df_val['label'], y_pred=val_preds.numpy()))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       998
           1       0.93      0.79      0.86       341

    accuracy                           0.93      1339
   macro avg       0.93      0.89      0.91      1339
weighted avg       0.93      0.93      0.93      1339



In [None]:
# Confusion matrix on the validation split: rows are the true labels, columns the predicted ones.
print(confusion_matrix(y_true=df_val['label'], y_pred=val_preds.numpy()))

[[979  19]
 [ 71 270]]


## 5. Predict

In [None]:
# Load the unlabelled test pairs (SENTENCE A / SENTENCE B only, no label column) and preview them.
df_test = pd.read_csv('/content/drive/MyDrive/Colab Datasets/Hackathon/Problem 4/test.csv')
df_test.head()

Unnamed: 0,SENTENCE A,SENTENCE B
0,a woman is peeling a potato,a woman is not peeling a potato
1,two boys on a couch are reading a book,two boys on a couch are playing video games
2,the man on stage isnt singing into the microphone,a man in a suit is standing at a microphone an...
3,tom is still in a deep coma,tom is still in a light coma
4,there is no dog turning on the grass and pursu...,a dog is turning on the grass and pursuing a f...


In [None]:
# Wrap the test frame in a one-split DatasetDict so it can be tokenized with the same map() call.
dataset_test = DatasetDict({"test": Dataset.from_pandas(df_test)})
dataset_test

In [None]:
# Tokenize the test pairs with the same function used for train/val.
# NOTE(review): this rebinds `tokenized_datasets`, which until here held the train/val
# splits; re-running the training or validation cells after this point requires the
# earlier tokenization cell to be executed again first.
tokenized_datasets = dataset_test.map(tokenize_func, batched=True)
tokenized_datasets

Map:   0%|          | 0/744 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['SENTENCE A', 'SENTENCE B', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 744
    })
})

In [None]:
# Run the fine-tuned model over the tokenized test split.
predictions = trainer.predict(tokenized_datasets['test'])

In [None]:
# Normalise the test outputs into per-class probabilities along the last axis.
preds = torch.Tensor(predictions.predictions).softmax(dim=-1)

In [None]:
# Pick the most likely class index for every test pair.
preds = preds.argmax(dim=-1)

In [None]:
# Collect the predicted class ids in a single-column frame named 'label'
# (this rebinds `df`, which earlier held the training data).
df = pd.DataFrame({'label': preds.numpy()})

In [None]:
# Write the predicted labels to output.csv in the current working directory.
# NOTE(review): no `index=` argument is passed, so the row index is presumably written as
# the first column — confirm this matches the expected submission format.
df.to_csv("output.csv")