In [1]:
!pip install transformers



In [2]:
!pip install accelerate -U #restart the runtime (if running on collab and it throws an import error)



In [3]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_score, recall_score, f1_score
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [6]:
df = pd.read_csv('/content/drive/MyDrive/interIIT/train_df.csv')

In [7]:
df

Unnamed: 0,premise,hypothesis,label
0,angstrom team of research_worker be conducting...,mindfulness research.,1
1,angstrom team of psychologist be research the ...,Nature-based mental health research.,1
2,A team of software developers is creating a la...,AI-powered language education.,1
3,row of book are neatly arrange on the library ...,The library hour_angle form bookshelves.,1
4,A team of researchers is studying the potentia...,Algae-based carbon capture research.,1
...,...,...,...
1009,angstrom team of archeologist be carefully exc...,archaeological discovery.,0
1010,A is mechanic repairing a car's engine.,Vehicle maintenance is taking place.,0
1011,Children riding are on a carousel at the fair.,Kids are enjoying the fair rides.,0
1012,angstrom renewable energy startup development ...,geothermal energy technology innovation.,0


In [8]:
train_df, test_df = train_test_split(df, test_size=0.2)
train_texts = train_df['premise'].tolist()
train_hypotheses = train_df['hypothesis'].tolist()
train_labels = train_df['label'].tolist()
test_texts = test_df['premise'].tolist()
test_hypotheses = test_df['hypothesis'].tolist()
test_labels = test_df['label'].tolist()

In [9]:
train_encodings = tokenizer(train_texts, train_hypotheses, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, test_hypotheses, truncation=True, padding=True)

In [10]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [11]:
train_dataset = MyDataset(train_encodings, train_labels)
test_dataset = MyDataset(test_encodings, test_labels)

In [12]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_dir='/content/drive/MyDrive/interIIT/logs',
    output_dir='/content/drive/MyDrive/interIIT/output'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=153, training_loss=0.35074029560961756, metrics={'train_runtime': 17.6587, 'train_samples_per_second': 137.779, 'train_steps_per_second': 8.664, 'total_flos': 30844464581196.0, 'train_loss': 0.35074029560961756, 'epoch': 3.0})

# Evaluation

In [14]:
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=1)

precision = precision_score(labels, predictions)
recall = recall_score(labels, predictions)
f1 = f1_score(labels, predictions)

print(f'Precision: {precision}, Recall: {recall}, F1 Score: {f1}')

Precision: 0.803921568627451, Recall: 0.9213483146067416, F1 Score: 0.8586387434554974


In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [22]:
def evaluate_positive_class(y_true, y_pred):
    precision = precision_score(y_true, y_pred, pos_label=1)
    recall = recall_score(y_true, y_pred, pos_label=1)
    f1 = f1_score(y_true, y_pred, pos_label=1)
    accuracy = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    tpr = tp / (tp + fn)
    tnr = tn / (tn + fp)
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)

    print(f"Precision for Positive Class: {precision}")
    print(f"Recall for Positive Class: {recall}")
    print(f"F1 Score for Positive Class: {f1}")
    print(f"Overall Accuracy: {accuracy}")
    print(f"True Positive Rate: {tpr}")
    print(f"True Negative Rate: {tnr}")
    print(f"False Positive Rate: {fpr}")
    print(f"False Negative Rate: {fnr}")

In [23]:
evaluate_positive_class(labels, predictions)

Precision for Positive Class: 0.803921568627451
Recall for Positive Class: 0.9213483146067416
F1 Score for Positive Class: 0.8586387434554974
Overall Accuracy: 0.8669950738916257
True Positive Rate: 0.9213483146067416
True Negative Rate: 0.8245614035087719
False Positive Rate: 0.17543859649122806
False Negative Rate: 0.07865168539325842


## manually testing the model

In [15]:
def test_model():
    premise = input("Enter the premise: ")
    hypothesis = input("Enter the hypothesis: ")
    encodings = tokenizer(premise, hypothesis, truncation=True, padding=True, return_tensors="pt")
    encodings = {key: tensor.to(device) for key, tensor in encodings.items()}
    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits
        pred = torch.argmax(logits).item()
    if pred == 1:
        print("The text satisfies the reason.")
    else:
        print("The text does NOT satisfy the reason.")
test_model()


Enter the premise: A dog is running through the field.
Enter the hypothesis: A pet is moving outdoors.
The text satisfies the reason.


In [18]:
test_model()

Enter the premise: A dog is running through the field.
Enter the hypothesis: A dog is standing atill.
The text does NOT satisfy the reason.


In [17]:
test_model()

Enter the premise: The musician played the guitar on stage.
Enter the hypothesis: The mudician is an athlete
The text does NOT satisfy the reason.


In [20]:
test_model()

Enter the premise: The chef is cooking pasta in the kitchen.
Enter the hypothesis: Food is being prepared.
The text satisfies the reason.
