### Install Dependincies

In [None]:
!pip install --upgrade transformers
!pip install --upgrade accelerate
!pip install torch torchvision torchaudio

In [None]:
!pip show transformers
!pip show accelerate
!pip show torchvision
!pip show torchaudio

Name: transformers
Version: 4.40.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
Name: accelerate
Version: 0.30.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
Name: torchvision
Version: 0.17.1+cu121
Summary: image and video datasets and models for torch deep le

### Downloading the Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Download the dataset
!gdown --id 1NdvIddoyYy2idsAWxJ8lodKfD-PZhmyL

Downloading...
From: https://drive.google.com/uc?id=1NdvIddoyYy2idsAWxJ8lodKfD-PZhmyL
To: /content/in_domain_train.tsv
100% 429k/429k [00:00<00:00, 119MB/s]


In [None]:
# Read into a pandas dataframe
df = pd.read_csv("in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])[['label', 'sentence']]
df.head()

Unnamed: 0,label,sentence
0,1,"Our friends won't buy this analysis, let alone..."
1,1,One more pseudo generalization and I'm giving up.
2,1,One more pseudo generalization or I'm giving up.
3,1,"The more we study verbs, the crazier they get."
4,1,Day by day the facts are getting murkier.


In [13]:
df.label.value_counts()

label
1    6023
0    2528
Name: count, dtype: int64

### Model Building, Training and Evaluation

In [None]:
import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [None]:
X = list(df.sentence)
y = list(df.label)

In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
set(y)

{0, 1}

In [None]:
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=len(set(y)), problem_type="binary_classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Lets build custom dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
      text = str(self.texts[idx])
      label = torch.tensor(self.labels[idx])

      encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')
      # encoding = self.tokenizer(text, truncation=True, padding=True, return_tensors='pt')

      return {
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': label,
          # 'loss': torch.tensor(0.0),   # Placeholder for loss calculation
      }

In [None]:
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def binary_classification_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    y_pred = (probs.numpy() >= threshold).astype(int)
    y_true = labels

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, probs.numpy())

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc
    }

    return metrics



# def compute_metrics(p:EvalPrediction):
#     preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

#     if len(preds.shape) == 2 and preds.shape[1] > 1:
#         # Convert multilabel-indicator targets to single-label format
#         preds = np.argmax(preds, axis=1)
#     else:
#         preds = preds.squeeze()

#     result = binary_classification_metrics(predictions=preds,
#                                   labels=p.label_ids)

#     return result

In [None]:
import os

os.makedirs('/content/results', exist_ok=True)
os.makedirs('/content/logs', exist_ok=True)

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience = 4)  # Adjust patience as needed


# Define custom Trainer class with modified loss function
class TrainerWithCustomLoss(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


# Define evaluation metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


# Define TrainingArguments
args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='/content/results',
    num_train_epochs=8,
    save_steps=300,
    save_total_limit=2,
    warmup_steps=150,
    weight_decay=1e-4,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=300,
    logging_dir='/content/logs',
    learning_rate=1e-5,  # Adjust the learning rate as needed
)


# Initialize Trainer
trainer = TrainerWithCustomLoss(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

In [None]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
300,No log,0.560493,0.717709,0.713607,0.999168,0.832582
600,0.568500,0.525853,0.757452,0.760079,0.956739,0.847145
900,0.568500,0.490038,0.776739,0.809668,0.891847,0.848773
1200,0.482700,0.491,0.781414,0.819444,0.883527,0.85028
1500,0.389700,0.536237,0.787843,0.816127,0.900998,0.856465
1800,0.389700,0.545199,0.802455,0.8375,0.891847,0.86382
2100,0.330100,0.623759,0.800117,0.835938,0.890183,0.862208


TrainOutput(global_step=2100, training_loss=0.43463125864664715, metrics={'train_runtime': 313.2704, 'train_samples_per_second': 174.673, 'train_steps_per_second': 21.834, 'total_flos': 556363074355200.0, 'train_loss': 0.43463125864664715, 'epoch': 2.456140350877193})

In [None]:
trainer.evaluate()

{'eval_loss': 0.49003762006759644,
 'eval_accuracy': 0.7767387492694331,
 'eval_precision': 0.8096676737160121,
 'eval_recall': 0.891846921797005,
 'eval_f1': 0.8487727632620744,
 'eval_runtime': 6.8167,
 'eval_samples_per_second': 251.001,
 'eval_steps_per_second': 31.393,
 'epoch': 2.456140350877193}

In [None]:
trainer.save_model("distilbert-finetuned-binary-classifier")

In [None]:
!zip -r /content/model.zip /content/distilbert-finetuned-binary-classifier

  adding: content/distilbert-finetuned-binary-classifier/ (stored 0%)
  adding: content/distilbert-finetuned-binary-classifier/config.json (deflated 46%)
  adding: content/distilbert-finetuned-binary-classifier/model.safetensors (deflated 8%)
  adding: content/distilbert-finetuned-binary-classifier/training_args.bin (deflated 51%)


In [None]:
text = "give him her sududu."

encoding = tokenizer(text, return_tensors='pt')
encoding.to(trainer.model.device)

outputs = trainer.model(**encoding)

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0742,  0.1760]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
import numpy as np

# Get the index of the maximum probability
pred_index = np.argmax(outputs)

# Assuming the classes are named 'Class 0' and 'Class 1'
pred_class = 'Class 0' if pred_index == 0 else 'Class 1'

print(pred_class)

Class 0


In [None]:
import torch
import numpy as np

def predict_class(text, tokenizer, model):
    # Tokenize the text
    encoding = tokenizer(text, return_tensors='pt')

    # Move the input tensors to the model's device
    encoding.to(model.device)

    # Get the model's prediction
    with torch.no_grad():
        outputs = model(**encoding)

    # Get the index of the maximum probability
    pred_index = np.argmax(outputs.logits.cpu().numpy())

    # Assuming the classes are named 'Class 0' and 'Class 1'
    pred_class = 'Class 0' if pred_index == 0 else 'Class 1'

    return pred_class

In [None]:
# Example usage
text = "Thank you, see you soon"
predicted_class = predict_class(text, tokenizer, trainer.model)
print(predicted_class)

Class 0


In [None]:
!unzip /content/model-78.zip

Archive:  /content/model-78.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of /content/model-78.zip or
        /content/model-78.zip.zip, and cannot find /content/model-78.zip.ZIP, period.


In [None]:
# Load the model
model = DistilBertForSequenceClassification.from_pretrained('/path/to/your/saved/model')

# Make sure to set the model to evaluation mode
model.eval()