<a href="https://colab.research.google.com/github/Abhitipu/DL-CS60010/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install pytorch-lightning
!pip install datasets
!pip install transformers
!pip install tqdm

In [None]:
from datasets import load_dataset
# Load the "banking77" dataset by name; the returned object is indexed by
# split name ('train' / 'test') further down in the notebook.
dataset = load_dataset("banking77")

Using custom data configuration default
Reusing dataset banking77 (/root/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Pull the two splits out of the loaded dataset in one go
train_data, test_data = dataset['train'], dataset['test']

# Report how many examples each split contains
print(f"Training data size: {len(train_data)}")
print(f"Test data size: {len(test_data)}")

Training data size: 10003
Test data size: 3080


In [None]:
import numpy as np
import torch
from tqdm import tqdm

In [None]:
from transformers import BertTokenizer, BertModel
# Checkpoint identifier of the pretrained BERT weights (base size, uncased vocabulary)
model_name = 'bert-base-uncased'

In [None]:
def gen_random_examples(data, labels, n_classes, n_instances, rng=None):
  '''
    Draws a random, class-balanced subset of a labelled dataset:
    up to n_instances examples for each of the n_classes classes.

    Parameters
      data        : sequence of examples (plain text), indexable by position
      labels      : integer class ids, aligned with data and usable as
                    indices in range(n_classes); any array-like is accepted
      n_classes   : number of classes
      n_instances : number of examples wanted per class
      rng         : optional numpy Generator / RandomState used for the shuffle.
                    When None, numpy's global random state is used, so
                    np.random.seed() controls the result.

    Returns
      (new_data, new_labels): a list with the selected examples and an int32
      numpy array with their labels, in the same order. Both always have the
      same length. That length is n_classes * n_instances unless some class
      has fewer than n_instances examples, in which case the outputs are
      shorter (no padding entries are returned).

    Raises
      AssertionError if more examples are requested than data contains.
  '''
  assert(n_classes * n_instances <= len(data))

  # Fancy indexing / integer lookups below need an ndarray, not a plain list
  labels = np.asarray(labels)

  freq = np.zeros(n_classes, dtype=np.int64)   # examples taken so far, per class
  complete = 0                                 # number of classes already full
  new_data = []
  new_labels = np.zeros(n_classes * n_instances, dtype=np.int32)

  # Randomization: visit the examples in a random order
  if rng is None:
    perm = np.random.permutation(len(labels))
  else:
    perm = rng.permutation(len(labels))

  for i in perm:
    if complete == n_classes:
      break
    label = labels[i]
    if freq[label] < n_instances:
      freq[label] += 1
      complete += (1 if freq[label] == n_instances else 0)
      new_labels[len(new_data)] = label
      new_data.append(data[i])

  # Drop the unused (zero-filled) tail so labels never outnumber the examples
  return (new_data, new_labels[:len(new_data)])
  

In [None]:
class BERT_LM:
  '''
    Thin wrapper around a pretrained BERT tokenizer/model pair that turns
    raw text into fixed-size vectors (the embedding at the first, [CLS],
    token position of the model's first output).
  '''
  def __init__(self, model_name):
    '''
      Loads the tokenizer and the encoder weights of the checkpoint model_name
    '''
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    self.model = BertModel.from_pretrained(model_name)
  
  def encode(self, text):
    '''
      Encodes a single string.
      Returns a 1-D numpy array holding the embedding of the first token.
    '''
    # truncation=True clips over-long inputs to the tokenizer's maximum length
    # instead of feeding the model more positions than it supports
    token_ids = self.tokenizer.encode(text, truncation=True)
    input_ids = torch.tensor(token_ids).unsqueeze(0)  # add the batch dimension
    # Inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
      outputs = self.model(input_ids)
    # using cls_embedding
    last_hidden_states = outputs[0][:,0].detach().numpy().reshape(-1)
    return last_hidden_states

  def get_encodings(self, data):
    '''
      Encodes every string in data, one at a time.
      Returns a numpy array with one row per input text.
    '''
    encoded_data = np.array([self.encode(text) for text in data])
    return encoded_data

In [None]:
# Distance functions
def cosine_similarity(X, Y):
    '''
      Cosine *distance* between two vectors: 1 - cos(angle between X and Y).
      Despite the name, smaller values mean the vectors are more alike
      (0 for the same direction, 1 for orthogonal, 2 for opposite).
    '''
    norm_product = np.linalg.norm(X) * np.linalg.norm(Y)
    # the tiny epsilon keeps the division defined when a vector is all zeros
    similarity = np.dot(X, Y) / (norm_product + 1e-35)
    return 1 - similarity
  
def euclidian_distance(X, Y):
    '''
      Euclidean (L2) distance between X and Y:
      square root of the summed squared element-wise differences.
    '''
    return np.sqrt(np.square(X - Y).sum())

In [None]:
class KNN:
  '''
    Implementation of a modified version of weighted KNN algorithm.
    The class labels are given so we dont need to compute centers.

    The number of neighbours K is fixed to class_instances (the number of
    stored points per class), and every neighbour votes for its own class
    with weight 1 / distance.
  '''
  def __init__(self, n_classes, class_instances, points, labels, distance_function):
    '''
      n_classes         : number of classes; labels must be valid indices in range(n_classes)
      class_instances   : stored points per class, also used as K
      points            : array-like with one row per stored point
      labels            : array-like of integer labels, aligned with points
      distance_function : callable (test_point, point) -> non-negative number

      Raises AssertionError unless there are exactly
      n_classes * class_instances points.
    '''
    # Normalise inputs so plain Python sequences work as well as ndarrays
    points = np.asarray(points)
    labels = np.asarray(labels)
    assert(n_classes * class_instances == points.shape[0])
    self.n_classes = n_classes
    self.K = class_instances
    self.points = points
    self.labels = labels
    self.distance_function = distance_function

  def predict(self, test_point):
    '''
      Predict the class of test_point from the self.K closest stored points:
      returns the index of the class with the largest total inverse-distance weight.
    '''
    distances = np.array([self.distance_function(test_point, point) for point in self.points])

    # positions of the K smallest distances
    nearest = distances.argsort()[:self.K]
    contributions = np.zeros(self.n_classes)

    for label, distance in zip(self.labels[nearest], distances[nearest]):
      # the epsilon avoids a division by zero for an exact match
      contributions[label] += 1 / (distance + 1e-15)
    
    # predict the one with max weight
    return np.argmax(contributions)


In [None]:
# Fix the seed so the random sub-sampling below is repeatable across runs
SEED = 42
np.random.seed(SEED)

# Reuse the checkpoint name declared next to the transformers import
Bert_LM = BERT_LM(model_name)
train_labels = np.array([data['label'] for data in train_data], dtype=np.int32)
test_labels = np.array([data['label'] for data in test_data], dtype=np.int32)

# Extract the text columns once instead of on every iteration of the loops below
train_texts = list(train_data['text'])
test_texts = list(test_data['text'])

N_ITERS = 5
N_CLASSES = 77

# For each distance function and each K (= examples per class), repeat the
# sample -> encode -> classify experiment N_ITERS times and report the mean accuracy
for dist_fn in [cosine_similarity, euclidian_distance]:
  for N_INSTANCES in range(1, 4):
    accuracies = []
    for i in range(N_ITERS):

      # Generate encodings for randomly sampled data from training set 
      init_data, init_labels = gen_random_examples(train_texts, train_labels, N_CLASSES, N_INSTANCES)
      encoded_init_data = Bert_LM.get_encodings(init_data)

      # Generate encodings for test set as well
      new_test_data, new_test_labels = gen_random_examples(test_texts, test_labels, N_CLASSES, N_INSTANCES)
      encoded_test_data = Bert_LM.get_encodings(new_test_data)

      # Initialize the classifier with the sampled training data
      my_knn_classifier = KNN(N_CLASSES, N_INSTANCES, encoded_init_data, init_labels, dist_fn)
      
      # Predict on encoded test data: num counts correct predictions, den all predictions
      num, den = 0, 0
      for encoded_text, label in zip(encoded_test_data, new_test_labels):
        den += 1
        pred = my_knn_classifier.predict(encoded_text)
        if pred == label:
          num += 1

      accuracies.append(num/den)  
    avg_acc = sum(accuracies) / N_ITERS
    print(f"Avg accuracy for {N_INSTANCES} nearest neighbors using {dist_fn.__name__} is {avg_acc}")
    print(f"Accuracy values are: {accuracies}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Avg accuracy for 1 nearest neighbors using cosine_similarity is 0.11688311688311688
Accuracy values are: [0.07792207792207792, 0.12987012987012986, 0.09090909090909091, 0.16883116883116883, 0.11688311688311688]
Avg accuracy for 2 nearest neighbors using cosine_similarity is 0.15974025974025977
Accuracy values are: [0.17532467532467533, 0.07792207792207792, 0.12987012987012986, 0.2012987012987013, 0.21428571428571427]
Avg accuracy for 3 nearest neighbors using cosine_similarity is 0.20086580086580086
Accuracy values are: [0.24675324675324675, 0.18181818181818182, 0.16017316017316016, 0.22077922077922077, 0.19480519480519481]
Avg accuracy for 1 nearest neighbors using euclidian_distance is 0.13766233766233765
Accuracy values are: [0.09090909090909091, 0.15584415584415584, 0.15584415584415584, 0.16883116883116883, 0.11688311688311688]
Avg accuracy for 2 nearest neighbors using euclidian_distance is 0.1883116883116883
Accuracy values are: [0.14285714285714285, 0.2077922077922078, 0.2207792