In [1]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModel, AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
import torch.optim.lr_scheduler as lr_scheduler
import copy

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [None]:
id2label = {
    0: "MSA",
    1: "MGH",
    2: "EGY",
    3: "LEV",
    4: "IRQ",
    5: "GLF"

}
label2id = {
    "MSA":0,
    "MGH":1,
    "EGY":2,
    "LEV":3,
    "IRQ":4,
    "GLF":5
}


In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    'CAMeL-Lab/bert-base-arabic-camelbert-mix', num_labels=6, id2label=id2label, label2id=label2id
)
second_model = copy.deepcopy((model))
tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-mix")


config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/305k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
##uploading data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Replace with your path
df = pd.read_csv('/content/drive/MyDrive/full_cleaned_data.tsv',sep='\t')
##Dataset: follow the paradigm of the typical pytorch dataset
print(len(df))
grouped_df = df.groupby('split')
dfs = {name: group for name, group in grouped_df}
train_df = dfs['train'].sample(n=100)
dev_df = dfs['dev'].sample(n=32)
test_df = dfs['test'].sample(n=64)

225068


In [None]:
import torch
class ArabicDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, label2id):
        self.encodings = tokenizer(dataframe['text'].values.tolist(),truncation=True, padding=True)
        self.labels = dataframe['dialect'].apply(lambda x: label2id[x]).values.tolist()

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item, idx

    def __len__(self):
        return len(self.labels)

In [None]:
trainset = ArabicDataset(train_df, tokenizer,label2id)
train_loader = DataLoader(trainset, batch_size = 8, shuffle = True)
devset = ArabicDataset(dev_df, tokenizer, label2id)
dev_loader = DataLoader(devset, batch_size = 8, shuffle = True)
testset = ArabicDataset(test_df, tokenizer, label2id)
test_loader = DataLoader(testset, batch_size = 8, shuffle = True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
def train(model,epochs,train_loader,dev_loader,optimizer,lr_scheduler, device):
  model.to(device)
  for epoch in range(epochs):
    model.train()
    print("Start Training:")
    with tqdm(train_loader, unit="batch") as tepoch:
      for batch,index in tepoch:
        optimizer.zero_grad()
        tepoch.set_description(f"Epoch {epoch}")
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids,attention_mask = attention_mask, labels = labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        if lr_scheduler:
          lr_scheduler.step()
        tepoch.set_postfix(loss=loss.item())

    model.eval()
    print("Evaluation:")
    num_right = 0
    num_items = 0
    with tqdm(dev_loader, unit="batch") as depoch:
      for batch,index in depoch:
        depoch.set_description(f"Epoch {epoch}")
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
          output = model(input_ids,attention_mask)
          logits = output.logits
          predictions = torch.argmax(logits, dim = -1)
          correct_num = (predictions == labels).sum()
        num_right += correct_num
        num_items += len(batch['labels'])
      accuracy = num_right / num_items
      print("accuracy= %.3f" %(accuracy))




In [None]:
#Learning configure
optim = Adam(model.parameters(), lr = 1e-5)
scheduler = None
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
train(model, 4, train_loader, dev_loader, optim, scheduler, device)

Start Training:


Epoch 0: 100%|██████████| 80/80 [07:37<00:00,  5.72s/batch, loss=1.05]


Evaluation:


Epoch 0: 100%|██████████| 4/4 [00:04<00:00,  1.07s/batch]


accuracy= 0.625
Start Training:


Epoch 1: 100%|██████████| 80/80 [07:34<00:00,  5.68s/batch, loss=0.619]


Evaluation:


Epoch 1: 100%|██████████| 4/4 [00:04<00:00,  1.08s/batch]


accuracy= 0.625
Start Training:


Epoch 2: 100%|██████████| 80/80 [07:33<00:00,  5.67s/batch, loss=0.865]


Evaluation:


Epoch 2: 100%|██████████| 4/4 [00:04<00:00,  1.12s/batch]


accuracy= 0.750
Start Training:


Epoch 3: 100%|██████████| 80/80 [07:33<00:00,  5.67s/batch, loss=0.545]


Evaluation:


Epoch 3: 100%|██████████| 4/4 [00:03<00:00,  1.06batch/s]

accuracy= 0.719





In [None]:
def get_accuracy(model, loader):
  num_items = 0
  num_correct = 0
  model.eval()
  step = 0
  with torch.no_grad():
    with tqdm(loader, unit="batch") as tepoch:
        for batch,index in tepoch:
          tepoch.set_description(f"Evaluating {step}")
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['labels'].to(device)
          outputs = model(input_ids,attention_mask = attention_mask)
          logits = outputs.logits
          predictions = torch.argmax(logits,dim = -1)
          right = (predictions == labels).sum()
          num = len(input_ids)
          num_items += num
          num_correct += right
          step += 1
    return (num_correct/num_items).item()

print(get_accuracy(model, test_loader))

Evaluating 7: 100%|██████████| 8/8 [00:08<00:00,  1.07s/batch]

0.78125





In [None]:
def generate_upsample_indices(model, dataloader):
  model.eval()
  step = 0
  indices = []
  with torch.no_grad():
    with tqdm(dataloader, unit="batch") as tepoch:
        for batch,index in tepoch:
          tepoch.set_description(f"Evaluating {step}")
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['labels'].to(device)
          outputs = model(input_ids,attention_mask = attention_mask)
          logits = outputs.logits
          predictions = torch.argmax(logits,dim = -1)
          masks = (predictions != labels)
          wrong_indices = index[masks]
          indices+=wrong_indices.tolist()
          step +=1
  return indices

In [None]:
import random
from typing import Iterator, List
import numpy as np
from torch.utils.data import Sampler
class CustomIndicesSampler(Sampler[int]):
    """
    Samples from the specified indices (pass indices - upsampled, downsampled, group balanced etc. to this class)
    Default is no shuffle.
    """
    def __init__(
        self,
        indices: List[int],
        shuffle: bool = False,
    ):
        """
        Samples elements from the specified indices.

        :param indices: The list of indices to sample from.
        :type indices: list[int]
        :param shuffle: Whether to shuffle the indices. Default is False.
        :type shuffle: bool, optional
        """
        self.indices = indices
        self.shuffle = shuffle

    def __iter__(self) -> Iterator[int]:
        """
        Returns an iterator over the sampled indices.

        :return: An iterator over the sampled indices.
        :rtype: iterator[int]
        """
        if self.shuffle:
            random.shuffle(self.indices)
        return iter(self.indices)

    def __len__(self) -> int:
        """
        Returns the number of sampled indices.

        :return: The number of sampled indices.
        :rtype: int
        """
        return len(self.indices)

In [None]:
def create_upsample_dataloader(old_dataset, batch_size, error_indices, E):
  indices = list(range(len(old_dataset))) + E * error_indices
  copy_old = copy.deepcopy(old_dataset)
  loader = DataLoader(copy_old,batch_size, sampler = CustomIndicesSampler(indices,True))
  return loader

In [None]:
error_indices = generate_upsample_indices(model,train_loader)
print(len(error_indices))
upsampled_loader = create_upsample_dataloader(trainset, 8, error_indices, 5)


Evaluating 79: 100%|██████████| 80/80 [02:09<00:00,  1.62s/batch]

20





In [None]:
optim = Adam(second_model.parameters(), lr = 1e-5)
new_model = train(second_model, 4, upsampled_loader, dev_loader, optim, scheduler, device)

Start Training:


Epoch 0: 100%|██████████| 93/93 [08:46<00:00,  5.66s/batch, loss=1.25]


Evaluation:


Epoch 0: 100%|██████████| 4/4 [00:02<00:00,  1.45batch/s]


accuracy= 0.656
Start Training:


Epoch 1: 100%|██████████| 93/93 [08:48<00:00,  5.68s/batch, loss=0.762]


Evaluation:


Epoch 1: 100%|██████████| 4/4 [00:02<00:00,  1.46batch/s]


accuracy= 0.750
Start Training:


Epoch 2: 100%|██████████| 93/93 [08:45<00:00,  5.65s/batch, loss=0.637]


Evaluation:


Epoch 2: 100%|██████████| 4/4 [00:02<00:00,  1.47batch/s]


accuracy= 0.781
Start Training:


Epoch 3: 100%|██████████| 93/93 [08:47<00:00,  5.68s/batch, loss=0.0413]


Evaluation:


Epoch 3: 100%|██████████| 4/4 [00:02<00:00,  1.46batch/s]

accuracy= 0.812





In [None]:
get_accuracy(second_model, test_loader)

Evaluating 7: 100%|██████████| 8/8 [00:14<00:00,  1.87s/batch]


0.71875

In [None]:
!pip install spuco

Collecting spuco
  Downloading spuco-1.0.3-py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.0/101.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting wilds>=2.0.0 (from spuco)
  Downloading wilds-2.0.0-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.2/126.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting ogb>=1.2.6 (from wilds>=2.0.0->spuco)
  Downloading ogb-1.3.6-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting outdated>=0.2.0 (from wilds>=2.0.0->spuco)
  Downloading outdated-0.2.2-py2.py3-none-any.whl (7.5 kB)
Collecting littleutils (from outdated>=0.2.0->wilds>=2.0.0->spuco)
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: littleutils
  Building wheel for littleutils (setup.p

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
spare_dataset = copy.deepcopy(trainset)
#once we copy spare, we need to create a dataloader that loads it in order?
spare_loader = DataLoader(spare_dataset, batch_size = 16, shuffle = False)
model.eval()

Z = None
Labels = []
Indices = []
for batch, index in spare_loader:
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    z = model(input_ids,attention_mask = attention_mask, labels = labels).logits
    if Z is None:
      Z = z.detach().cpu()
    else:
      Z = torch.cat((Z,z.detach().cpu()),dim=0)
    Labels+= labels.detach().cpu().tolist()
    Indices+=index.detach().cpu().tolist()

print(Z, Labels, Indices)


tensor([[ 0.2073, -0.7216,  0.1331,  0.2109,  0.4315,  0.3542],
        [-0.4045, -0.1919,  0.7417,  0.7552,  0.0505,  0.0390],
        [ 0.0232, -0.7482,  0.0595,  0.1530,  0.3871,  0.0059],
        [-0.5104, -0.1562,  0.6733,  0.6559,  0.0520, -0.0344],
        [ 0.2740, -0.5068, -0.1198,  0.4479,  0.5681,  0.3915],
        [-0.5155, -0.2145,  0.5926,  0.7781,  0.2159,  0.1817],
        [-0.4585, -0.1683,  0.7380,  0.6717,  0.0211,  0.0015],
        [-0.4613, -0.1762,  0.8322,  0.7357,  0.1971,  0.1530],
        [-0.5454, -0.2049,  0.8005,  0.6772,  0.1704,  0.1649],
        [ 0.2637, -0.5473, -0.2781,  0.4169,  0.2832,  0.4275],
        [ 0.2626, -0.3206, -0.0763,  0.3569,  0.2668,  0.3624],
        [-0.4774, -0.1567,  0.7663,  0.7220,  0.1294,  0.1262],
        [-0.0163, -0.6214, -0.2035,  0.1390,  0.4236,  0.3345],
        [-0.5408, -0.2981,  0.7450,  0.6613,  0.1502,  0.2161],
        [-0.4010, -0.2105,  0.7040,  0.7012,  0.0964, -0.0044],
        [ 0.1774, -0.2923, -0.1513,  0.2

In [None]:
from spuco.group_inference import SpareInference
from spuco.group_inference.cluster import ClusterAlg
inferer = SpareInference(Z= Z, class_labels = Labels, cluster_alg= ClusterAlg.KMEDOIDS, max_clusters = 20)

TypeError: ignored

In [None]:





inputs = tokenizer(test_df['text'].iloc[0], return_tensors="pt")
print(test_df['text'].iloc[0])
import torch
with torch.no_grad():
    logits = model(**inputs).logits
    print(torch.argmax(logits))
print(test_df['dialect'].iloc[0])