In [1]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModel, AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
import torch.optim.lr_scheduler as lr_scheduler
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dialect label vocabulary. The position of each code in the list is its
# integer class id; `label2id` is the exact inverse of `id2label`.
id2label = dict(enumerate(["MSA", "MGH", "EGY", "LEV", "IRQ", "GLF"]))
label2id = {label: class_id for class_id, label in id2label.items()}


In [3]:
# Single checkpoint name shared by the classifier and its tokenizer.
checkpoint = 'CAMeL-Lab/bert-base-arabic-camelbert-mix'

# Six-way sequence classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=6,
    id2label=id2label,
    label2id=label2id,
)
# Independent copy taken before any fine-tuning, so the later upsampled run
# starts from the same initial weights as the first run.
second_model = copy.deepcopy(model)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassific

In [4]:
# Replace with your own path to the tab-separated data file.
df = pd.read_csv('./full_cleaned_data.tsv', sep='\t')
print(len(df))

# Partition the rows by the value of the 'split' column; the three partitions
# are later wrapped in PyTorch-style datasets.
grouped_df = df.groupby('split')
dfs = dict(iter(grouped_df))

# For quick experiments, append e.g. `.sample(n=100)` to any of the frames below.
train_df, dev_df, test_df = (dfs[split_name] for split_name in ('train', 'dev', 'test'))

225068


In [5]:
import torch
class ArabicDataset(torch.utils.data.Dataset):
    """Map-style dataset of tokenized texts with integer dialect labels.

    The whole ``text`` column is tokenized once at construction time (with
    truncation and padding enabled), and the ``dialect`` column is converted
    to integer ids through ``label2id``. An unknown dialect raises ``KeyError``.

    Indexing returns a ``(features, idx)`` pair: ``features`` is a dict of
    tensors (one per tokenizer output field plus ``'labels'``) and ``idx`` is
    the position that was requested, so callers can trace a sample back to
    its row in the dataset.
    """

    def __init__(self, dataframe, tokenizer, label2id):
        texts = dataframe['text'].tolist()
        self.encodings = tokenizer(texts, truncation=True, padding=True)
        self.labels = [label2id[dialect] for dialect in dataframe['dialect']]

    def __getitem__(self, idx):
        features = {}
        for field, column in self.encodings.items():
            features[field] = torch.tensor(column[idx])
        features['labels'] = torch.tensor(self.labels[idx])
        # The position travels with the sample so it survives shuffling.
        return features, idx

    def __len__(self):
        return len(self.labels)

In [9]:
# Tokenize each split once and wrap it in a DataLoader.
# Only the training loader is shuffled: accuracy does not depend on batch
# order, so the dev/test loaders iterate in dataset order to keep evaluation
# runs reproducible.
trainset = ArabicDataset(train_df, tokenizer, label2id)
train_loader = DataLoader(trainset, batch_size=64, shuffle=True)
devset = ArabicDataset(dev_df, tokenizer, label2id)
dev_loader = DataLoader(devset, batch_size=64, shuffle=False)
testset = ArabicDataset(test_df, tokenizer, label2id)
test_loader = DataLoader(testset, batch_size=64, shuffle=False)

In [10]:
def train(model, epochs, train_loader, dev_loader, optimizer, lr_scheduler, device):
  """Fine-tune ``model`` and print dev-set accuracy after every epoch.

  Both loaders must yield ``(batch, index)`` pairs where ``batch`` is a dict
  holding ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors; the
  index part is ignored here.

  Args:
    model: Classifier whose output exposes the loss at position 0 (when
      labels are supplied) and the scores as ``.logits``.
    epochs: Number of passes over ``train_loader``.
    train_loader: Iterable of training batches.
    dev_loader: Iterable of validation batches.
    optimizer: Optimizer already bound to ``model``'s parameters.
    lr_scheduler: Optional scheduler; when truthy it is stepped once after
      every optimizer step (i.e. per batch, not per epoch).
    device: Device the model and every batch are moved to.

  Returns:
    The same ``model`` instance, trained in place, so that callers can write
    ``trained = train(...)``.
  """
  model.to(device)
  for epoch in range(epochs):
    model.train()
    print("Start Training:")
    with tqdm(train_loader, unit="batch") as tepoch:
      for batch, _ in tepoch:
        optimizer.zero_grad()
        tepoch.set_description(f"Epoch {epoch}")
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        if lr_scheduler:
          lr_scheduler.step()
        tepoch.set_postfix(loss=loss.item())

    model.eval()
    print("Evaluation:")
    num_right = 0
    num_items = 0
    with tqdm(dev_loader, unit="batch") as depoch:
      for batch, _ in depoch:
        depoch.set_description(f"Epoch {epoch}")
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
          logits = model(input_ids, attention_mask=attention_mask).logits
          predictions = torch.argmax(logits, dim=-1)
        # .item() keeps the running count a plain Python int instead of a
        # tensor that lives on the training device.
        num_right += (predictions == labels).sum().item()
        num_items += len(batch['labels'])
      # An empty dev loader reports NaN instead of raising ZeroDivisionError.
      accuracy = num_right / num_items if num_items else float('nan')
      print("accuracy= %.3f" % (accuracy))

  return model




In [11]:
# Learning configuration for the first run: Adam with a fixed learning rate
# and no LR scheduler, on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
optim = Adam(model.parameters(), lr=1e-5)
scheduler = None
train(model, 4, train_loader, dev_loader, optim, scheduler, device)

cuda
Start Training:


Epoch 0:  69%|█████████████████████████████████████▍                | 2190/3162 [14:59<06:39,  2.43batch/s, loss=0.408]


KeyboardInterrupt: 

In [12]:
def get_accuracy(model, loader):
  """Return the fraction of examples in ``loader`` that ``model`` classifies correctly.

  ``loader`` must yield ``(batch, index)`` pairs where ``batch`` holds
  ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors. Batches are
  moved to the device of the model's own parameters, so the function does not
  depend on a notebook-level ``device`` variable.

  Returns:
    Accuracy as a Python float in [0, 1], or NaN when ``loader`` yields no
    examples.
  """
  # Follow the model's device (CPU for a parameter-free model).
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  num_items = 0
  num_correct = 0
  model.eval()
  with torch.no_grad():
    with tqdm(loader, unit="batch") as tepoch:
      for step, (batch, _) in enumerate(tepoch):
        tepoch.set_description(f"Evaluating {step}")
        input_ids = batch['input_ids'].to(model_device)
        attention_mask = batch['attention_mask'].to(model_device)
        labels = batch['labels'].to(model_device)
        logits = model(input_ids, attention_mask=attention_mask).logits
        predictions = torch.argmax(logits, dim=-1)
        num_correct += (predictions == labels).sum().item()
        num_items += len(input_ids)
  if num_items == 0:
    return float('nan')
  return num_correct / num_items

# Accuracy of `model` on the test split.
print(get_accuracy(model, test_loader))

Evaluating 128: 100%|█████████████████████████████████████████████████████████████| 129/129 [00:09<00:00, 12.93batch/s]


0.7994911670684814


In [13]:
def generate_upsample_indices(model, dataloader):
  """Collect the dataset indices of every example ``model`` misclassifies.

  ``dataloader`` must yield ``(batch, index)`` pairs: ``batch`` holds
  ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors and ``index``
  is a tensor with the dataset position of each example in the batch. Because
  the positions travel with the batch, the loader may be shuffled.

  Batches are moved to the device of the model's own parameters, so the
  function does not depend on a notebook-level ``device`` variable.

  Returns:
    A list of int positions, in the order the errors were encountered.
  """
  # Follow the model's device (CPU for a parameter-free model).
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  model.eval()
  indices = []
  with torch.no_grad():
    with tqdm(dataloader, unit="batch") as tepoch:
      for step, (batch, index) in enumerate(tepoch):
        tepoch.set_description(f"Evaluating {step}")
        input_ids = batch['input_ids'].to(model_device)
        attention_mask = batch['attention_mask'].to(model_device)
        labels = batch['labels'].to(model_device)
        logits = model(input_ids, attention_mask=attention_mask).logits
        predictions = torch.argmax(logits, dim=-1)
        # `index` is used as it arrives from the loader, so the boolean mask
        # is brought back to the CPU before it is used for selection.
        wrong = (predictions != labels).cpu()
        indices.extend(index[wrong].tolist())
  return indices

In [14]:
import random
from typing import Iterator, List
import numpy as np
from torch.utils.data import Sampler
class CustomIndicesSampler(Sampler[int]):
    """
    Samples from the specified indices (pass indices - upsampled, downsampled, group balanced etc. to this class)
    Default is no shuffle.

    The list handed to the constructor is never reordered: when shuffling is
    enabled, each iteration shuffles a private copy.
    """
    def __init__(
        self,
        indices: List[int],
        shuffle: bool = False,
    ):
        """
        Samples elements from the specified indices.

        :param indices: The list of indices to sample from. Repeated entries
            are allowed and are each yielded.
        :type indices: list[int]
        :param shuffle: Whether to shuffle the indices. Default is False.
        :type shuffle: bool, optional
        """
        self.indices = indices
        self.shuffle = shuffle

    def __iter__(self) -> Iterator[int]:
        """
        Returns an iterator over the sampled indices.

        With ``shuffle=True`` a fresh random permutation (drawn from the
        ``random`` module) is produced on every call; otherwise the indices
        are yielded in their stored order.

        :return: An iterator over the sampled indices.
        :rtype: iterator[int]
        """
        if not self.shuffle:
            return iter(self.indices)
        # Shuffle a copy: random.shuffle works in place and would otherwise
        # silently reorder the list object owned by the caller.
        order = list(self.indices)
        random.shuffle(order)
        return iter(order)

    def __len__(self) -> int:
        """
        Returns the number of sampled indices.

        :return: The number of sampled indices.
        :rtype: int
        """
        return len(self.indices)

In [15]:
def create_upsample_dataloader(old_dataset, batch_size, error_indices, E):
  """Build a shuffled loader in which the given examples are over-represented.

  Every position of ``old_dataset`` is sampled once, and each entry of
  ``error_indices`` is sampled ``E`` additional times. The loader iterates
  over a deep copy of ``old_dataset``, not over the object passed in.
  """
  every_position = list(range(len(old_dataset)))
  repeated_errors = error_indices * E
  sampler = CustomIndicesSampler(every_position + repeated_errors, True)
  dataset_copy = copy.deepcopy(old_dataset)
  return DataLoader(dataset_copy, batch_size, sampler=sampler)

In [16]:
# Find the training examples `model` gets wrong, then build a loader in which
# each of them is sampled five additional times.
error_indices = generate_upsample_indices(model, train_loader)
print(len(error_indices))
upsampled_loader = create_upsample_dataloader(
    trainset, batch_size=8, error_indices=error_indices, E=5
)


Evaluating 208:   7%|███▉                                                        | 208/3162 [00:31<07:25,  6.63batch/s]


KeyboardInterrupt: 

In [None]:
# Fresh optimizer bound to the parameters of the untouched model copy.
optim = Adam(second_model.parameters(), lr=1e-5)
# train() updates the model in place. Bind `new_model` to that model explicitly
# instead of relying on train()'s return value, so it can never end up as None.
train(second_model, 4, upsampled_loader, dev_loader, optim, scheduler, device)
new_model = second_model

In [None]:
# Test-split accuracy of the model trained on the upsampled loader
# (displayed as the cell's output value).
get_accuracy(second_model, test_loader)

In [None]:
!pip install spuco

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
spare_dataset = copy.deepcopy(trainset)
# Iterate in dataset order (shuffle=False), so Indices comes out as 0..N-1 and
# row i of Z belongs to example Indices[i].
spare_loader = DataLoader(spare_dataset, batch_size = 16, shuffle = False)
model.to(device)
model.eval()

logit_chunks = []
Labels = []
Indices = []
with torch.no_grad():
  for batch, index in spare_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # Only the logits are needed, so labels are not passed to the model
    # (passing them would also compute a loss that is never used).
    logits = model(input_ids, attention_mask = attention_mask).logits
    logit_chunks.append(logits.cpu())
    Labels += batch['labels'].tolist()
    Indices += index.tolist()

# Concatenate once at the end: growing Z with torch.cat inside the loop would
# re-copy every previously collected row on each batch.
Z = torch.cat(logit_chunks, dim=0) if logit_chunks else None

# Print a summary only; the full lists have one entry per training example.
print(None if Z is None else tuple(Z.shape), len(Labels), len(Indices))


In [None]:
from spuco.group_inference import SpareInference
from spuco.group_inference.cluster import ClusterAlg
# Set up SPARE group inference over the collected logits and class labels,
# using K-medoids clustering with at most 20 clusters.
inferer = SpareInference(
    Z=Z,
    class_labels=Labels,
    cluster_alg=ClusterAlg.KMEDOIDS,
    max_clusters=20,
)

In [None]:





import torch

# Sanity check on the first test sentence: print the text, the predicted class
# id, and the gold dialect.
sample_text = test_df['text'].iloc[0]
model.eval()
# Truncate exactly as ArabicDataset does, so an over-long sentence cannot
# exceed the model's maximum input length.
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True)
# The tokenizer returns CPU tensors, while train() may have moved the model to
# the GPU; move the inputs to wherever the model's parameters live.
model_device = next(model.parameters()).device
inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}
print(sample_text)
with torch.no_grad():
    logits = model(**inputs).logits
    print(torch.argmax(logits))
print(test_df['dialect'].iloc[0])