In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!pip install transformers

In [None]:
# !cp /content/drive/MyDrive/home/datasets/train_processed.csv ./
# !cp /content/drive/MyDrive/home/datasets/dev_processed.csv ./
# !cp /content/drive/MyDrive/home/datasets/few_shot_processed.csv ./

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [None]:
# Pretrained encoder used purely as a frozen feature extractor.
base_model_name = 'moha/arabert_c19'
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# requires_grad_(False) switches off gradient tracking for every encoder
# parameter in one call, so only the classifier head is ever trained.
base_model = AutoModel.from_pretrained(base_model_name).to(device).requires_grad_(False)

In [None]:
# Load the three pre-processed splits from the working directory.
train_data, valid_data, few_shot_data = map(
    pd.read_csv,
    ('train_processed.csv', 'dev_processed.csv', 'few_shot_processed.csv'),
)


In [None]:
# Distinct categories in order of first appearance in the training split;
# each one is mapped to its position, which serves as the integer class id.
labels_list = train_data['category'].unique()
labels = dict(zip(labels_list, range(len(labels_list))))
print(labels)

In [None]:
class Dataset(torch.utils.data.Dataset):
  """In-memory dataset of pre-computed encoder embeddings and integer labels.

  The constructor relies on the notebook-level globals `tokenizer`,
  `base_model`, `device` and `labels`.
  """

  def __init__(self, df):
    """Embed every row of `df` once so training never re-runs the encoder.

    Args:
      df: frame with a 'text' column (model inputs) and a 'category' column
        whose values must all be keys of the global `labels` mapping
        (a missing key raises KeyError).
    """
    embeddings = []
    # Pure inference: no_grad guarantees no autograd graph is recorded even if
    # the encoder's parameters have not been frozen elsewhere.
    with torch.no_grad():
      for text in df['text']:
        # Tokenize and embed one text at a time so the padded 512-token
        # encodings of the whole split are never held in memory together.
        encoded = tokenizer(text,
                            padding='max_length',
                            max_length=512,
                            truncation=True,
                            return_tensors="pt")
        # Element 1 of the tuple output is used as the sentence vector
        # (presumably the pooled output for BERT-style encoders - confirm).
        sentence_vector = base_model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
            return_dict=False)[1]
        embeddings.append(sentence_vector.detach().cpu())
    self.texts = embeddings
    # Map category names to their integer class ids.
    self.labels = torch.tensor([labels[label] for label in df['category']],
                               dtype=torch.long)

  def classes(self):
    """Return the tensor of integer class ids for all samples."""
    return self.labels

  def __len__(self):
    """Return the number of samples."""
    return len(self.labels)

  def get_batch_labels(self, idx):
    """Return the class id(s) stored at `idx`."""
    return self.labels[idx]

  def get_batch_texts(self, idx):
    """Return the cached embedding stored at `idx`."""
    return self.texts[idx]

  def __getitem__(self, idx):
    """Return the `(embedding, class_id)` pair for sample `idx`."""
    return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [None]:
# Embed each split once up front (train, then dev, then few-shot); the
# resulting datasets serve cached embeddings for the rest of the notebook.
train_dataset, valid_dataset, few_shot_dataset = [
    Dataset(split) for split in (train_data, valid_data, few_shot_data)
]

In [None]:
class ContextualClassifier(nn.Module):
  """Two-layer feed-forward head that classifies pre-computed embeddings.

  Args:
    layer_1: size of the incoming embedding vector.
    layer_2: width of the hidden layer.
    output: number of classes, i.e. the size of the returned logit vector.
  """

  def __init__(self, layer_1=768, layer_2=384, output=10):
    super().__init__()
    self.linear_1 = nn.Linear(layer_1, layer_2)
    self.linear_2 = nn.Linear(layer_2, output)
    self.relu = nn.ReLU()

  def forward(self, contextualized_embeddings):
    """Return unnormalised class logits for the given embeddings.

    Args:
      contextualized_embeddings: float tensor whose last dimension equals
        `layer_1`.

    Returns:
      Tensor with the same leading dimensions and a last dimension of size
      `output`.
    """
    hidden = self.relu(self.linear_1(contextualized_embeddings))
    # No activation on the final layer: nn.CrossEntropyLoss expects raw
    # logits. A ReLU here would clamp every negative logit to zero and block
    # its gradient, so the model could never push a class score below zero.
    return self.linear_2(hidden)


## Prepare the model

In [None]:
# Size the output layer from the label mapping so every category found in the
# training data gets its own logit; a fixed size smaller than the number of
# categories would make CrossEntropyLoss fail on out-of-range targets.
model = ContextualClassifier(output=len(labels)).to(device)
# NOTE: a checkpoint can only be restored if it was trained with the same
# number of output classes.
# model.load_state_dict(torch.load('/content/drive/MyDrive/home/models/ContextualClassifier.pt'))

## Train the model

In [None]:
TRAIN_BATCH_SIZE = 64
EPOCHS = 10
LR = 5e-5

# Few-shot run: the classifier is fitted on few_shot_dataset.
# For training on the full set, build the loader from train_dataset instead.
train_loader = torch.utils.data.DataLoader(few_shot_dataset,
                                           batch_size=TRAIN_BATCH_SIZE,
                                           shuffle=True)  #Few Shot
# The whole dev split is scored after every epoch, so its order is irrelevant.
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=1024,
                                           shuffle=False)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LR)
criterion.to(device)

for epoch_num in tqdm(range(EPOCHS)):
  model.train()
  total_acc_train = 0
  total_loss_train = 0
  seen_samples = 0
  seen_batches = 0
  for train_input, train_label in train_loader:
    optimizer.zero_grad()

    train_input = train_input.to(device)
    train_label = train_label.to(device)
    # squeeze(1) drops the size-1 axis at dim 1 (if present) so the logits are
    # laid out as (batch, classes) for the loss and the argmax below.
    output = model(train_input).squeeze(1)
    batch_loss = criterion(output, train_label)

    batch_loss.backward()
    optimizer.step()

    total_loss_train += batch_loss.item()
    total_acc_train += (output.argmax(dim=1) == train_label).sum().item()
    seen_samples += train_label.size(0)
    seen_batches += 1

  # Epoch evaluation on the full dev split.
  model.eval()
  dev_true, dev_pred = [], []
  with torch.no_grad():
    for valid_input, valid_label in valid_loader:
      dev_logits = model(valid_input.to(device)).squeeze(1)
      dev_pred.extend(dev_logits.argmax(dim=1).tolist())
      dev_true.extend(valid_label.tolist())
  accuracy = accuracy_score(dev_true, dev_pred)
  f1 = f1_score(dev_true, dev_pred, average='macro')

  # criterion returns a per-batch mean, so the loss is averaged over batches,
  # while accuracy is averaged over individual samples. max(..., 1) keeps an
  # empty loader from dividing by zero. tqdm.write keeps the bar intact.
  tqdm.write(f'Total Epoch: {epoch_num + 1}'
             f' | Train Loss: {total_loss_train / max(seen_batches, 1): .3f}'
             f' | Train Accuracy: {total_acc_train / max(seen_samples, 1): .3f}'
             f' | Dev Accuracy: {accuracy: .3f}'
             f' | Dev F1: {f1: .3f}')

In [None]:
# Final evaluation of the trained classifier on the whole dev split.
output_labels = []
# shuffle=False keeps predictions aligned with valid_dataset.classes().
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=1024,
                                           shuffle=False)

model.eval()
with torch.no_grad():
  # Only the inputs are needed here; gold labels are read from the dataset.
  for valid_input, _ in tqdm(valid_loader):
    output = model(valid_input.to(device)).cpu()
    output = output.squeeze(1)
    output_labels.extend(output.argmax(dim=1).tolist())

print()
gold_labels = valid_dataset.classes().tolist()
F1 = f1_score(gold_labels, output_labels, average='macro')
Report = classification_report(gold_labels,
                               output_labels,
                               zero_division=0)
print(f'Macro F1: {F1:.3f}')
print(Report)


In [None]:
# Save to Google Drive when it is mounted; otherwise fall back to the working
# directory so the trained weights are not lost to a missing-directory error.
drive_model_dir = Path('/content/drive/MyDrive/home/models')
if drive_model_dir.is_dir():
  model_path = drive_model_dir / 'ContextualClassifier.pt'
else:
  model_path = Path('ContextualClassifier.pt')
torch.save(model.state_dict(), model_path)
print(f'Saved classifier weights to {model_path}')