In [1]:
!pip install transformers 
!pip install datasets

In [2]:
# Install the gdown command-line tool (non-interactive, -y); the next cell
# calls it to download the dataset file.
! conda install -y gdown

In [3]:
# get the cleaned CSV from my drive
# gdown fetches the file with the given ID into the working directory;
# presumably it is saved as `cleaned.csv`, the name read below — verify.
# NOTE(review): newer gdown releases deprecate the `--id` flag — confirm
# against the installed version.
!gdown --id 1N-MFwrXg5Um6-nNq419GD1u5lUMI-PrA

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, AutoTokenizer
from datasets import load_metric
import numpy as np

In [5]:
# Load the cleaned corpus; the CSV's first column becomes the DataFrame index.
df = pd.read_csv(
    './cleaned.csv',
    index_col=0,
)

In [6]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [7]:
# Pull the text column (named '0' in the CSV) out as a plain Python list.
text_column = df['0']
X = text_column.tolist()

In [8]:
# One-hot encode the dialect labels: one indicator column per distinct
# value of df['dialect'], exported as a NumPy array.
dialect_one_hot = pd.get_dummies(df['dialect'])
Y = dialect_one_hot.values

In [9]:
# Work with a stratified 75% subset (x_train_test / y_train_test) because of
# resource and time limits; the remaining 25% ends up in x_small / y_small.
x_small, x_train_test, y_small, y_train_test = train_test_split(
    X,
    Y,
    test_size=.75,
    stratify=Y,
    random_state=42,
)

In [10]:
# Split the retained subset 90% / 10% into train and test, stratified on the labels.
x_train, x_test, y_train, y_test = train_test_split(
    x_train_test,
    y_train_test,
    test_size=.1,
    stratify=y_train_test,
    random_state=42,
)

In [11]:
# Load the tokenizer published under the same checkpoint name that the
# classification model is later loaded from.
checkpoint = 'aubmindlab/bert-base-arabertv02-twitter'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [12]:
# Pad/truncate to the longest encoded text, but never beyond what the model can
# embed: BERT-base checkpoints are assumed to provide 512 position embeddings
# (TODO confirm against model.config.max_position_embeddings). Without the cap a
# single over-long text would make every padded sequence too long for the model.
MAX_POSITIONS = 512
longest_encoding = max(len(tokenizer.encode(sentence)) for sentence in df['0'])
max_len = min(longest_encoding, MAX_POSITIONS)

In [13]:
# Tokenize both splits with identical settings: truncate to max_len and pad
# every sequence up to max_len.
encode_kwargs = dict(truncation=True, padding='max_length', max_length=max_len)
train_encodings = tokenizer(x_train, **encode_kwargs)
test_encodings = tokenizer(x_test, **encode_kwargs)

In [14]:
# create dataset from encoded text and labels
import torch

class DilacetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample.
        labels: Indexable collection with one label entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict contains every encoding field plus the sample's label under
        the key ``'labels'``.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap each split's encodings together with its one-hot labels cast to float64.
# NOTE(review): floating-point one-hot labels presumably make
# BertForSequenceClassification choose its multi-label (BCE) loss, whereas
# integer class ids would select cross-entropy — confirm this is intended.
train_labels = y_train.astype(np.float64)
test_labels = y_test.astype(np.float64)
train_dataset = DilacetDataset(train_encodings, train_labels)
test_dataset = DilacetDataset(test_encodings, test_labels)

In [15]:
# Drop notebook-level names that are no longer needed. Only objects with no
# remaining references can be reclaimed; anything the datasets still hold
# stays alive.
del train_encodings, test_encodings, x_train, y_train

In [16]:
# Size the classification head from the one-hot label matrix (one column per
# distinct dialect) instead of hard-coding the class count, so the head always
# matches the labels the datasets provide.
model = BertForSequenceClassification.from_pretrained(
    'aubmindlab/bert-base-arabertv02-twitter',
    num_labels=Y.shape[1],
)

In [17]:
# calculate accuracy at every epoch
# Load the "accuracy" metric object through `datasets.load_metric`.
# NOTE(review): `load_metric` is deprecated in later `datasets` releases —
# confirm the installed version still provides it.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along the last axis. ``labels`` is either encoded with one
            score per class along the last axis (e.g. one-hot) or a 1-D
            sequence of integer class ids.

    Returns:
        dict: ``{"accuracy": float}`` - the fraction of rows whose arg-max
        prediction equals the reference class.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)
    predictions = np.argmax(logits, axis=-1)
    # One-hot rows are reduced to class ids; already-integer ids pass through.
    references = np.argmax(labels, axis=-1) if labels.ndim > 1 else labels
    # Computed directly with NumPy so the function is self-contained and does
    # not rely on a separately loaded module-level metric object.
    return {"accuracy": float(np.mean(predictions == references))}

In [18]:
# kaggle thing to get rid of a message
import os

# Set the WANDB_DISABLED flag in this process's environment before the
# Trainer is created in a later cell.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration. Evaluation and checkpointing both run once per epoch;
# `load_best_model_at_end` needs the two schedules to line up.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 optimisation steps
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    # `save_steps` was dropped: it only applies when save_strategy="steps",
    # so it had no effect alongside per-epoch saving.
    save_strategy="epoch",
    save_total_limit=15,             # keep at most 15 checkpoints on disk
    load_best_model_at_end=True,     # reload the best checkpoint after training
)


# Bundle everything the Trainer needs. Note that test_dataset doubles as the
# per-epoch evaluation set.
trainer_kwargs = dict(
    model=model,                      # the instantiated Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=test_dataset,        # evaluation dataset
    compute_metrics=compute_metrics,  # evaluate accuracy
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; as the cell's last expression the returned result is displayed.
trainer.train()

In [None]:
# Persist the trained model to ./trial7 so the API can load it.
trainer.save_model(output_dir='./trial7')

In [None]:
# Evaluate on the test split; the returned metrics are shown as the cell output.
trainer.evaluate(eval_dataset=test_dataset)

In [None]:
# Keep only the raw predictions (the first field of the prediction result).
prediction_result = trainer.predict(test_dataset)
output = prediction_result.predictions

In [None]:
from sklearn.metrics import classification_report

# Reduce the one-hot ground truth and the predicted scores to class ids, then
# print the per-class report.
true_ids = y_test.argmax(axis=1)
predicted_ids = output.argmax(axis=1)
cm = classification_report(true_ids, predicted_ids)
print(cm)

In [None]:
# Completion marker printed once this final cell is reached.
print("tamaaaaam")