In [1]:
!pip install transformers datasets torch

Defaulting to user installation because normal site-packages is not writeable


    tensorflow (>=2requests>=2)
               ~~~~~^


In [2]:
import pandas as pd

df = pd.read_parquet("hf://datasets/hadyelsahar/ar_res_reviews/data/train-00000-of-00001.parquet")

In [3]:
df.columns

Index(['polarity', 'text', 'restaurant_id', 'user_id'], dtype='object')

In [4]:
df.head()

Unnamed: 0,polarity,text,restaurant_id,user_id
0,0,اولا: المنيو تغير الشورما اصبحت اعتياديه بأختف...,296,423
1,0,من محلات الشاورما ذات الشعبيه لتميز الصلصات ال...,296,423
2,1,دجاج طازج يحضر امامك على الطلب لا يقوم باعدة ا...,5027,39580
3,1,فكما تعرف أستراليا بالكنغر والكوالا. فإنها تعر...,642,444
4,0,إسمحو لي أن أقيم مطعم هاشم بصفتي فلسطيني عشت ف...,434,2191


In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabert")
model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabert", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from sklearn.model_selection import train_test_split
X = df['text'].tolist()  
y = df['polarity'].tolist()  
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
def tokenize_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128)
train_encodings = tokenize_function(X_train)
val_encodings = tokenize_function(X_val)


In [8]:
import torch

class MakeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MakeDataset(train_encodings, y_train)
val_dataset = MakeDataset(val_encodings, y_val)


In [9]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,

)
trainer.train()







  0%|          | 0/1257 [00:00<?, ?it/s]

{'loss': 1.2493, 'grad_norm': 6.369697570800781, 'learning_rate': 0.0019840891010342084, 'epoch': 0.02}
{'loss': 0.6561, 'grad_norm': 12.315391540527344, 'learning_rate': 0.001968178202068417, 'epoch': 0.05}
{'loss': 0.7995, 'grad_norm': 7.834841251373291, 'learning_rate': 0.0019522673031026252, 'epoch': 0.07}
{'loss': 0.5985, 'grad_norm': 5.721120357513428, 'learning_rate': 0.0019363564041368338, 'epoch': 0.1}
{'loss': 0.8302, 'grad_norm': 4.700566291809082, 'learning_rate': 0.0019204455051710422, 'epoch': 0.12}
{'loss': 0.7801, 'grad_norm': 11.724882125854492, 'learning_rate': 0.0019045346062052508, 'epoch': 0.14}
{'loss': 0.6797, 'grad_norm': 10.362717628479004, 'learning_rate': 0.001888623707239459, 'epoch': 0.17}
{'loss': 0.655, 'grad_norm': 7.186372756958008, 'learning_rate': 0.0018727128082736675, 'epoch': 0.19}
{'loss': 0.5975, 'grad_norm': 1.5870336294174194, 'learning_rate': 0.001856801909307876, 'epoch': 0.21}
{'loss': 0.699, 'grad_norm': 12.85105037689209, 'learning_rate': 

  0%|          | 0/105 [00:00<?, ?it/s]

{'eval_loss': 0.6109545230865479, 'eval_runtime': 191.1556, 'eval_samples_per_second': 8.752, 'eval_steps_per_second': 0.549, 'epoch': 1.0}
{'loss': 0.6293, 'grad_norm': 6.0464653968811035, 'learning_rate': 0.0013317422434367542, 'epoch': 1.0}
{'loss': 0.6748, 'grad_norm': 11.502659797668457, 'learning_rate': 0.0013158313444709626, 'epoch': 1.03}
{'loss': 0.7037, 'grad_norm': 20.035144805908203, 'learning_rate': 0.001299920445505171, 'epoch': 1.05}
{'loss': 0.7578, 'grad_norm': 4.402806282043457, 'learning_rate': 0.0012840095465393795, 'epoch': 1.07}
{'loss': 0.7892, 'grad_norm': 5.4321441650390625, 'learning_rate': 0.001268098647573588, 'epoch': 1.1}
{'loss': 0.576, 'grad_norm': 1.8205862045288086, 'learning_rate': 0.0012521877486077963, 'epoch': 1.12}
{'loss': 0.6237, 'grad_norm': 8.388557434082031, 'learning_rate': 0.0012362768496420047, 'epoch': 1.15}
{'loss': 0.6071, 'grad_norm': 1.6107631921768188, 'learning_rate': 0.0012203659506762133, 'epoch': 1.17}
{'loss': 0.6552, 'grad_norm

KeyboardInterrupt: 

In [None]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.6122041344642639, 'eval_runtime': 10.7094, 'eval_samples_per_second': 156.218, 'eval_steps_per_second': 9.804, 'epoch': 3.0}


In [None]:
def predict_sentiment(text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).item()

    return "positive" if prediction == 1 else "negative"
while True:
    text = input("Enter a text (or type 'exit' to stop): ")

    if text.lower() in ['exit', 'quit']:
        print("Exiting the prediction loop. Goodbye!")
        break  
    prediction = predict_sentiment(text)
    print(f'The text "{text}" is: {prediction}')


Enter a text (or type 'exit' to stop): الشاورما مررره زاكيه
The text "الشاورما مررره زاكيه" is: positive
Enter a text (or type 'exit' to stop): الشاورما ابو كلب
The text "الشاورما ابو كلب" is: positive
Enter a text (or type 'exit' to stop): الشاورما بنت كلب
The text "الشاورما بنت كلب" is: positive
Enter a text (or type 'exit' to stop): الشاورما سيئة
The text "الشاورما سيئة" is: positive
Enter a text (or type 'exit' to stop): سيئة
The text "سيئة" is: positive
Enter a text (or type 'exit' to stop):  المنيو تغير الشورما اصبحت اعتياديه
The text " المنيو تغير الشورما اصبحت اعتياديه" is: positive
Enter a text (or type 'exit' to stop): exit
Exiting the prediction loop. Goodbye!
