In [1]:
import pandas as pd
import torch
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from flask import Flask, request, jsonify
import re

In [2]:
def clean_text(text):
    """Normalise raw review text into lowercase, space-separated words.

    The input is coerced with ``str()`` and lowercased. Then, in order, URLs,
    ``@mentions`` and ``#`` characters, remaining punctuation, and digit runs
    are removed. Finally every whitespace run collapses to a single space and
    the result is stripped.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    # (pattern, replacement, flags) triples, applied strictly in this order.
    substitutions = (
        (r"http\S+|www\S+|https\S+", '', re.MULTILINE),
        (r"@\w+|#", '', 0),
        (r"[^\w\s]", '', 0),
        (r"\d+", '', 0),
        (r"\s+", ' ', 0),
    )

    cleaned = str(text).lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

In [3]:
import pandas as pd

def load_data(file_path, lang):
    """Load a review CSV and return a frame with ``text`` and ``label`` columns.

    Args:
        file_path: Path or buffer accepted by ``pandas.read_csv``.
        lang: ``'ar'`` selects the ``review_description`` / ``rating`` columns;
            any other value selects ``Summary`` / ``Sentiment``.

    Returns:
        A DataFrame with two columns: ``text`` (passed through ``clean_text``)
        and ``label``. Rows whose text or label is missing are dropped, as are
        rows whose string label is not positive/negative/neutral. Whole-number
        labels are returned with an integer dtype.

    Raises:
        KeyError: If an expected column is absent from the CSV.
    """
    df = pd.read_csv(file_path)

    if lang == 'ar':
        text_col, label_col = 'review_description', 'rating'
    else:
        text_col, label_col = 'Summary', 'Sentiment'

    # Fail early with a message that names the missing column(s).
    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path!r} is missing expected column(s): {missing}")

    label_mapping = {'positive': 1, 'negative': 0, 'neutral': 2}
    labels = df[label_col]
    if not pd.api.types.is_numeric_dtype(labels):
        # Normalise case and surrounding whitespace so 'Positive' or
        # ' negative ' are mapped instead of silently becoming NaN.
        labels = labels.str.strip().str.lower().map(label_mapping)

    frame = pd.DataFrame({'text': df[text_col], 'label': labels}).dropna()
    # assign() builds a new frame, so no SettingWithCopyWarning is raised.
    frame = frame.assign(text=frame['text'].apply(clean_text))

    # NaNs force the label column to float64 and dropna() leaves it that way;
    # restore an integer dtype when every remaining value is a whole number.
    label_values = frame['label']
    if pd.api.types.is_float_dtype(label_values) and (label_values % 1 == 0).all():
        frame = frame.assign(label=label_values.astype('int64'))

    return frame


In [4]:
def tokenize_function(examples, tokenizer):
    """Tokenize the ``'text'`` entries of a batch.

    Args:
        examples: Mapping with a ``'text'`` key holding the texts to encode.
        tokenizer: Callable invoked as ``tokenizer(texts, padding=..., truncation=...)``.

    Returns:
        Whatever ``tokenizer`` returns when called with
        ``padding='max_length'`` and ``truncation=True``.
    """
    tokenizer_options = {'padding': 'max_length', 'truncation': True}
    batch_texts = examples['text']
    return tokenizer(batch_texts, **tokenizer_options)

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        ``{'accuracy': value}``, where the predicted class is the arg-max over
        the last axis of ``logits`` and ``value`` comes from ``accuracy_score``.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = torch.tensor(raw_scores).argmax(dim=-1)
    score = accuracy_score(gold_labels, predicted_classes)
    return {'accuracy': score}

In [5]:
def fine_tune_model(data_file, model_name, lang, output_dir):
    """Fine-tune a 3-class sequence classifier on a review CSV and evaluate it.

    Args:
        data_file: CSV path passed to ``load_data``.
        model_name: Model id or path given to ``AutoTokenizer`` and
            ``AutoModelForSequenceClassification``.
        lang: Language key forwarded to ``load_data``.
        output_dir: Directory that receives checkpoints, the final model and
            the tokenizer.

    Returns:
        The metrics dict returned by ``trainer.evaluate()`` on the held-out
        20% validation split.

    Raises:
        Exception: Any error raised while loading data, tokenizing or training
            propagates to the caller; nothing is saved after a failed run.
    """
    # Load once: reading and cleaning the CSV is costly on large files.
    df = load_data(data_file, lang)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'], df['label'], test_size=0.2, random_state=42
    )

    train_dataset = Dataset.from_dict({'text': train_texts.tolist(), 'label': train_labels.tolist()})
    val_dataset = Dataset.from_dict({'text': val_texts.tolist(), 'label': val_labels.tolist()})

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_batch(batch):
        # Bind the tokenizer so Dataset.map only has to supply the batch.
        return tokenize_function(batch, tokenizer)

    train_dataset = train_dataset.map(tokenize_batch, batched=True)
    val_dataset = val_dataset.map(tokenize_batch, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_total_limit=2
    )

    # NOTE(review): newer transformers releases appear to deprecate the
    # `tokenizer=` argument in favour of `processing_class=` — confirm against
    # the installed version before upgrading.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Let training failures propagate. Catching them here would go on to save
    # and evaluate a model that never finished training.
    trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return trainer.evaluate()

In [6]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/pdatasets1/Final_Data.csv
/kaggle/input/pdatasets1/Dataset-SA.csv


In [7]:
# Fine-tuning models
import wandb

# Never embed the W&B API key in the notebook: it leaks to anyone who can read
# the file or its history. Read it from the WANDB_API_KEY environment variable
# instead; if that is unset, key=None leaves wandb to use its own credential
# lookup. The key that was previously hardcoded on this line is exposed and
# should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
arabic_results = fine_tune_model('/kaggle/input/pdatasets1/Final_Data.csv', 'aubmindlab/bert-base-arabertv02', 'ar', '/kaggle/working/arabic_model')
english_results = fine_tune_model('/kaggle/input/pdatasets1/Dataset-SA.csv', 'bert-base-uncased', 'en', '/kaggle/working/english_model')

print("Arabic Model Accuracy:", arabic_results['eval_accuracy'])
print("English Model Accuracy:", english_results['eval_accuracy'])

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhabibaahmad2255[0m ([33mhabibaahmad2255-fayoum-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/32036 [00:00<?, ? examples/s]

Map:   0%|          | 0/8009 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4423,0.419993,0.861656


NoneType: None


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/164032 [00:00<?, ? examples/s]

Map:   0%|          | 0/41009 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1124,0.16486,0.953205


NoneType: None


Arabic Model Accuracy: 0.8616556374079161
English Model Accuracy: 0.953205393937916


In [8]:
# Recursively zip each saved model directory into /kaggle/working. Because of
# -r this also archives any checkpoint-* subfolders stored inside them.
!zip -r /kaggle/working/arabic_model.zip /kaggle/working/arabic_model
!zip -r /kaggle/working/english_model.zip /kaggle/working/english_model

  adding: kaggle/working/arabic_model/ (stored 0%)
  adding: kaggle/working/arabic_model/checkpoint-4005/ (stored 0%)
  adding: kaggle/working/arabic_model/checkpoint-4005/tokenizer.json (deflated 74%)
  adding: kaggle/working/arabic_model/checkpoint-4005/config.json (deflated 51%)
  adding: kaggle/working/arabic_model/checkpoint-4005/scheduler.pt (deflated 56%)
  adding: kaggle/working/arabic_model/checkpoint-4005/model.safetensors (deflated 7%)
  adding: kaggle/working/arabic_model/checkpoint-4005/tokenizer_config.json (deflated 78%)
  adding: kaggle/working/arabic_model/checkpoint-4005/optimizer.pt (deflated 33%)
  adding: kaggle/working/arabic_model/checkpoint-4005/special_tokens_map.json (deflated 80%)
  adding: kaggle/working/arabic_model/checkpoint-4005/rng_state.pth (deflated 25%)
  adding: kaggle/working/arabic_model/checkpoint-4005/trainer_state.json (deflated 82%)
  adding: kaggle/working/arabic_model/checkpoint-4005/training_args.bin (deflated 52%)
  adding: kaggle/working/

In [None]:
# Flask application object; the /predict route defined below is registered on it.
app = Flask(__name__)

def load_pipeline(model_path):
    """Build a ``text-classification`` pipeline from a saved model directory.

    Args:
        model_path: Location used for both the model and the tokenizer.

    Returns:
        The object returned by ``pipeline(...)``.
    """
    artefacts = dict(model=model_path, tokenizer=model_path)
    return pipeline("text-classification", **artefacts)

# Load both fine-tuned classifiers once, at module level, so every /predict
# request reuses the same pipeline objects.
ARABIC_MODEL_DIR = "/kaggle/working/arabic_model"
ENGLISH_MODEL_DIR = "/kaggle/working/english_model"

arabic_classifier = load_pipeline(ARABIC_MODEL_DIR)
english_classifier = load_pipeline(ENGLISH_MODEL_DIR)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the sentiment of posted text.

    Expects a JSON object body with:
        text: A non-blank string, or a non-empty list of non-blank strings.
        lang: Optional. ``'ar'`` selects the Arabic model; any other value
            (default ``'en'``) selects the English model.

    Returns:
        The pipeline output serialised as JSON. Responds with HTTP 400 and
        ``{"error": ...}`` when the body is not a JSON object or ``text`` is
        missing, blank, or of the wrong type.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    text = data.get("text", "")
    lang = data.get("lang", "en")

    texts = [text] if isinstance(text, str) else text
    if (
        not isinstance(texts, list)
        or not texts
        or not all(isinstance(item, str) and item.strip() for item in texts)
    ):
        return jsonify({"error": "Field 'text' must be a non-empty string or list of strings."}), 400

    classifier = arabic_classifier if lang == 'ar' else english_classifier

    # Apply the same normalisation the training data went through, and truncate
    # so inputs longer than the model's maximum sequence length do not fail.
    cleaned = [clean_text(item) for item in texts]
    result = classifier(cleaned, truncation=True)

    return jsonify(result)

# Start Flask's built-in server on all interfaces (0.0.0.0), port 5000.
# In this notebook __name__ is '__main__', so the call runs here and keeps the
# cell busy until the server is stopped.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


Device set to use cuda:0
Device set to use cuda:0


 * Serving Flask app '__main__'
 * Debug mode: off


In [None]:
import os
# List the entries currently present in /kaggle/working.
working_dir_entries = os.listdir("/kaggle/working")
print(working_dir_entries)


In [None]:
# Print the GPU status report from the nvidia-smi command-line tool.
!nvidia-smi
