In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/xcvdsfgfdg/preprocessed_augmented_dataset.csv
/kaggle/input/subtask1triantestdata/train.csv
/kaggle/input/subtask1triantestdata/test.csv


In [2]:
df=pd.read_csv('/kaggle/input/xcvdsfgfdg/preprocessed_augmented_dataset.csv')

In [3]:
!pip install pandas emoji nltk




In [4]:
import pandas as pd
import re
import emoji
import nltk

# Optional: if using any NLTK tools
nltk.download('wordnet')

# Slang dictionary (English only; for Arabic/Urdu you may add equivalents if needed)
slang_dict = {
    'thnx': 'thanks',
    'plz': 'please',
    'u': 'you',
    'ur': 'your',
    'b4': 'before',
    'gr8': 'great',
    'l8r': 'later',
    'bcoz': 'because',
    'omg': 'oh my god',
    'btw': 'by the way',
    # Add more if needed
}

def decode_slang(text):
    words = text.split()
    return ' '.join([slang_dict.get(word, word) for word in words])

# Optional: normalize Arabic (remove diacritics, unify characters)
def normalize_arabic(text):
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)  # remove tashkeel
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ة', 'ه', text)
    return text

def preprocess_text(text):
    # 1. Remove URLs, mentions, and HTML tags
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"@\w+", "", text)

    # 2. Remove punctuation (keep Arabic letters and emojis)
    text = re.sub(r"[^\w\s\u0600-\u06FF]", "", text)

    # 3. Lowercase (only for non-Arabic)
    text = text.lower()

    # 4. Remove digits
    text = re.sub(r"\d+", "", text)

    # 5. Emoji to text
    text = emoji.demojize(text, delimiters=(" ", " "))

    # 6. Slang decoding
    text = decode_slang(text)

    # 7. Normalize Arabic
    text = normalize_arabic(text)

    return text.strip()

# Apply to the DataFrame
df['text'] = df['text'].apply(preprocess_text)

# 8. Remove duplicates and short tweets (<20 characters)
df = df.drop_duplicates(subset='text')
df = df[df['text'].str.len() >= 20]

print(df.head())


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


     id                                               text           label
0  8167                          لانني احب الاشياء الراقيه  not_applicable
1  1532                     اثق قدرتي علي التعامل الصعوبات            hope
2  4710  وروضه طل الغيث ينسجها حتي نسجت اضحي يدبجها تنف...  not_applicable
3  6084                    اشعر بقلبي العنيد يحارب الله بع  not_applicable
4  8968  بتحسسني اني مرتضي ده بيدي محتاج صدقه الشارع فا...  not_applicable


In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Load and prepare data
def prepare_data(csv_path):
    df = pd.read_csv(csv_path)
    # df = df[:50]  # Use only 50 samples for testing
    
    print(f"Dataset shape: {df.shape}")
    print(f"Label distribution:\n{df['label'].value_counts()}")
    
    # Check if we have enough samples for stratification
    label_counts = df['label'].value_counts()
    if label_counts.min() < 2:
        print("Warning: Some labels have less than 2 samples. Cannot stratify.")
        # Encode labels
        label_encoder = LabelEncoder()
        df['encoded_labels'] = label_encoder.fit_transform(df['label'])
        
        # Split without stratification
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            df['text'].tolist(), 
            df['encoded_labels'].tolist(), 
            test_size=0.2, 
            random_state=42
        )
    else:
        # Encode labels
        label_encoder = LabelEncoder()
        df['encoded_labels'] = label_encoder.fit_transform(df['label'])
        
        # Split data with stratification
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            df['text'].tolist(), 
            df['encoded_labels'].tolist(), 
            test_size=0.2, 
            random_state=42,
            stratify=df['encoded_labels']
        )
    
    print(f"Train samples: {len(train_texts)}, Val samples: {len(val_texts)}")
    return train_texts, val_texts, train_labels, val_labels, label_encoder

# Dataset class
class ArabicTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Compute metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = (predictions == labels).mean()
    return {'accuracy': accuracy}

# Main training function
def finetune_model(csv_path, model_name="aubmindlab/bert-base-arabertv02-twitter"):
    print("Starting model finetuning...")
    print(f"Using AraBERT v02 Twitter model: {model_name}")
    
    # Prepare data
    train_texts, val_texts, train_labels, val_labels, label_encoder = prepare_data(csv_path)
    
    print(f"Loading model: {model_name}")
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # AraBERT tokenizer should have pad_token, but let's ensure it's set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else tokenizer.unk_token
    
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, 
        num_labels=len(label_encoder.classes_)
    )
    
    print(f"Model loaded with {len(label_encoder.classes_)} output labels")
    print(f"Label classes: {list(label_encoder.classes_)}")
    
    print("Creating datasets...")
    # Create datasets - using 256 max_length which is good for Twitter-like content
    train_dataset = ArabicTextDataset(train_texts, train_labels, tokenizer, max_length=256)
    val_dataset = ArabicTextDataset(val_texts, val_labels, tokenizer, max_length=256)
    
    # Training arguments - optimized for AraBERT
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=32,  # Increased epochs for better performance
        per_device_train_batch_size=8,  # Slightly larger batch size
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        learning_rate=2e-5,  # Optimal learning rate for BERT models
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="none",
        dataloader_num_workers=0,
        remove_unused_columns=True,
        save_total_limit=2,  # Keep only best 2 checkpoints
        fp16=torch.cuda.is_available(),  # Use mixed precision if CUDA available
    )
    
    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    
    print("Starting training...")
    # Train the model
    trainer.train()
    
    print("Training completed! Saving model...")
    # Save the model
    model.save_pretrained('./finetuned_arabertv02_twitter')
    tokenizer.save_pretrained('./finetuned_arabertv02_twitter')
    
    # Save label encoder
    import pickle
    with open('./finetuned_arabertv02_twitter/label_encoder.pkl', 'wb') as f:
        pickle.dump(label_encoder, f)
    
    print("Model training completed and saved to ./finetuned_arabertv02_twitter/")
    
    return model, tokenizer, label_encoder

# Prediction function
def predict_text(text, model_path='./finetuned_arabertv02_twitter'):
    import pickle
    
    print(f"Loading model from: {model_path}")
    
    # Load model, tokenizer, and label encoder
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    
    with open(f'{model_path}/label_encoder.pkl', 'rb') as f:
        label_encoder = pickle.load(f)
    
    # Set model to evaluation mode
    model.eval()
    
    # Tokenize input text
    inputs = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='pt'
    )
    
    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class_id = predictions.argmax().item()
    
    # Decode prediction
    predicted_label = label_encoder.inverse_transform([predicted_class_id])[0]
    confidence = predictions[0][predicted_class_id].item()
    
    return predicted_label, confidence

# Batch prediction function for multiple texts
def predict_batch(texts, model_path='./finetuned_arabertv02_twitter', batch_size=8):
    import pickle
    
    # Load model, tokenizer, and label encoder
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    
    with open(f'{model_path}/label_encoder.pkl', 'rb') as f:
        label_encoder = pickle.load(f)
    
    model.eval()
    
    predictions = []
    confidences = []
    
    # Process in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        
        # Tokenize batch
        inputs = tokenizer(
            batch_texts,
            truncation=True,
            padding='max_length',
            max_length=256,
            return_tensors='pt'
        )
        
        # Make predictions
        with torch.no_grad():
            outputs = model(**inputs)
            batch_predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class_ids = batch_predictions.argmax(dim=-1).tolist()
            batch_confidences = batch_predictions.max(dim=-1)[0].tolist()
        
        # Decode predictions
        batch_labels = label_encoder.inverse_transform(predicted_class_ids)
        predictions.extend(batch_labels)
        confidences.extend(batch_confidences)
    
    return predictions, confidences

# Usage example:
if __name__ == "__main__":
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Finetune the model
    model, tokenizer, label_encoder = finetune_model('/kaggle/input/xcvdsfgfdg/preprocessed_augmented_dataset.csv')
    
    # Example predictions with more Arabic texts
    test_texts = [
        "أشعر بالأمل والتفاؤل والسعادة",
        "أكره هذا الشيء بشدة", 
        "الطقس جميل اليوم والجو رائع",
        "هذا منتج ممتاز أنصح به",
        "خدمة سيئة جداً ولا أنصح بها",
        "رأي محايد حول هذا الموضوع"
    ]
    
    print("\nMaking individual predictions...")
    for text in test_texts:
        prediction, confidence = predict_text(text)
        print(f"Text: {text}")
        print(f"Prediction: {prediction} (Confidence: {confidence:.4f})")
        print("-" * 50)
    
    print("\nMaking batch predictions...")
    batch_predictions, batch_confidences = predict_batch(test_texts)
    for text, pred, conf in zip(test_texts, batch_predictions, batch_confidences):
        print(f"Text: {text}")
        print(f"Prediction: {pred} (Confidence: {conf:.4f})")
        print("-" * 50)

2025-07-29 17:33:59.657388: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753810439.886676      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753810439.952461      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
Starting model finetuning...
Using AraBERT v02 Twitter model: aubmindlab/bert-base-arabertv02-twitter
Dataset shape: (11074, 3)
Label distribution:
label
hate              3697
not_applicable    3695
hope              3682
Name: count, dtype: int64
Train samples: 8859, Val samples: 2215
Loading model: aubmindlab/bert-base-arabertv02-twitter


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded with 3 output labels
Label classes: ['hate', 'hope', 'not_applicable']
Creating datasets...
Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.678,0.60612,0.735892
2,0.5336,0.620965,0.736795
3,0.3385,0.86147,0.731828
4,0.0798,1.32588,0.737246
5,0.1206,1.559854,0.734537
6,0.1212,1.810394,0.729571
7,0.347,1.785149,0.727314
8,0.1761,1.881962,0.734537
9,0.1129,1.792301,0.741761
10,0.1468,2.073692,0.731828


Training completed! Saving model...
Model training completed and saved to ./finetuned_arabertv02_twitter/

Making individual predictions...
Loading model from: ./finetuned_arabertv02_twitter
Text: أشعر بالأمل والتفاؤل والسعادة
Prediction: hope (Confidence: 0.9997)
--------------------------------------------------
Loading model from: ./finetuned_arabertv02_twitter
Text: أكره هذا الشيء بشدة
Prediction: not_applicable (Confidence: 0.9996)
--------------------------------------------------
Loading model from: ./finetuned_arabertv02_twitter
Text: الطقس جميل اليوم والجو رائع
Prediction: hate (Confidence: 0.9991)
--------------------------------------------------
Loading model from: ./finetuned_arabertv02_twitter
Text: هذا منتج ممتاز أنصح به
Prediction: hope (Confidence: 0.8872)
--------------------------------------------------
Loading model from: ./finetuned_arabertv02_twitter
Text: خدمة سيئة جداً ولا أنصح بها
Prediction: not_applicable (Confidence: 0.9663)
--------------------------------

In [6]:
df=pd.read_csv('/kaggle/input/subtask1triantestdata/test.csv')

In [7]:
# import pandas as pd
# import re
# import emoji
# import nltk

# # Optional: if using any NLTK tools
# nltk.download('wordnet')

# # Slang dictionary (English only; for Arabic/Urdu you may add equivalents if needed)
# slang_dict = {
#     'thnx': 'thanks',
#     'plz': 'please',
#     'u': 'you',
#     'ur': 'your',
#     'b4': 'before',
#     'gr8': 'great',
#     'l8r': 'later',
#     'bcoz': 'because',
#     'omg': 'oh my god',
#     'btw': 'by the way',
#     # Add more if needed
# }

# def decode_slang(text):
#     words = text.split()
#     return ' '.join([slang_dict.get(word, word) for word in words])

# # Optional: normalize Arabic (remove diacritics, unify characters)
# def normalize_arabic(text):
#     text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)  # remove tashkeel
#     text = re.sub(r'[إأآا]', 'ا', text)
#     text = re.sub(r'ى', 'ي', text)
#     text = re.sub(r'ؤ', 'و', text)
#     text = re.sub(r'ئ', 'ي', text)
#     text = re.sub(r'ة', 'ه', text)
#     return text

# def preprocess_text(text):
#     # 1. Remove URLs, mentions, and HTML tags
#     text = re.sub(r"http\S+|www\S+|https\S+", "", text)
#     text = re.sub(r"<.*?>", "", text)
#     text = re.sub(r"@\w+", "", text)

#     # 2. Remove punctuation (keep Arabic letters and emojis)
#     text = re.sub(r"[^\w\s\u0600-\u06FF]", "", text)

#     # 3. Lowercase (only for non-Arabic)
#     text = text.lower()

#     # 4. Remove digits
#     text = re.sub(r"\d+", "", text)

#     # 5. Emoji to text
#     text = emoji.demojize(text, delimiters=(" ", " "))

#     # 6. Slang decoding
#     text = decode_slang(text)

#     # 7. Normalize Arabic
#     text = normalize_arabic(text)

#     return text.strip()

# # Apply to the DataFrame
# df['text'] = df['text'].apply(preprocess_text)

In [8]:

# df=df[:10]
ans=[]
for i in range(len(df)):
    text= df['text'][i]
    predicted,ss=predict_text(text)
    # print(predicted)
    ans.append({
        'id':df['id'][i],
        'prediction':predicted
    })
dd=pd.DataFrame(ans)
dd.to_csv('prediction.csv',index=False)    

Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned_arabertv02_twitter
Loading model from: ./finetuned

In [9]:
dd

Unnamed: 0,id,prediction
0,5813,not_applicable
1,5853,not_applicable
2,251,hate
3,7213,hope
4,6848,not_applicable
...,...,...
1472,1989,not_applicable
1473,4720,not_applicable
1474,761,hate
1475,5403,hope
