In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import os

In [None]:
!pip install -U transformers




In [None]:
import transformers
# Record the installed transformers version in the notebook output.
print(transformers.__version__)


4.51.3


In [None]:
# 1. Set up constants
# Hugging Face Hub checkpoint name; the tokenizer cell and train_model() both read it.
MODEL_NAME_ = "GroNLP/mdebertav3-subjectivity-multilingual"
# Alternative checkpoint (disabled):
# MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"
# NOTE: the four values below are reassigned in the "3. Set hyperparameters" cell
# further down; in a top-to-bottom run the later values are the ones used.
MAX_LENGTH = 128
BATCH_SIZE = 64
EPOCHS = 3
LEARNING_RATE = 3e-5

In [None]:
from google.colab import drive
import os
import pandas as pd

# Step 1: Mount Google Drive at /content/drive (Colab only). The data and saved-model
# paths used by later cells all live under /content/drive/MyDrive.
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Restore a previously saved checkpoint (model weights + tokenizer) from Google Drive.
# The same steps are repeated in the "Code for Loading the trained model" section.
save_dir = '/content/drive/MyDrive/saved_model'
model = AutoModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

print("✅ Model and tokenizer successfully loaded from Drive")


In [None]:
import os
import pandas as pd
import shutil

def _language_folder_for(filename, language_map):
    """Return the name of the per-language folder that `filename` is stored under.

    Recognised patterns, checked in this order:
      * 'train_<code>.tsv' / 'dev_<code>.tsv' (also 'dev_test_<code>.tsv')
      * '<language>_train_augmented.tsv'
      * 'test_<code>_unlabeled.tsv'
    Anything else falls back to the filename without its extension.
    """
    if filename.startswith('train_') or filename.startswith('dev_'):
        # Token right after the prefix, exactly as before; used when no token is a
        # known language code.
        fallback_code = filename.split('_')[1].split('.')[0]
        # Prefer the first token that is a known language code, so that
        # 'dev_test_ar.tsv' resolves to 'ar' rather than 'test'.
        stem_tokens = filename.split('.')[0].split('_')
        lang_code = next((tok for tok in stem_tokens[1:] if tok in language_map), fallback_code)
        return language_map.get(lang_code, lang_code)
    if '_train_augmented' in filename:
        # Augmented files already start with the folder name, e.g. 'arabic_train_augmented.tsv'
        return filename.split('_')[0]
    if '_unlabeled' in filename:
        # e.g. 'test_ar_unlabeled.tsv' -> 'ar'
        lang_code = filename.split('_')[1]
        return language_map.get(lang_code, lang_code)
    # Unknown format: use the filename (without extension) as the folder name
    return filename.split('.')[0]


def load_data(file_list, source_dir='/content/drive/MyDrive/data', target_dir='/content'):
    """
    Load TSV files from Google Drive and save them to a local directory in Colab.
    Each language has its own folder in the data directory.

    For every filename the language folder is derived from the name, the file is read
    from `<source_dir>/<language_folder>/<filename>` and re-written (tab-separated,
    without the index) to `<target_dir>/data/<language_folder>/<filename>`.
    Missing files and read/write errors are reported with a printed message and
    do not stop the loop.

    Args:
        file_list (list): List of TSV filenames to load (e.g., 'train_ar.tsv')
        source_dir (str): Base source directory in Google Drive
        target_dir (str): Target directory in Colab workspace

    Returns:
        dict: Dictionary of pandas DataFrames with filename as key. Only files that
        were read successfully appear in it.
    """
    # Create target directory if it doesn't exist
    data_dir = os.path.join(target_dir, 'data')
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
        print(f"Created directory: {data_dir}")

    # Dictionary to store dataframes
    dataframes = {}

    # Language mapping from filename codes to folder names
    language_map = {
        'ar': 'arabic',
        'bg': 'bulgarian',
        'de': 'german',
        'en': 'english',
        'it': 'italian'
    }

    # Process each file in the list
    for filename in file_list:
        language_folder = _language_folder_for(filename, language_map)

        # Create language folder in target directory if it doesn't exist
        lang_target_dir = os.path.join(data_dir, language_folder)
        if not os.path.exists(lang_target_dir):
            os.makedirs(lang_target_dir)
            print(f"Created directory: {lang_target_dir}")

        # Source and destination both use the language-specific folder
        source_path = os.path.join(source_dir, language_folder, filename)
        target_path = os.path.join(lang_target_dir, filename)

        # Check if source file exists
        if os.path.exists(source_path):
            try:
                # Read the TSV file
                df = pd.read_csv(source_path, sep='\t')
                dataframes[filename] = df

                # Save to target location
                df.to_csv(target_path, sep='\t', index=False)
                print(f"✅ Successfully loaded and copied {filename} from {language_folder} folder")

            except Exception as e:
                # Best effort: report the failure and continue with the next file
                print(f"❌ Error processing {filename} from {language_folder} folder: {e}")
        else:
            print(f"❌ Source file not found: {source_path}")

    return dataframes




In [None]:
def merge_augmented_data(train_dataframes, data_dir='/content/data'):
    """
    Merge augmented training data with original training data and update the original dictionary.

    For each (train file, augmented file) pair that is present in `train_dataframes`,
    the augmented rows are appended after the original rows, the dictionary entry for
    the train file is replaced by the combined frame, and the combined frame is written
    to `<data_dir>/<language folder>/<train file>` as a TSV without the index.

    Note: the function is not idempotent. Calling it a second time on the same
    dictionary appends the augmented rows again, because the train entry already
    holds the combined frame.

    Args:
        train_dataframes (dict): Dictionary of training dataframes (modifies in place)
        data_dir (str): Directory where files should be saved

    Returns:
        None
    """
    augmentation_map = {
        'train_ar.tsv': 'arabic_train_augmented.tsv',
        'train_bg.tsv': 'bulgarian_train_augmented.tsv',
        # Add other language pairs as needed
    }

    language_map = {
        'train_ar.tsv': 'arabic',
        'train_bg.tsv': 'bulgarian',
        'train_de.tsv': 'german',
        'train_en.tsv': 'english',
        'train_it.tsv': 'italian'
    }

    for train_file, augmented_file in augmentation_map.items():
        # Skip pairs where either side was not loaded
        if train_file not in train_dataframes or augmented_file not in train_dataframes:
            continue

        print(f"Merging {augmented_file} into {train_file}...")

        combined_df = pd.concat(
            [train_dataframes[train_file], train_dataframes[augmented_file]],
            ignore_index=True
        )

        # ✅ Overwrite the entry in the dictionary
        train_dataframes[train_file] = combined_df

        # Save the new combined file; make sure the language folder exists so the
        # write cannot fail on a missing directory.
        language_folder = language_map.get(train_file, train_file.split('_')[1].split('.')[0])
        output_dir = os.path.join(data_dir, language_folder)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, train_file)
        combined_df.to_csv(output_path, sep='\t', index=False)

        print(f"✅ Successfully merged and saved {train_file}")
        print(f"   📏 Length after merge: {len(combined_df)}")


In [None]:


# File names per split, one entry per language (ar, bg, de, en, it)
train_files = ['train_ar.tsv', 'train_bg.tsv', 'train_de.tsv', 'train_en.tsv', 'train_it.tsv']
validation_files = ['dev_ar.tsv', 'dev_bg.tsv', 'dev_de.tsv', 'dev_en.tsv', 'dev_it.tsv']

# Augmented training sets (only listed for Arabic and Bulgarian)
augmented_files = ['arabic_train_augmented.tsv', 'bulgarian_train_augmented.tsv']

# Copy every listed file from Drive into the Colab workspace and keep the parsed frames
all_files = [*train_files, *validation_files, *augmented_files]
data_folder = '/content/drive/MyDrive/data'
all_dataframes = load_data(all_files, source_dir=data_folder)

# Append the augmented rows to the matching train_* frames (updates all_dataframes in place)
merge_augmented_data(all_dataframes)

# Split the loaded frames by role; files that failed to load are simply absent
train_dataframes = {name: frame for name, frame in all_dataframes.items() if name in train_files}
val_dataframes = {name: frame for name, frame in all_dataframes.items() if name in validation_files}

print("✅ All data processing complete!")

Created directory: /content/data
Created directory: /content/data/arabic
✅ Successfully loaded and copied train_ar.tsv from arabic folder
Created directory: /content/data/bulgarian
✅ Successfully loaded and copied train_bg.tsv from bulgarian folder
Created directory: /content/data/german
✅ Successfully loaded and copied train_de.tsv from german folder
Created directory: /content/data/english
✅ Successfully loaded and copied train_en.tsv from english folder
Created directory: /content/data/italian
✅ Successfully loaded and copied train_it.tsv from italian folder
✅ Successfully loaded and copied dev_ar.tsv from arabic folder
✅ Successfully loaded and copied dev_bg.tsv from bulgarian folder
✅ Successfully loaded and copied dev_de.tsv from german folder
✅ Successfully loaded and copied dev_en.tsv from english folder
✅ Successfully loaded and copied dev_it.tsv from italian folder
✅ Successfully loaded and copied arabic_train_augmented.tsv from arabic folder
✅ Successfully loaded and copied 

In [None]:
# Preview every training frame that was loaded. A separate name is used so that the
# `train_files` list of file names is not replaced by a dict of DataFrames; later
# cells still pass `train_files` to load_data_local as a list of names.
train_frames_preview = {filename: all_dataframes[filename] for filename in train_files if filename in all_dataframes}
for fname, df in train_frames_preview.items():
    print(f"{fname}: {df.shape}")
    print(df.head(2))

train_ar.tsv: (3721, 3)
          sentence_id                                           sentence label
0  AlMasryAlYoum_10_1  عندما تولى الرئيس عبد الفتاح السيسى السلطة فى ...   OBJ
1  MAH_62-curl_01_005  في هذا السياق يشرح عيراني أنّ الخوف لا يقتصر ف...   OBJ
train_bg.tsv: (1962, 3)
                            sentence_id  \
0  b678f74b-3981-4ad9-93b3-4c549605a02c   
1  ea65624a-da34-4bc4-8085-edb93d2e30e1   

                                            sentence label  
0  Учителите, за които цяла България разбра, са С...   OBJ  
1               А ако намерите каска е още по-добре.  SUBJ  
train_de.tsv: (800, 3)
                                sentence_id  \
0  4f9c8bcd60318b0d1257f35ebc7c4ede9f7930e1   
1  0531b165e42997e8eecbb84d1e774c728041db8c   

                                            sentence label  
0  Die Ausbreitung des Virus sei beschränkter als...   OBJ  
1    Zwar sei ein Anstieg der Zahlen zu verzeichnen.   OBJ  
train_en.tsv: (830, 4)
                            sen

# Save the data from Drive to Colab for easier access


In [None]:
def load_data_local(file_names, base_dir='/content/data'):
    """
    Load data from multiple TSV files in language-specific folders and combine them.

    Each file is read from `<base_dir>/<language folder>/<file name>`. A
    'solved_conflict' column is dropped when present, and a 'language' column holding
    the language tag is added before the frames are concatenated in the order the
    file names were given. Missing files and read errors are reported with a printed
    message and skipped.

    Args:
        file_names (list): List of TSV filenames (e.g., ['train_ar.tsv', 'test_ar_unlabeled.tsv'])
        base_dir (str): Base directory where language folders are located

    Returns:
        pd.DataFrame: Combined dataframe with all data and language tags
        (an empty DataFrame when nothing could be loaded)
    """
    # Language mapping from filename codes to folder names
    language_map = {
        'ar': 'arabic',
        'bg': 'bulgarian',
        'de': 'german',
        'en': 'english',
        'it': 'italian'
    }
    # Reverse lookup (folder name -> code), built once instead of once per file
    code_by_folder = {folder: code for code, folder in language_map.items()}

    dfs = []
    for file_name in file_names:
        # Determine language folder and tag based on known filename patterns
        if '_train_augmented' in file_name:
            lang_code = file_name.split('_')[0]  # e.g., 'arabic'
            language_folder = lang_code
            lang_tag = code_by_folder.get(lang_code, lang_code)
        elif '_unlabeled' in file_name:
            lang_code = file_name.split('_')[1]  # e.g., 'ar' in 'test_ar_unlabeled.tsv'
            language_folder = language_map.get(lang_code, lang_code)
            lang_tag = lang_code
        elif '_' in file_name:
            # Token after the first underscore, as before; used when no token is a
            # known language code.
            fallback_code = file_name.split('_')[1].split('.')[0]
            # Prefer the first token that is a known language code so that
            # 'dev_test_ar.tsv' is tagged 'ar' rather than 'test'.
            stem_tokens = file_name.split('.')[0].split('_')
            lang_code = next((tok for tok in stem_tokens[1:] if tok in language_map), fallback_code)
            language_folder = language_map.get(lang_code, lang_code)
            lang_tag = lang_code
        else:
            language_folder = file_name.split('.')[0]
            lang_tag = language_folder

        # Construct full file path
        file_path = os.path.join(base_dir, language_folder, file_name)

        if os.path.exists(file_path):
            try:
                df = pd.read_csv(file_path, sep='\t', header=0)

                # Drop 'solved_conflict' column if it exists
                if 'solved_conflict' in df.columns:
                    df = df.drop(columns=['solved_conflict'])

                # Add language tag
                df['language'] = lang_tag

                dfs.append(df)
                print(f"✅ Successfully loaded {file_name} from {language_folder} folder")
            except Exception as e:
                # Best effort: report the failure and continue with the next file
                print(f"❌ Error reading {file_path}: {e}")
        else:
            print(f"❌ File not found: {file_path}")

    if dfs:
        return pd.concat(dfs, ignore_index=True)
    else:
        print("No data was loaded. Returning empty DataFrame.")
        return pd.DataFrame()


In [None]:
# String label -> integer id, shared by both splits
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each split from the local copies under /content/data
train_data = load_data_local(train_files)
val_data = load_data_local(validation_files)

# Attach the integer label ids and make sure both frames carry a fresh 0..n-1 index
train_data = train_data.assign(label_id=train_data['label'].map(label_map)).reset_index(drop=True)
val_data = val_data.assign(label_id=val_data['label'].map(label_map)).reset_index(drop=True)

print(f"Training with {len(train_data)} examples")
print(f"Validating with {len(val_data)} examples")




✅ Successfully loaded train_ar.tsv from arabic folder
✅ Successfully loaded train_bg.tsv from bulgarian folder
✅ Successfully loaded train_de.tsv from german folder
✅ Successfully loaded train_en.tsv from english folder
✅ Successfully loaded train_it.tsv from italian folder
✅ Successfully loaded dev_ar.tsv from arabic folder
✅ Successfully loaded dev_bg.tsv from bulgarian folder
✅ Successfully loaded dev_de.tsv from german folder
✅ Successfully loaded dev_en.tsv from english folder
✅ Successfully loaded dev_it.tsv from italian folder
Training with 8926 examples
Validating with 2393 examples


In [None]:
# Per-language preview of the combined training frame.
# `train_data` is a single DataFrame, so `name in train_data` tests column labels and
# it cannot be indexed by file name; group on the `language` tag instead. `train_files`
# is deliberately left untouched so it can still be passed to load_data_local.
for lang_tag, lang_df in train_data.groupby('language', sort=False):
    print(f"{lang_tag}: {lang_df.shape}")
    print(lang_df.head(2))

In [None]:
# Sanity-check the combined training frame: last rows, dtypes, column names, shape,
# and a null count restricted to the last 10 rows.
print(train_data.tail(10))  # Show last 10 rows
print(train_data.dtypes)     # Data types of each column
print(train_data.columns)    # Column names
print(train_data.shape)      # (rows, columns)
print(train_data.tail(10).isnull().sum())  # NaNs in the last 10 rows


                               sentence_id  \
8916  9dc6b4dd-0370-4e70-8ff2-c17db43e1e1a   
8917  2defaff4-710e-4c81-bd2e-0cdbc08327a7   
8918  a0ceb734-05cb-4020-aeef-2d8bd3a5aae4   
8919  d1c914b5-eb77-492d-8cbc-32c80bcda387   
8920  595e2f97-ce1e-4694-ad38-eae671875d00   
8921  0e93f441-9faa-4b64-ad38-690e86b1d0f7   
8922  bdad5c4d-d160-4d8c-9e16-8f26512b8479   
8923  3e6f5ca0-7cfe-4a3b-96b0-b00af2f9186f   
8924  ddae1feb-9fb4-4574-828c-67fe8c3bb063   
8925  5266c6a6-85d6-4ef4-b9dc-cf7987456398   

                                               sentence label language  \
8916  Per questo è importante costruire una cabina d...   OBJ       it   
8917  L'evoluzione di quest'oggi aggiunge un altro t...  SUBJ       it   
8918  Dunque non ha escluso che Pfizer possa avere u...   OBJ       it   
8919  22 persone sono state arrestate nell’ambito di...   OBJ       it   
8920  Ogni singolo messaggio è stato inviato alla Po...   OBJ       it   
8921   Presi due della banda, il più piccolo ha 1

In [None]:
# Show both ends of the combined frame. Rows appear in the order the files were
# passed to load_data_local, so the head comes from the first file and the tail
# from the last one.
print("First few rows:", train_data.iloc[:5])     # First five rows (first file loaded)
print(":====================================")
print("Last few rows: ", train_data.iloc[-5:])    # Last five rows (last file loaded)


First few rows:             sentence_id                                           sentence  \
0    AlMasryAlYoum_10_1  عندما تولى الرئيس عبد الفتاح السيسى السلطة فى ...   
1    MAH_62-curl_01_005  في هذا السياق يشرح عيراني أنّ الخوف لا يقتصر ف...   
2   MIS_676-eurl_02_015  وشكك المدير التنفيذي وعميد مجلس المديرين التنف...   
3  MIS_2290-curl_03_018  ﻟﻜﻦ ﻟﻸﺳﻒ ﺍﻟﺮﺻﺎﺻﺔ ﻟﻢ ﺗﺼﺐ ﺃﻣﻪ ﻭ ﺍﺳﺘﻘﺮﺕ ﻓﻲ ﺭأﺱ  ر...   
4   MIS_460-eurl_02_001  الوزارة طالبت بضرورة احترام الإجراءات الوقائية...   

  label language  label_id  
0   OBJ       ar         0  
1   OBJ       ar         0  
2   OBJ       ar         0  
3   OBJ       ar         0  
4   OBJ       ar         0  
Last few rows:                                 sentence_id  \
8921  0e93f441-9faa-4b64-ad38-690e86b1d0f7   
8922  bdad5c4d-d160-4d8c-9e16-8f26512b8479   
8923  3e6f5ca0-7cfe-4a3b-96b0-b00af2f9186f   
8924  ddae1feb-9fb4-4574-828c-67fe8c3bb063   
8925  5266c6a6-85d6-4ef4-b9dc-cf7987456398   

                                               s

In [None]:
import pandas as pd

# Temporarily widen the pandas display so all columns are shown and lines are up to
# 1000 characters wide. max_rows is 10, so pandas elides the middle of the 30
# requested rows.
with pd.option_context('display.max_rows', 10, 'display.max_columns', None, 'display.width', 1000):
    print(train_data.tail(30))


                               sentence_id                                           sentence label language  label_id
8896  eca73829-ad31-4b74-8412-6c466ea901e9  I tempi sono invece più lunghi per il prossimo...   OBJ       it         0
8897  b79e9ad4-8989-43c1-b1fd-f348218319f5  Tra i settori in calo soprattutto gli autonomi...   OBJ       it         0
8898  3c799a87-a5df-4eb6-95a7-dc7673ac7d27  Pfizer è pronta a far decollare verso tutti i ...   OBJ       it         0
8899  d2850f01-e772-4248-8c6f-2e59ab0ca5fa   Creata nel 2013 da due neolaureati di Princet...   OBJ       it         0
8900  ec8dcabb-3245-4601-8f4f-a31081cc45a6  Nuovo picco di ricoverati Covid in Umbria, ogg...   OBJ       it         0
...                                    ...                                                ...   ...      ...       ...
8921  0e93f441-9faa-4b64-ad38-690e86b1d0f7   Presi due della banda, il più piccolo ha 15 anni   OBJ       it         0
8922  bdad5c4d-d160-4d8c-9e16-8f26512b8479  Il c

## Making the dataset

In [None]:
import torch
from torch.utils.data import Dataset

class SubjectivityDataset(Dataset):
    """Torch dataset that tokenizes sentences on access and pairs them with 0/1 labels."""

    def __init__(self, dataframe, tokenizer, max_length):
        """
        Dataset for subjectivity classification

        Args:
            dataframe: DataFrame with a 'sentence' column (text) and a 'label' column
                (either 'SUBJ'/'OBJ' strings in any letter case, or numeric values)
            tokenizer: Callable used to encode each sentence
            max_length: Maximum sequence length (sentences are padded/truncated to it)

        Raises:
            ValueError: If a string label is neither 'subj' nor 'obj' (case-insensitive)
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = dataframe
        self.text = dataframe['sentence'].tolist()  # Adjust column name if needed
        # Adjust column name if needed
        self.targets = [self._encode_label(raw) for raw in dataframe['label'].tolist()]

    @staticmethod
    def _encode_label(raw):
        """Map one raw label to its class id: 'subj' -> 1, 'obj' -> 0, non-strings via int()."""
        if not isinstance(raw, str):
            # Already numeric (int or float)
            return int(raw)
        lowered = raw.lower()
        if lowered == 'subj':
            return 1
        if lowered == 'obj':
            return 0
        raise ValueError(f"Unknown label: {raw}")

    def __len__(self):
        """Number of examples."""
        return len(self.text)

    def __getitem__(self, index):
        """Return 1-D 'input_ids' and 'attention_mask' tensors plus a scalar long 'labels' tensor."""
        encoded = self.tokenizer(
            str(self.text[index]),
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output has a leading batch dimension of 1; flatten it away
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.targets[index], dtype=torch.long)
        return item

In [None]:
# Run this code before training to inspect your data labels
print("Inspecting train data...")
print("\nLabel column name and first few values:")
for col in train_data.columns:
    if 'label' in col.lower():
        print(f"Column: {col}")
        print(f"Values: {train_data[col].head(5).tolist()}")
        print(f"Type: {type(train_data[col].iloc[0])}")

print("\nChecking for missing values:")
print(train_data.isnull().sum())

print("\nChecking data types:")
print(train_data.dtypes)

print("\nUnique labels:")
for col in train_data.columns:
    if 'label' in col.lower():
        print(f"{col}: {train_data[col].unique()}")

# If needed, convert labels to integers.
# NOTE: this overwrites the 'label' column of train_data and val_data in place. Once
# it has run, the column holds integers and the condition below is False on a re-run.
if 'label' in train_data.columns and train_data['label'].dtype == 'object':
    print("\nConverting string labels to integers...")

    # Accepted spellings per class - adjust based on your actual labels
    class_1_labels = ('subjective', 'subj', 'positive', 'pos', 'yes', 'true', '1')
    class_0_labels = ('objective', 'obj', 'negative', 'neg', 'no', 'false', '0')

    def convert_label(label):
        """Return 1 or 0 for a known label string, int(label) for non-strings.

        Raises ValueError for an unrecognised string instead of silently mapping it
        to class 0, which matches how SubjectivityDataset treats unknown labels.
        """
        if isinstance(label, str):
            key = label.strip().lower()
            if key in class_1_labels:
                return 1
            if key in class_0_labels:
                return 0
            raise ValueError(f"Unknown label: {label!r}")
        return int(label)

    # Convert and show sample results
    train_data['label'] = train_data['label'].apply(convert_label)
    val_data['label'] = val_data['label'].apply(convert_label)

    print("After conversion:")
    print(f"Train labels: {train_data['label'].head(5).tolist()}")
    print(f"Type: {type(train_data['label'].iloc[0])}")
    print(f"Unique values: {train_data['label'].unique()}")

Inspecting train data...

Label column name and first few values:
Column: label
Values: ['OBJ', 'OBJ', 'OBJ', 'OBJ', 'OBJ']
Type: <class 'str'>
Column: label_id
Values: [0, 0, 0, 0, 0]
Type: <class 'numpy.int64'>

Checking for missing values:
sentence_id    0
sentence       0
label          0
language       0
label_id       0
dtype: int64

Checking data types:
sentence_id    object
sentence       object
label          object
language       object
label_id        int64
dtype: object

Unique labels:
label: ['OBJ' 'SUBJ']
label_id: [0 1]

Converting string labels to integers...
After conversion:
Train labels: [0, 0, 0, 0, 0]
Type: <class 'numpy.int64'>
Unique values: [0 1]


In [None]:
import pandas as pd

# Print the first 10 rows with every column visible on one wide line; the display
# options only apply inside this `with` block.
with pd.option_context('display.max_rows', 10, 'display.max_columns', None, 'display.width', 1000):
    print(train_data.head(10))


            sentence_id                                           sentence  label language  label_id
0    AlMasryAlYoum_10_1  عندما تولى الرئيس عبد الفتاح السيسى السلطة فى ...      0       ar         0
1    MAH_62-curl_01_005  في هذا السياق يشرح عيراني أنّ الخوف لا يقتصر ف...      0       ar         0
2   MIS_676-eurl_02_015  وشكك المدير التنفيذي وعميد مجلس المديرين التنف...      0       ar         0
3  MIS_2290-curl_03_018  ﻟﻜﻦ ﻟﻸﺳﻒ ﺍﻟﺮﺻﺎﺻﺔ ﻟﻢ ﺗﺼﺐ ﺃﻣﻪ ﻭ ﺍﺳﺘﻘﺮﺕ ﻓﻲ ﺭأﺱ  ر...      0       ar         0
4   MIS_460-eurl_02_001  الوزارة طالبت بضرورة احترام الإجراءات الوقائية...      0       ar         0
5  MIS_1955-eurl_01_024  نحن نتحدث عن كيان صهيوني كل الناس يعرفون ماذا ...      1       ar         1
6  MIS_2585-eurl_01_013  - اعرف مكان أقرب مخرج طوارئ، وعُدّ كم صفا من ا...      0       ar         0
7  MIS_1466-eurl_04_011  تعرضت لضغوطات نفسية كثيرة أثناء جلسات التحقيق ...      0       ar         0
8               AJ_2_12  وقالت أخرى: "كنت هناك للتو، وفجأة حدث لي شيء ف...      1       ar 

In [None]:
# Re-check the schema of train_data; after the label conversion cell above has run,
# the 'label' column is expected to be integer-typed.
print(train_data.dtypes)     # Data types of each column
print(train_data.columns)    # Column names
print(train_data.shape)      # (rows, columns)

print(train_data.head(1))

sentence_id    object
sentence       object
label           int64
language       object
label_id        int64
dtype: object
Index(['sentence_id', 'sentence', 'label', 'language', 'label_id'], dtype='object')
(8926, 5)
          sentence_id                                           sentence  \
0  AlMasryAlYoum_10_1  عندما تولى الرئيس عبد الفتاح السيسى السلطة فى ...   

   label language  label_id  
0      0       ar         0  


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Score a batch of Trainer predictions.

    Args:
        pred: Prediction object from Trainer; `pred.label_ids` holds the reference
            labels and `pred.predictions` the per-class scores (argmax over the
            last axis gives the predicted class)

    Returns:
        dict: 'accuracy', 'f1', 'precision' and 'recall'; the last three are
        support-weighted averages over the classes
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    precision, recall, f1 = weighted_scores[0], weighted_scores[1], weighted_scores[2]
    acc = accuracy_score(y_true, y_pred)

    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

def train_model(train_dataset, val_dataset, BATCH_SIZE=4, LEARNING_RATE=2e-5, EPOCHS=3):
    """
    Fine-tune the checkpoint named by the notebook-level MODEL_NAME_ as a two-class classifier.

    Evaluation and checkpointing both happen once per epoch; the checkpoint with the
    best "f1" (as reported by compute_metrics) is reloaded when training ends, and a
    final evaluation on `val_dataset` is printed.

    Args:
        train_dataset: Training dataset
        val_dataset: Validation dataset
        BATCH_SIZE: Per-device batch size, used for both training and evaluation
        LEARNING_RATE: Learning rate
        EPOCHS: Number of training epochs

    Returns:
        model: Trained model
        trainer: Trainer object
    """
    # Two output labels; ignore_mismatched_sizes tolerates a checkpoint whose
    # weight shapes do not match the requested configuration.
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME_,
        num_labels=2,
        ignore_mismatched_sizes=True,
    )

    # Training configuration, collected in one place
    hf_args = dict(
        output_dir="./results",
        eval_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        save_strategy="epoch",
        logging_dir="./logs",
        report_to='none',
    )
    training_args = TrainingArguments(**hf_args)

    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Fit, then report metrics on the validation set
    trainer.train()
    eval_result = trainer.evaluate()
    print(f"Evaluation results: {eval_result}")

    return classifier, trainer

In [None]:


# 3. Set hyperparameters
# NOTE: these assignments replace the values given to the same names in the
# "1. Set up constants" cell near the top of the notebook.
MAX_LENGTH = 64  # passed to SubjectivityDataset as max_length
BATCH_SIZE = 4  # Consider reducing if you have memory issues
LEARNING_RATE = 3e-5
EPOCHS = 8



In [None]:
# 4. Load tokenizer
from transformers import AutoTokenizer
# Use the same checkpoint name that train_model() reads (MODEL_NAME_), so the
# tokenizer and the model come from the same checkpoint.
MODEL_NAME = MODEL_NAME_
print(f"Loading tokenizer for {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)





Loading tokenizer for GroNLP/mdebertav3-subjectivity-multilingual...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
# Smoke test: encode one word to see which fields the tokenizer returns.
tokenizer("sample")

{'input_ids': [1, 36850, 2], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [None]:
# 5. Create datasets
# Both splits share the tokenizer and MAX_LENGTH. SubjectivityDataset reads the
# 'sentence' and 'label' columns of each frame.
print("Creating datasets...")
train_dataset = SubjectivityDataset(train_data, tokenizer, MAX_LENGTH)
val_dataset = SubjectivityDataset(val_data, tokenizer, MAX_LENGTH)
print(f"Dataset creation complete. Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}")

Creating datasets...
Dataset creation complete. Train size: 8926, Validation size: 2393


In [None]:
# 6. Train model
# train_model() loads the checkpoint, trains for EPOCHS epochs, prints the final
# validation metrics and returns both the model and the Trainer (the Trainer is
# needed by the save cell below).
print("Starting model training...")
model, trainer = train_model(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    BATCH_SIZE=BATCH_SIZE,
    LEARNING_RATE=LEARNING_RATE,
    EPOCHS=EPOCHS
)



print("Training and evaluation complete!")

In [None]:
from transformers import AutoTokenizer

# Set save path (on the mounted Google Drive)
save_dir = '/content/drive/MyDrive/saved_model'

# Save using trainer
trainer.save_model(save_dir)         # Writes the trained model to save_dir
tokenizer.save_pretrained(save_dir) # The Trainer was created without a tokenizer, so save it explicitly

print(f"✅ Model and tokenizer saved to {save_dir}")


## Loading the trained model

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Reload the saved checkpoint (model weights + tokenizer) from Google Drive
save_dir = '/content/drive/MyDrive/saved_model'
model = AutoModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)

# Run on the GPU when torch can see one, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

print("✅ Model and tokenizer successfully loaded from Drive")


✅ Model and tokenizer successfully loaded from Drive


# Load the evaluation data

In [None]:
import os
import shutil

# Two-letter language code -> name of that language's folder
language_map = {
    'ar': 'arabic',
    'bg': 'bulgarian',
    'de': 'german',
    'en': 'english',
    'it': 'italian'
}

# For each language, copy the dev-test file and the unlabeled test file from Drive
# into an `eval` sub-folder of the local data directory.
for lang_code, lang_name in language_map.items():
    source_dir = f"/content/drive/MyDrive/data/{lang_name}"
    target_dir = f"/content/data/{lang_name}/eval"
    os.makedirs(target_dir, exist_ok=True)

    # The two files expected per language
    dev_test_file = f"dev_test_{lang_code}.tsv"
    unlabeled_file = f"test_{lang_code}_unlabeled.tsv"

    for file_name in (dev_test_file, unlabeled_file):
        source_file = os.path.join(source_dir, file_name)
        target_file = os.path.join(target_dir, file_name)

        # Report and skip files that are not on Drive
        if not os.path.exists(source_file):
            print(f"❌ File not found: {source_file}")
            continue

        shutil.copy(source_file, target_file)
        print(f"✅ Copied {file_name} to {target_dir}")


✅ Copied dev_test_ar.tsv to /content/data/arabic/eval
✅ Copied test_ar_unlabeled.tsv to /content/data/arabic/eval
✅ Copied dev_test_bg.tsv to /content/data/bulgarian/eval
❌ File not found: /content/drive/MyDrive/data/bulgarian/test_bg_unlabeled.tsv
✅ Copied dev_test_de.tsv to /content/data/german/eval
✅ Copied test_de_unlabeled.tsv to /content/data/german/eval
✅ Copied dev_test_en.tsv to /content/data/english/eval
✅ Copied test_en_unlabeled.tsv to /content/data/english/eval
✅ Copied dev_test_it.tsv to /content/data/italian/eval
✅ Copied test_it_unlabeled.tsv to /content/data/italian/eval


In [None]:
# Create a Dataset class for test data
class TestSubjectivityDataset(Dataset):
    """Torch dataset over a labelled evaluation frame that also reports each row's position.

    Reads the 'sentence' and 'label_id' columns of `data`.
    """

    def __init__(self, data, tokenizer, max_length):
        """Keep the frame, the tokenizer and the padding/truncation length."""
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = data

    def __len__(self):
        """Number of rows in the frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenizer fields (batch dimension removed), 'labels' and 'sentence_idx'."""
        row = self.data.iloc[idx]

        # Tokenize the sentence to a fixed length
        tokenized = self.tokenizer(
            row['sentence'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop the leading batch dimension from every returned tensor
        sample = {}
        for field, tensor in tokenized.items():
            sample[field] = tensor.squeeze(0)

        # Attach the label and the positional index of the row
        sample['labels'] = torch.tensor(row['label_id'], dtype=torch.long)
        sample['sentence_idx'] = idx

        return sample

## Just for testing predictions on all languages

In [None]:
from tqdm import tqdm
import csv
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_on_test_set(test_file_path, model, tokenizer, max_length=128, batch_size=16):
    """Evaluate ``model`` on a labelled TSV file and report the scores.

    The file is read as tab-separated text and must provide the columns
    ``sentence_id``, ``sentence`` and ``label`` (``OBJ`` or ``SUBJ``).

    Side effects: prints the metrics, a confusion matrix and up to five
    sampled misclassifications; writes ``predictions_<file name>``
    (``sentence_id`` and predicted ``label``) and, when at least one
    prediction is wrong, ``errors_<file name>`` using relative paths.

    Args:
        test_file_path (str): Path to the labelled TSV file.
        model: Sequence-classification model; inference runs on the device
            its parameters already live on.
        tokenizer: Tokenizer handed to ``TestSubjectivityDataset``.
        max_length (int): Padding/truncation length for every sentence.
        batch_size (int): Batch size used by the DataLoader.

    Returns:
        tuple: ``(results_df, scores)`` where ``results_df`` has the columns
        ``sentence_id``, ``sentence`` and ``label`` (the prediction) and
        ``scores`` holds ``macro-F1``, ``macro-P``, ``macro-R``, ``SUBJ-F1``,
        ``SUBJ-P``, ``SUBJ-R`` and ``accuracy``.

    Raises:
        ValueError: If the ``label`` column holds anything but OBJ/SUBJ.
    """
    # Load test data
    test_data = pd.read_csv(test_file_path, sep='\t')
    print(f"Loaded test data with {len(test_data)} examples")
    print(f"Columns: {test_data.columns.tolist()}")

    # Map labels to IDs; an unmapped label would otherwise turn into NaN and
    # end up being counted as "SUBJ" further down, silently skewing the scores
    label_ids = test_data['label'].map({'OBJ': 0, 'SUBJ': 1})
    unmapped = label_ids.isna()
    if unmapped.any():
        bad_values = sorted(test_data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label values in {test_file_path}: {bad_values}")
    test_data['label_id'] = label_ids.astype('int64')

    # Create dataset and dataloader
    test_dataset = TestSubjectivityDataset(test_data, tokenizer, max_length)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    # Evaluation mode
    model.eval()

    # Send the inputs to wherever the model weights are, rather than to a
    # notebook-global `device` that may be missing or stale on a fresh kernel
    model_device = next(model.parameters()).device

    # Lists to store results
    all_pred_labels = []
    all_true_labels = []
    all_indices = []

    # Perform predictions
    with torch.no_grad():
        for batch in test_dataloader:
            # Row positions are bookkeeping only and must not reach the model
            indices = batch.pop('sentence_idx')
            batch = {k: v.to(model_device) for k, v in batch.items()}

            # argmax over the logits picks the same class as argmax over softmax
            outputs = model(**batch)
            pred_classes = torch.argmax(outputs.logits, dim=-1)

            all_pred_labels.extend(pred_classes.cpu().tolist())
            all_true_labels.extend(batch['labels'].cpu().tolist())
            all_indices.extend(indices.tolist())

    # Convert to text labels
    pred_text_labels = ["OBJ" if p == 0 else "SUBJ" for p in all_pred_labels]
    true_text_labels = ["OBJ" if t == 0 else "SUBJ" for t in all_true_labels]

    # Create results dataframe (one positional lookup instead of one per row)
    ordered_rows = test_data.iloc[all_indices]
    results_df = pd.DataFrame({
        'sentence_id': ordered_rows['sentence_id'].tolist(),
        'sentence': ordered_rows['sentence'].tolist(),
        'label': pred_text_labels  # This is now 'label' as per required format
    })

    # Save predictions in format matching the evaluation script
    output_path = f"predictions_{os.path.basename(test_file_path)}"
    results_df[['sentence_id', 'label']].to_csv(output_path, sep='\t', index=False, quoting=csv.QUOTE_NONE)
    print(f"\nPredictions saved to {output_path}")

    # Predictions and gold labels side by side, aligned on the same row order
    whole_data = pd.DataFrame({
        'sentence_id': results_df['sentence_id'],
        'pred_label': results_df['label'],
        'gold_label': true_text_labels
    })

    # Calculate evaluation metrics matching the official script
    pred_values = whole_data['pred_label'].values
    gold_values = whole_data['gold_label'].values

    acc = accuracy_score(gold_values, pred_values)
    m_prec, m_rec, m_f1, m_s = precision_recall_fscore_support(gold_values, pred_values, average="macro", zero_division=0)
    p_prec, p_rec, p_f1, p_s = precision_recall_fscore_support(gold_values, pred_values, labels=["SUBJ"], zero_division=0)

    scores = {
        'macro-F1': m_f1,
        'macro-P': m_prec,
        'macro-R': m_rec,
        'SUBJ-F1': p_f1[0],
        'SUBJ-P': p_prec[0],
        'SUBJ-R': p_rec[0],
        'accuracy': acc
    }

    # Print results
    print(f"\n===== Model Performance on {test_file_path} =====")
    print(f"""
        macro-F1: {scores['macro-F1']:.4f}
        macro-P: {scores['macro-P']:.4f}
        macro-R: {scores['macro-R']:.4f}

        SUBJ-F1: {scores['SUBJ-F1']:.4f}
        SUBJ-P: {scores['SUBJ-P']:.4f}
        SUBJ-R: {scores['SUBJ-R']:.4f}

        accuracy: {scores['accuracy']:.4f}
    """)

    # Confusion matrix
    cm = confusion_matrix(gold_values, pred_values, labels=["OBJ", "SUBJ"])
    print("\nConfusion Matrix:")
    print("              Predicted")
    print("             OBJ    SUBJ")
    print(f"Actual OBJ  {cm[0,0]:4d}   {cm[0,1]:4d}")
    print(f"      SUBJ  {cm[1,0]:4d}   {cm[1,1]:4d}")

    # Error analysis - find examples where model was wrong
    is_wrong = whole_data['pred_label'] != whole_data['gold_label']
    if is_wrong.any():
        error_output_path = f"errors_{os.path.basename(test_file_path)}"
        # Attach the sentences by row position; merging on sentence_id would
        # multiply rows whenever an id occurs more than once in the file
        error_data = whole_data[is_wrong].assign(sentence=results_df.loc[is_wrong, 'sentence'])
        error_data.to_csv(error_output_path, sep='\t', index=False)
        print(f"Examples of misclassifications saved to {error_output_path}")

        # Print a few examples of misclassifications
        print("\nExamples of misclassifications:")
        sample_errors = error_data.sample(min(5, len(error_data)))
        for _, row in sample_errors.iterrows():
            print(f"Sentence ID: {row['sentence_id']}")
            print(f"Sentence: {row['sentence']}")
            print(f"True: {row['gold_label']}, Predicted: {row['pred_label']}")
            print("")

    return results_df, scores

In [None]:


# Two-letter language code -> name of that language's data folder
language_map = dict(
    ar='arabic',
    bg='bulgarian',
    de='german',
    en='english',
    it='italian',
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Scores returned by evaluate_on_test_set, keyed by language folder name
final = {}

# Evaluate the dev-test split of each language, skipping files that are absent
for lang_code, lang_name in language_map.items():
    test_file_name = f"dev_test_{lang_code}.tsv"
    test_file_path = os.path.join("/content/data", lang_name, "eval", test_file_name)

    if not os.path.exists(test_file_path):
        print(f"❌ File not found: {test_file_path}")
        continue

    print(f"🔍 Evaluating on {test_file_name}...")
    results, metrics = evaluate_on_test_set(test_file_path, model, tokenizer)
    final[lang_name] = metrics

print("\n✅ Evaluation complete for all available test sets.")



🔍 Evaluating on dev_test_ar.tsv...
Loaded test data with 748 examples
Columns: ['sentence_id', 'sentence', 'label']

Predictions saved to predictions_dev_test_ar.tsv

===== Model Performance on /content/data/arabic/eval/dev_test_ar.tsv =====

        macro-F1: 0.5222
        macro-P: 0.5276
        macro-R: 0.5280

        SUBJ-F1: 0.5062
        SUBJ-P: 0.4575
        SUBJ-R: 0.5666

        accuracy: 0.5227
    

Confusion Matrix:
              Predicted
             OBJ    SUBJ
Actual OBJ   208    217
      SUBJ   140    183
Examples of misclassifications saved to errors_dev_test_ar.tsv

Examples of misclassifications:
Sentence ID: MIS_1617-eurl_02_004
Sentence: وطمأن الشاب الجميع بأنه لا يوجد مصابين عرب أخرين في ووهان غيره، وعن المرض قال : “هذا المرض سهل ممتنع ويكاد يشبه الإنفولنزا ولكن بأكثر حدة وألم”، قائلا : “الأعراض صعبة ومؤلمة.
True: SUBJ, Predicted: OBJ

Sentence ID: MIS_195-eurl_05_009
Sentence: مع أخذ ذلك في الاعتبار، أعلن ولي العهد، محمد بن سلمان، في عام 2017 عن رؤية السعو

In [None]:
# Dump the metric dictionary collected for each language, one separator per entry
for language, metric_values in final.items():
    print("Language:", language, " has metrics ", metric_values)
    print(":======================================:")

Language: arabic  has metrics  {'macro-F1': 0.5221935338418513, 'macro-P': 0.5276005747126438, 'macro-R': 0.5279876160990712, 'SUBJ-F1': np.float64(0.5062240663900415), 'SUBJ-P': np.float64(0.4575), 'SUBJ-R': np.float64(0.56656346749226), 'accuracy': 0.5227272727272727}
Language: bulgarian  has metrics  {'macro-F1': 0.7423565784953624, 'macro-P': 0.7426624737945493, 'macro-R': 0.7420756813280178, 'SUBJ-F1': np.float64(0.704225352112676), 'SUBJ-P': np.float64(0.7075471698113207), 'SUBJ-R': np.float64(0.7009345794392523), 'accuracy': 0.748}
Language: german  has metrics  {'macro-F1': 0.8524762908324552, 'macro-P': 0.8418112568768514, 'macro-R': 0.8717665469943846, 'SUBJ-F1': np.float64(0.8076923076923077), 'SUBJ-P': np.float64(0.7411764705882353), 'SUBJ-R': np.float64(0.8873239436619719), 'accuracy': 0.8660714285714286}
Language: english  has metrics  {'macro-F1': 0.7277840269966254, 'macro-P': 0.7225378787878788, 'macro-R': 0.734127343537723, 'SUBJ-F1': np.float64(0.5984251968503937), '

## Prediction on Multilingual Dataset (unlabelled)

In [None]:
import os
import csv
import torch
import pandas as pd
from tqdm import tqdm

def predict_on_unlabelled(file_path, model, tokenizer, device, max_length=256, batch_size=16, output_path=None):
    """
    Predict labels for unlabelled TSV file and save in format ready for evaluation.

    The file is read as tab-separated text with quoting disabled, the model is
    moved to ``device`` and switched to eval mode, and sentences are tokenized
    and classified batch by batch. Class 0 is written as "OBJ", class 1 as "SUBJ".

    Args:
        file_path (str): Path to the unlabelled TSV file.
        model: Trained transformer model.
        tokenizer: Associated tokenizer.
        device: CUDA or CPU.
        max_length (int): Max token length for each input.
        batch_size (int): Batch size for inference.
        output_path (str, optional): Path to save predictions. If None, uses "predictions_[filename]".

    Returns:
        pd.DataFrame: DataFrame with sentence_id and label columns.

    Raises:
        ValueError: If the file lacks the 'sentence' or 'sentence_id' column.
    """
    # Read the unlabelled data
    df = pd.read_csv(file_path, sep='\t', quoting=csv.QUOTE_NONE)

    # Check required columns. The 'sentence' test used to be inverted, which
    # rejected every valid file and let invalid ones fail later with a KeyError.
    for required_column in ('sentence', 'sentence_id'):
        if required_column not in df.columns:
            raise ValueError(f"No '{required_column}' column found in {file_path}")

    # Prepare data
    texts = df['sentence'].tolist()
    sentence_ids = df['sentence_id'].tolist()
    predictions = []

    # Set model to evaluation mode
    model.to(device)
    model.eval()

    # Make predictions in batches
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc=f"Predicting {os.path.basename(file_path)}"):
            batch_texts = texts[i:i + batch_size]
            encoded = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            encoded = {k: v.to(device) for k, v in encoded.items()}

            outputs = model(**encoded)
            preds = torch.argmax(outputs.logits, dim=1)
            # Plain Python ints keep the label lookup below independent of numpy scalar types
            predictions.extend(preds.cpu().tolist())

    # Convert numeric predictions to text labels
    label_map = {0: "OBJ", 1: "SUBJ"}
    text_predictions = [label_map[pred] for pred in predictions]

    # Create results dataframe with required columns
    results_df = pd.DataFrame({
        'sentence_id': sentence_ids,
        'label': text_predictions
    })

    # Save predictions in required format
    if output_path is None:
        output_path = f"predictions_{os.path.basename(file_path)}"

    results_df.to_csv(output_path, sep='\t', index=False, quoting=csv.QUOTE_NONE)
    print(f"Predictions saved to {output_path}")

    return results_df

In [None]:
import os
import torch
import csv
import zipfile
from google.colab import files  # For downloading in Colab

# Name of the unlabelled split (without extension) and the submission setting
unlabelled_test_file = "test_multilingual_unlabeled"
setting = "multilingual"

# Set directories
data_dir = "/content/drive/MyDrive/data/multilingual"
output_dir = "/content/output"

# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define input and output paths
input_path = os.path.join(data_dir, f"{unlabelled_test_file}.tsv")
output_file_name = f"task1_test_{setting}.tsv"
output_path = os.path.join(output_dir, output_file_name)

# Run predictions
if os.path.exists(input_path):
    print(f"🔍 Predicting on {unlabelled_test_file}.tsv...")

    # Call the predict_on_unlabelled function with the output path
    predict_on_unlabelled(
        file_path=input_path,
        model=model,
        tokenizer=tokenizer,
        device=device,
        output_path=output_path
    )

    print(f"✅ Predictions saved to {output_path}")

    # Zip the TSV file
    zip_path = os.path.join(output_dir, f"task1_test_{setting}.zip")
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(output_path, output_file_name)  # Add TSV to ZIP with just the file name

    print(f"📦 Zipped to {zip_path}")

    # Download the ZIP file
    files.download(zip_path)

    # Only announce completion when prediction and zipping actually ran;
    # previously this was printed even after the "File not found" message
    print("📊 Prediction and zipping completed!")
else:
    print(f"❌ File not found: {input_path}")

In [None]:
def predict_subjectivity(model, tokenizer, sentences):
    """Classify ``sentences`` as "OBJ" or "SUBJ" in one forward pass.

    The input is tokenized in a single call (padded, truncated to the
    notebook-level ``MAX_LENGTH``). The model is moved to CUDA when it is
    available (CPU otherwise) and put in eval mode. The argmax over the
    logits is turned into text: class 0 -> "OBJ", anything else -> "SUBJ".

    Args:
        model: Sequence-classification model.
        tokenizer: Tokenizer matching the model.
        sentences: Text(s) passed straight to the tokenizer.

    Returns:
        list[str]: Predicted labels, in the order of the logits rows.
    """
    encoded = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    # Model and inputs have to sit on the same device
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(target)
    model_inputs = {name: tensor.to(target) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        class_ids = torch.argmax(model(**model_inputs).logits, dim=-1)

    prediction_labels = []
    for class_id in class_ids.cpu().numpy():
        prediction_labels.append("OBJ" if class_id == 0 else "SUBJ")

    return prediction_labels

In [None]:
# Create a Dataset class for test data
# NOTE(review): a class with this exact name is already declared in an earlier
# cell of this notebook; running this cell rebinds the name.
class TestSubjectivityDataset(Dataset):
    """Labelled test sentences served as ready-to-use model inputs.

    ``data`` is indexed positionally; each row must expose ``sentence`` and
    ``label_id``. Items are dictionaries holding the squeezed tokenizer
    tensors plus ``labels`` (long tensor) and ``sentence_idx`` (the index
    that was requested).
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        text, target = record['sentence'], record['label_id']

        tokenizer_options = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) drops the batch axis the tokenizer adds for a single text
        features = {
            name: tensor.squeeze(0)
            for name, tensor in self.tokenizer(text, **tokenizer_options).items()
        }

        features['labels'] = torch.tensor(target, dtype=torch.long)
        features['sentence_idx'] = idx
        return features

In [None]:
def evaluate_on_test_set(test_file_path, model, tokenizer, max_length=128, batch_size=16):
    """Evaluate ``model`` on a labelled TSV file with detailed per-row output.

    The file is read as tab-separated text and must provide the columns
    ``sentence_id``, ``sentence`` and ``label`` (``OBJ`` or ``SUBJ``).

    Side effects: prints weighted and per-class metrics, a confusion matrix
    and up to five sampled misclassifications; writes
    ``predictions_<file name>`` and, when at least one prediction is wrong,
    ``errors_<file name>`` using relative paths.

    Args:
        test_file_path (str): Path to the labelled TSV file.
        model: Sequence-classification model; inference runs on the device
            its parameters already live on.
        tokenizer: Tokenizer handed to ``TestSubjectivityDataset``.
        max_length (int): Padding/truncation length for every sentence.
        batch_size (int): Batch size used by the DataLoader.

    Returns:
        tuple: ``(results_df, metrics)``. ``results_df`` has the columns
        ``sentence_id``, ``sentence``, ``true_label``, ``predicted_label``,
        ``obj_score`` and ``subj_score``; ``metrics`` holds ``accuracy``,
        ``f1``, ``precision``, ``recall`` (weighted) and ``class_report``.

    Raises:
        ValueError: If the ``label`` column holds anything but OBJ/SUBJ.
    """
    # Load test data
    test_data = pd.read_csv(test_file_path, sep='\t')
    print(f"Loaded test data with {len(test_data)} examples")
    print(f"Columns: {test_data.columns.tolist()}")

    # Map labels to IDs; an unmapped label would otherwise turn into NaN and
    # corrupt the gold labels without any error being raised
    label_ids = test_data['label'].map({'OBJ': 0, 'SUBJ': 1})
    unmapped = label_ids.isna()
    if unmapped.any():
        bad_values = sorted(test_data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label values in {test_file_path}: {bad_values}")
    test_data['label_id'] = label_ids.astype('int64')

    # Create dataset and dataloader
    test_dataset = TestSubjectivityDataset(test_data, tokenizer, max_length)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    # Evaluation mode
    model.eval()

    # Send the inputs to wherever the model weights are, rather than to a
    # notebook-global `device` that may be missing or stale on a fresh kernel
    model_device = next(model.parameters()).device

    # Lists to store results
    all_predictions = []
    all_pred_labels = []
    all_true_labels = []
    all_indices = []

    # Perform predictions
    with torch.no_grad():
        for batch in test_dataloader:
            # Row positions are bookkeeping only and must not reach the model
            indices = batch.pop('sentence_idx')
            batch = {k: v.to(model_device) for k, v in batch.items()}

            # Class probabilities are kept because they are reported per row
            outputs = model(**batch)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            pred_classes = torch.argmax(predictions, dim=1)

            all_predictions.extend(predictions.cpu().numpy())
            all_pred_labels.extend(pred_classes.cpu().tolist())
            all_true_labels.extend(batch['labels'].cpu().tolist())
            all_indices.extend(indices.tolist())

    # Convert to text labels
    pred_text_labels = ["OBJ" if p == 0 else "SUBJ" for p in all_pred_labels]
    true_text_labels = ["OBJ" if t == 0 else "SUBJ" for t in all_true_labels]

    # Create results dataframe (one positional lookup instead of one per row)
    ordered_rows = test_data.iloc[all_indices]
    results_df = pd.DataFrame({
        'sentence_id': ordered_rows['sentence_id'].tolist(),
        'sentence': ordered_rows['sentence'].tolist(),
        'true_label': true_text_labels,
        'predicted_label': pred_text_labels,
        'obj_score': [round(p[0], 4) for p in all_predictions],
        'subj_score': [round(p[1], 4) for p in all_predictions],
    })

    # Calculate metrics
    accuracy = accuracy_score(all_true_labels, all_pred_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true_labels, all_pred_labels, average='weighted', zero_division=0
    )

    # Detailed report. `labels` pins both classes so the report and the 2x2
    # confusion matrix below still work when only one class occurs in the data.
    class_ids = [0, 1]
    class_report = classification_report(all_true_labels, all_pred_labels,
                                         labels=class_ids,
                                         target_names=['OBJ', 'SUBJ'],
                                         output_dict=True, zero_division=0)

    # Print detailed metrics
    print(f"\n===== Model Performance on {test_file_path} =====")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score (weighted): {f1:.4f}")
    print(f"Precision (weighted): {precision:.4f}")
    print(f"Recall (weighted): {recall:.4f}\n")

    # Print per-class metrics
    print("Class-wise Performance:")
    print(f"OBJ - Precision: {class_report['OBJ']['precision']:.4f}, "
          f"Recall: {class_report['OBJ']['recall']:.4f}, "
          f"F1: {class_report['OBJ']['f1-score']:.4f}")
    print(f"SUBJ - Precision: {class_report['SUBJ']['precision']:.4f}, "
          f"Recall: {class_report['SUBJ']['recall']:.4f}, "
          f"F1: {class_report['SUBJ']['f1-score']:.4f}")

    # Confusion matrix
    cm = confusion_matrix(all_true_labels, all_pred_labels, labels=class_ids)
    print("\nConfusion Matrix:")
    print("              Predicted")
    print("             OBJ    SUBJ")
    print(f"Actual OBJ  {cm[0,0]:4d}   {cm[0,1]:4d}")
    print(f"      SUBJ  {cm[1,0]:4d}   {cm[1,1]:4d}")

    # Save results
    output_path = f"predictions_{os.path.basename(test_file_path)}"
    results_df.to_csv(output_path, sep='\t', index=False)
    print(f"\nDetailed predictions saved to {output_path}")

    # Error analysis - find examples where model was wrong
    errors_df = results_df[results_df['true_label'] != results_df['predicted_label']]
    if not errors_df.empty:
        error_output_path = f"errors_{os.path.basename(test_file_path)}"
        errors_df.to_csv(error_output_path, sep='\t', index=False)
        print(f"Examples of misclassifications saved to {error_output_path}")

        # Print a few examples of misclassifications
        print("\nExamples of misclassifications:")
        sample_errors = errors_df.sample(min(5, len(errors_df)))
        for _, row in sample_errors.iterrows():
            print(f"Sentence ID: {row['sentence_id']}")
            print(f"Sentence: {row['sentence']}")
            print(f"True: {row['true_label']}, Predicted: {row['predicted_label']}")
            print(f"Confidence scores - OBJ: {row['obj_score']}, SUBJ: {row['subj_score']}")
            print("")

    return results_df, {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'class_report': class_report
    }

In [None]:
# # Train the model
# model, tokenizer = train_multilingual_subjectivity_classifier()

In [None]:
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

# data_dir = "/content/drive/MyDrive/data/multilingual"
# output_dir = "/content/output"


# language_map = {
#         'ar': 'arabic',
#         'bg': 'bulgarian',
#         'de': 'german',
#         'en': 'english',
#         'it': 'italian'
# }

# test_list = ["dev_test_it.tsv", "dev_test_en.tsv", "dev_test_de.tsv", "dev_test_ar.tsv", "dev_test_bg.tsv"]
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# for test_file in test_list:
#     test_file_p = test_file.split("_")[-1].split(".")[0]
#     test_file_path = os.path.join(data_dir, language_map[test_file_p], "eval", test_file)
#     print(f"Evaluating on {test_file_path}...")
#     results, metrics = evaluate_on_test_set(test_file, model, tokenizer)

# # If you want to further analyze per-language performance, you can add that logic here
# print("\nEvaluation complete!")

In [None]:
# # Save to Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# # Define the save path
# save_directory = "/content/drive/MyDrive/multilingual-subjectivity-classifier-model"

# # Save model and tokenizer
# model.save_pretrained(save_directory)
# tokenizer.save_pretrained(save_directory)

# print(f"Model successfully saved to {save_directory}")

## Prediction on the unlabelled datasets (zero-shot)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Define dataset class for unlabelled data
class UnlabelledDataset(Dataset):
    """Map-style dataset over a DataFrame that only needs a ``sentence`` column.

    Every item is the tokenizer output for one row's sentence with the batch
    dimension squeezed away, plus ``sentence_idx``: a tensor holding the
    positional row index so predictions can be matched back to their rows.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data.iloc[idx]['sentence']

        encoding = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Remove batch dimension
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}

        # Add sentence index for tracking
        encoding['sentence_idx'] = torch.tensor(idx)

        return encoding

In [None]:
def predict_on_unlabelled(data_file_path, model, tokenizer, max_length=128, batch_size=16):
    """
    Run predictions on unlabelled data and save the results.

    The file is parsed as tab-separated first; when that fails to parse or
    does not expose a 'sentence' column, a comma-separated parse is tried.
    Predictions are written to ``predictions_<file name>`` (relative path,
    tab-separated) and the OBJ/SUBJ distribution is printed.

    Args:
        data_file_path: Path to the unlabelled data file (CSV/TSV)
        model: The trained model; it is moved to CUDA when available, CPU otherwise
        tokenizer: Tokenizer for the model
        max_length: Maximum sequence length
        batch_size: Batch size for inference

    Returns:
        DataFrame with ``sentence_id`` and ``label`` columns. When the input
        has no 'sentence_id' column, the positional row index is used instead.

    Raises:
        ValueError: If no 'sentence' column is found with either separator.
    """
    # Get device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data. Only parser errors trigger the fallback; a bare `except`
    # would also swallow KeyboardInterrupt and unrelated failures.
    try:
        data = pd.read_csv(data_file_path, sep='\t')
    except pd.errors.ParserError:
        data = pd.read_csv(data_file_path)
    else:
        if 'sentence' not in data.columns:
            # A comma-separated file usually parses without error under
            # sep='\t' (as one wide column), so retry with the default separator
            try:
                comma_data = pd.read_csv(data_file_path)
            except pd.errors.ParserError:
                comma_data = None
            if comma_data is not None and 'sentence' in comma_data.columns:
                data = comma_data

    print(f"Loaded data with {len(data)} examples")
    print(f"Columns: {data.columns.tolist()}")

    # Ensure 'sentence' column exists
    if 'sentence' not in data.columns:
        raise ValueError("Data must contain a 'sentence' column")

    # Create dataset and dataloader
    dataset = UnlabelledDataset(data, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    # The inputs are sent to `device` below, so the model has to be there too
    model.to(device)
    model.eval()

    # Lists to store results
    all_pred_labels = []
    all_indices = []

    # Perform predictions
    with torch.no_grad():
        for batch in dataloader:
            # Row positions are bookkeeping only and must not reach the model
            indices = batch.pop('sentence_idx')
            batch = {k: v.to(device) for k, v in batch.items()}

            # argmax over the logits picks the same class as argmax over softmax
            outputs = model(**batch)
            pred_classes = torch.argmax(outputs.logits, dim=-1)

            all_pred_labels.extend(pred_classes.cpu().tolist())
            all_indices.extend(indices.tolist())

    # Convert to text labels
    pred_text_labels = ["OBJ" if p == 0 else "SUBJ" for p in all_pred_labels]

    # Fall back to the row position when the file carries no sentence ids
    if 'sentence_id' in data.columns:
        sentence_ids = data.iloc[all_indices]['sentence_id'].tolist()
    else:
        sentence_ids = list(all_indices)

    # Create results dataframe
    results_df = pd.DataFrame({
        'sentence_id': sentence_ids,
        'label': pred_text_labels,
    })

    # Save results
    output_path = f"predictions_{os.path.basename(data_file_path)}"
    results_df.to_csv(output_path, sep='\t', index=False)
    print(f"\nPredictions saved to {output_path}")

    # Print distribution of predictions
    obj_count = sum(1 for label in pred_text_labels if label == "OBJ")
    subj_count = sum(1 for label in pred_text_labels if label == "SUBJ")
    total = len(pred_text_labels)
    # Guard the percentages so an empty input file does not divide by zero
    obj_share = obj_count / total if total else 0.0
    subj_share = subj_count / total if total else 0.0

    print("\nPrediction Distribution:")
    print(f"OBJ: {obj_count} ({obj_share:.2%})")
    print(f"SUBJ: {subj_count} ({subj_share:.2%})")

    return results_df

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

# Root folder with one sub-folder per language; output_dir is not used in this cell
data_dir = "/content/drive/MyDrive/data/"
output_dir = "/content/output"

# Language code used in the file names -> folder name under data_dir
language_map = dict(pol='polish', ukr='ukrainian', ro='romanian', gr='greek')

# One unlabelled test file per language, in the order of language_map
test_list = [f"test_{code}_unlabeled.tsv" for code in language_map]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for test_file in test_list:
    # The language code is the second "_"-separated token of the file name
    test_file_p = test_file.split("_")[1].strip()
    language_folder = language_map[test_file_p]
    test_file_path = os.path.join(data_dir, language_folder, test_file)
    print(f"Evaluating on {test_file_path}...")
    results = predict_on_unlabelled(test_file_path, model, tokenizer)

# Per-language analysis of the predictions could be added at this point
print("\nEvaluation complete!")

Evaluating on /content/drive/MyDrive/data/polish/test_pol_unlabeled.tsv...
Using device: cuda
Loaded data with 351 examples
Columns: ['sentence_id', 'sentence']

Predictions saved to predictions_test_pol_unlabeled.tsv

Prediction Distribution:
OBJ: 296 (84.33%)
SUBJ: 55 (15.67%)
Evaluating on /content/drive/MyDrive/data/ukrainian/test_ukr_unlabeled.tsv...
Using device: cuda
Loaded data with 297 examples
Columns: ['sentence_id', 'sentence']

Predictions saved to predictions_test_ukr_unlabeled.tsv

Prediction Distribution:
OBJ: 195 (65.66%)
SUBJ: 102 (34.34%)
Evaluating on /content/drive/MyDrive/data/romanian/test_ro_unlabeled.tsv...
Using device: cuda
Loaded data with 206 examples
Columns: ['sentence_id', 'sentence']

Predictions saved to predictions_test_ro_unlabeled.tsv

Prediction Distribution:
OBJ: 137 (66.50%)
SUBJ: 69 (33.50%)
Evaluating on /content/drive/MyDrive/data/greek/test_gr_unlabeled.tsv...
Using device: cuda
Loaded data with 284 examples
Columns: ['sentence_id', 'sentence