In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/subtask2gemmatraintest/train.csv
/kaggle/input/subtask2gemmatraintest/test.csv


In [2]:
df=pd.read_csv('/kaggle/input/subtask2gemmatraintest/train.csv')
# df=df[:100]
max_length = df['text'].str.len().max()
print("Max string length:", max_length)


Max string length: 879


In [3]:
!pip install transformers
!pip install peft
!pip install datasets
!pip install bitsandbytes
!pip install accelerate
!pip install scikit-learn
!pip uninstall -y trl

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)

In [4]:
# DoRA version - Multi-label classification for Arabic text
import os
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

os.environ["HF_TOKEN"] ='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'
os.environ["WANDB_DISABLED"] = "true"

# Model configuration
model_id = "google/gemma-7b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Define label mappings for each task
emotion_labels = ['anger', 'disgust', 'neutral', 'love', 'joy', 'anticipation', 
                  'optimism', 'sadness', 'confidence', 'pessimism', 'surprise', 'fear']
emotion_mapping = {label: idx for idx, label in enumerate(emotion_labels)}
emotion_id2label = {idx: label for idx, label in enumerate(emotion_labels)}

offensive_mapping = {"no": 0, "yes": 1}
hate_mapping = {"not_hate": 0, "hate": 1}

# Total labels: 12 emotions + 1 offensive + 1 hate = 14
num_labels = len(emotion_labels) + 2

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    token=os.environ['HF_TOKEN'],
    num_labels=num_labels,
    torch_dtype=torch.bfloat16,
    problem_type="multi_label_classification"
)

# DoRA configuration
dora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    use_dora=True
)

model = get_peft_model(model, dora_config)

# Load and preprocess data
# df = pd.read_csv('/kaggle/input/subtask-1-datasets/train.csv')
data = Dataset.from_pandas(df)

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        max_length=1024,
        return_tensors=None
    )
    
    # Create multi-label targets
    labels = []
    for i in range(len(examples["text"])):
        label_vector = [0.0] * num_labels
        
        # Emotion label (one-hot)
        emotion = examples["Emotion"][i]
        if emotion in emotion_mapping:
            label_vector[emotion_mapping[emotion]] = 1.0
        
        # Offensive label
        offensive = examples["Offensive"][i]
        if offensive == "yes":
            label_vector[12] = 1.0  # Position 12 for offensive
        
        # Hate label (only if offensive is yes)
        if offensive == "yes":
            hate = examples["Hate"][i]
            if hate == "hate":
                label_vector[13] = 1.0  # Position 13 for hate
        
        labels.append(label_vector)
    
    model_inputs["labels"] = labels
    return model_inputs

tokenized_data = data.map(
    preprocess_function,
    batched=True,
    remove_columns=["text", "Emotion", "Offensive", "Hate"]
)

train_dataset = tokenized_data
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors="pt")

training_args = TrainingArguments(
    output_dir="./arabic_multilabel_outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    num_train_epochs=3,
    learning_rate=1e-4,
    logging_steps=10,
    save_steps=50,
    save_strategy="steps",
    optim="paged_adamw_8bit",
    bf16=True,
    dataloader_drop_last=False,
    remove_unused_columns=True,
    report_to=[],
)

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    
    def _prepare_inputs(self, inputs):
        if isinstance(inputs, dict):
            inputs.pop('num_items_in_batch', None)
        return super()._prepare_inputs(inputs)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

print("Starting DoRA multi-label training...")
trainer.train()

print("Saving DoRA model...")
model.save_pretrained("./arabic_multilabel_dora_final")
tokenizer.save_pretrained("./arabic_multilabel_dora_final")

def predict_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.sigmoid(outputs.logits)
    
    # Parse predictions
    emotion_probs = predictions[0][:12]
    emotion_idx = torch.argmax(emotion_probs).item()
    
    offensive_prob = predictions[0][12].item()
    hate_prob = predictions[0][13].item()
    
    return {
        'emotion': emotion_id2label[emotion_idx],
        'emotion_confidence': emotion_probs[emotion_idx].item(),
        'offensive': "yes" if offensive_prob > 0.5 else "no",
        'offensive_confidence': offensive_prob,
        'hate': "hate" if hate_prob > 0.5 else "not_hate",
        'hate_confidence': hate_prob
    }

print("DoRA multi-label training completed!")

2025-07-24 18:50:30.253407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753383030.422828      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753383030.479054      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5960 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting DoRA multi-label training...


Step,Training Loss
10,3.1929
20,1.362
30,0.6831
40,0.4615
50,0.3988
60,0.38
70,0.4139
80,0.35
90,0.3056
100,0.3137


Saving DoRA model...
DoRA multi-label training completed!


In [5]:
text="""الوقت يمضي بلا انتظار."""
predict_text(text)

{'emotion': 'surprise',
 'emotion_confidence': 0.2436424344778061,
 'offensive': 'no',
 'offensive_confidence': 0.0002959570847451687,
 'hate': 'not_hate',
 'hate_confidence': 5.144220995134674e-05}

In [6]:
df=pd.read_csv('/kaggle/input/subtask2gemmatraintest/test.csv')
# df=df[:10]

In [7]:
import pandas as pd


# Initialize prediction columns
df['Emotion_Predicted'] = None
df['Offensive_Predicted'] = None
df['Hate_Predicted'] = None

for i in range(len(df)):
    print(i)
    text = df['text'][i]

    # Always convert to string, then strip
    if pd.notnull(text):
        clean_text = str(text).strip()
    else:
        clean_text = ""

    if clean_text:  # Check if it's not empty after conversion
        try:
            prediction = predict_text(clean_text)
            df.at[i, 'Emotion_Predicted'] = prediction['emotion']
            df.at[i, 'Offensive_Predicted'] = prediction['offensive']
            
            # Only set Hate prediction if Offensive is not 'no'
            if prediction['offensive'].lower() != 'no':
                df.at[i, 'Hate_Predicted'] = prediction['hate']
            else:
                df.at[i, 'Hate_Predicted'] = None
        except Exception as e:
            df.at[i, 'Emotion_Predicted'] = "not_applicable"
            df.at[i, 'Offensive_Predicted'] = "not_applicable"
            df.at[i, 'Hate_Predicted'] = None
    else:
        df.at[i, 'Emotion_Predicted'] = "not_applicable"
        df.at[i, 'Offensive_Predicted'] = "not_applicable"
        df.at[i, 'Hate_Predicted'] = None

# Keep only 'id' and the prediction columns
df = df[['id', 'Emotion_Predicted', 'Offensive_Predicted', 'Hate_Predicted']]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [8]:
df

Unnamed: 0,id,Emotion_Predicted,Offensive_Predicted,Hate_Predicted
0,7775,disgust,yes,not_hate
1,6675,anger,yes,not_hate
2,5493,anger,yes,not_hate
3,7191,love,no,
4,3206,optimism,no,
...,...,...,...,...
1273,4928,anger,yes,not_hate
1274,1057,anticipation,no,
1275,4047,disgust,yes,not_hate
1276,5041,anger,no,


In [9]:
df.to_csv('prediction.csv',index = False)
