In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv
/kaggle/input/jigsaw-agile-community-rules/train.csv
/kaggle/input/jigsaw-agile-community-rules/test.csv


In [8]:
!pip install -q transformers datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvid

In [9]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import torch


In [10]:
# Read the competition's train and test tables from the Kaggle input directory.
csv_paths = {
    'train': '/kaggle/input/jigsaw-agile-community-rules/train.csv',
    'test': '/kaggle/input/jigsaw-agile-community-rules/test.csv',
}
train, test = (pd.read_csv(csv_paths[split]) for split in ('train', 'test'))

# Quick sanity check on the number of rows and columns that were loaded.
print(f"Train shape: {train.shape}, Test shape: {test.shape}")


Train shape: (2029, 9), Test shape: (10, 8)


*Preprocess Text*

# Tokenizer & Model Name

In [11]:
def build_input_text(row):
    """Flatten one data row into the single prompt string given to the model.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys 'rule',
            'positive_example_1', 'positive_example_2', 'negative_example_1',
            'negative_example_2' and 'body'.

    Returns:
        str: The rule, both positive examples, both negative examples and the
        comment body, each introduced by its label and separated by single spaces.
    """
    positives = f"{row['positive_example_1']} {row['positive_example_2']}"
    negatives = f"{row['negative_example_1']} {row['negative_example_2']}"
    segments = [
        f"Rule: {row['rule']}",
        f"Positive Examples: {positives}",
        f"Negative Examples: {negatives}",
        f"Comment: {row['body']}",
    ]
    return " ".join(segments)

# Add the assembled prompt as an 'input_text' column on both tables.
for frame in (train, test):
    frame['input_text'] = frame.apply(build_input_text, axis=1)


In [12]:
# Checkpoint shared by the tokenizer here and the classifier loaded later.
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def encode(examples):
    """Tokenize the 'input_text' field of a batch.

    Every sequence is truncated and padded to exactly 256 tokens, so all
    encoded rows have the same length.

    NOTE(review): build_input_text puts the comment body last; assuming the
    tokenizer truncates from the right (TODO confirm), long rules/examples
    would push comment text past the 256-token cut-off.
    """
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=256)
    return tokenizer(examples['input_text'], **tokenizer_options)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



# Cross-Validation Setup

In [13]:
# Shuffled 5-fold stratified splitter with a fixed seed.
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

# Zero-filled accumulators: one slot per training row (out-of-fold predictions)
# and one slot per test row (fold-averaged predictions).
oof_preds, test_preds = (np.zeros(len(frame)) for frame in (train, test))


In [14]:
# Switch off Weights & Biases logging through the environment.
os.environ["WANDB_DISABLED"] = "true"
from transformers import TrainingArguments

# NOTE(review): the fold loop later in the notebook constructs its own
# TrainingArguments and rebinds `training_args`, so this object does not
# appear to be the one the Trainer actually receives.
shared_batch_size = 16
training_args = TrainingArguments(
    output_dir="./models",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",  # Disable W&B
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=shared_batch_size,
    per_device_eval_batch_size=shared_batch_size,
)


# Training Loop for 5 Folds

In [None]:
def _positive_class_probs(logits):
    """Turn raw two-class logits into the probability of class 1.

    Args:
        logits: Array-like of shape (n_rows, 2) as returned by Trainer.predict.

    Returns:
        numpy.ndarray of length n_rows with the softmax probability of class 1.
    """
    return torch.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()


def compute_metrics(eval_pred):
    """Compute ROC-AUC from the (logits, labels) pair supplied by the Trainer."""
    logits, labels = eval_pred
    return {"roc_auc": roc_auc_score(labels, _positive_class_probs(logits))}


def _tokenized_dataset(frame, label_col=None):
    """Build a tokenized, torch-formatted HF Dataset from `frame['input_text']`.

    Args:
        frame: DataFrame containing an 'input_text' column (and `label_col` if given).
        label_col: Optional name of the target column; when provided it is
            carried along and renamed to 'labels'.

    Returns:
        Dataset exposing 'input_ids', 'attention_mask' and, when `label_col`
        is given, 'labels' as torch tensors.
    """
    source_cols = ['input_text'] + ([label_col] if label_col else [])
    dataset = Dataset.from_pandas(frame[source_cols]).map(encode, batched=True)
    tensor_cols = ['input_ids', 'attention_mask']
    if label_col:
        dataset = dataset.rename_column(label_col, 'labels')
        tensor_cols.append('labels')
    dataset.set_format(type='torch', columns=tensor_cols)
    return dataset


# Take the fold count from the splitter so the test-set average stays correct
# if n_splits is ever changed (it was previously a hard-coded 5).
n_folds = skf.get_n_splits()

# Start from clean accumulators: test_preds is built up with `+=`, so re-running
# this cell on a live kernel would otherwise stack a second set of predictions
# on top of the first.
oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

# The test set is identical for every fold, so tokenize it once up front.
hf_test = _tokenized_dataset(test)

for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['rule_violation'])):
    print(f"***** Fold {fold+1} *****")

    train_fold = train.iloc[train_idx]
    val_fold = train.iloc[val_idx]

    hf_train = _tokenized_dataset(train_fold, label_col='rule_violation')
    hf_val = _tokenized_dataset(val_fold, label_col='rule_violation')

    # Reload the pretrained checkpoint so each fold starts from the same weights.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    training_args = TrainingArguments(
        output_dir=f"./fold_{fold+1}",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=50,
        save_total_limit=1,
        # Disable external loggers explicitly; the WANDB_DISABLED environment
        # variable is deprecated in favour of this argument.
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_train,
        eval_dataset=hf_val,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Out-of-fold predictions: each training row is scored by the one model
    # that did not see it during training.
    oof_preds[val_idx] = _positive_class_probs(trainer.predict(hf_val).predictions)

    # Test predictions are averaged evenly across folds.
    test_preds += _positive_class_probs(trainer.predict(hf_test).predictions) / n_folds

    # Drop this fold's model and trainer before the next fold builds its own,
    # so two fine-tuned models are not held in memory at once.
    del trainer, model
    torch.cuda.empty_cache()


***** Fold 1 *****


Map:   0%|          | 0/1623 [00:00<?, ? examples/s]

Map:   0%|          | 0/406 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


# Out-of-Fold Score

In [None]:
# Score the stitched-together out-of-fold predictions against the true labels.
oof_auc = roc_auc_score(train['rule_violation'], oof_preds)
print("OOF ROC-AUC:", oof_auc)


# Submission

In [None]:
# Pair every test row_id with its fold-averaged probability and write the CSV.
submission = test[['row_id']].assign(rule_violation=test_preds)
submission.to_csv('submission.csv', index=False)
print(submission.head())
