In [1]:
# !uv pip install kagglehub more_itertools torch transformers

In [2]:
import pandas as pd
import os

def is_submission():
    """Return the raw value of the ``KAGGLE_IS_COMPETITION_RERUN`` environment variable.

    Returns
    -------
    str or None
        The variable's string value when it is set, ``None`` when it is not.
        Callers use the result in a truthiness check.
    """
    rerun_flag = os.environ.get("KAGGLE_IS_COMPETITION_RERUN")
    return rerun_flag

# Choose the competition CSV: test.csv when is_submission() is truthy, train.csv otherwise.
path = "/kaggle/input/jigsaw-agile-community-rules/" + (
    "test.csv" if is_submission() else "train.csv"
)

In [3]:
# Load the selected CSV into a DataFrame, then show its first row as the cell output.
data = pd.read_csv(filepath_or_buffer=path)
data.iloc[:1]

Unnamed: 0,row_id,body,rule,subreddit,positive_example_1,positive_example_2,negative_example_1,negative_example_2
0,2029,NEW RAP GROUP 17. CHECK US OUT https://soundcl...,"No Advertising: Spam, referral links, unsolici...",hiphopheads,"Hey, guys, just wanted to drop in and invite y...",Cum Swallowing Hottie Katrina Kaif Cartoon Xvi...,SD Stream Eng - [Chelsea TV USA](http://soccer...,HD Streams: |[ENG HD Stoke vs Manchester Unite...


### Steps for initial solution

- Step 1: Add a column that expands each rule by describing it, so that the rule is understandable when it is placed in the prompt.
- Step 2: Update each row with a prompt that includes its positive and negative examples.
- Step 3: Finally, set up the system prompt by describing the nature of the tool, and get the output as violation or non-violation along with other information.
            

In [4]:
import kagglehub
import more_itertools
import pandas as pd
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

In [5]:
# Handle passed to kagglehub.model_download to fetch the checkpoint files.
hugging_face_model_name = "google/gemma-3/transformers/gemma-3-1b-it"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Download the checkpoint, then build the tokenizer and the causal LM from the same local path.
GEMMA_PATH = kagglehub.model_download(hugging_face_model_name)
processor = AutoTokenizer.from_pretrained(GEMMA_PATH)
model = AutoModelForCausalLM.from_pretrained(GEMMA_PATH)
model = model.to(device)

2025-10-04 10:34:40.570836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759574080.793871      83 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759574080.863936      83 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
def expand_rules(rule, model, processor, device):
    """Ask the language model to elaborate a short community rule.

    Parameters
    ----------
    rule
        Rule text that is interpolated into the user prompt.
    model
        Object exposing ``generate(**inputs, ...)`` whose result has a
        ``sequences`` attribute (prompt tokens followed by generated tokens).
    processor
        Tokenizer: callable on the prompt text, and providing ``eos_token_id``,
        ``unk_token_id``, ``convert_tokens_to_ids`` and ``batch_decode``.
    device
        Target passed to ``.to(device)`` on the tokenized inputs.

    Returns
    -------
    str
        The decoded generation, cut at the first ``"--"`` (everything from
        that marker onwards is discarded).
    """

    SYS_PROMPT_RULE = """
    You are an expert with understanding of community guidlines and related keywords.
    For a given community rule provided, you go through its keywords and exapand each keyword in details. 
    
    Output should only contain the expanded rules no explanatory text.
    """

    USER_PROMPT_RULE = f"""
    Expand below rule
    
    {rule}
    
    """

    # Gemma's chat format only defines `user` and `model` turns; there is no
    # `system` role. System-level instructions therefore go at the start of
    # the user turn, separated from the request by a blank line.
    prompt_rule_text = (
        f"<start_of_turn>user\n{SYS_PROMPT_RULE}\n\n{USER_PROMPT_RULE}<end_of_turn>\n"
        f"<start_of_turn>model\n"
    )

    rule_inputs = processor(prompt_rule_text, return_tensors="pt").to(device)

    # The model closes its turn with `<end_of_turn>`, which is not the
    # tokenizer's EOS token. Passing only `processor.eos_token_id` would let
    # generation run past the end of the answer until `max_new_tokens`, so
    # both ids are registered as stop tokens.
    stop_token_ids = [processor.eos_token_id]
    end_of_turn_id = processor.convert_tokens_to_ids("<end_of_turn>")
    if end_of_turn_id != processor.unk_token_id:
        stop_token_ids.append(end_of_turn_id)
    stop_token_ids = [tid for tid in dict.fromkeys(stop_token_ids) if tid is not None]

    output_rules = model.generate(
        **rule_inputs,
        max_new_tokens=300,
        repetition_penalty=1,
        eos_token_id=stop_token_ids or None,
        pad_token_id=processor.eos_token_id,
        return_dict_in_generate=True
    )

    # Drop the prompt tokens so that only the newly generated part is decoded.
    generated_tokens = output_rules.sequences[:, rule_inputs["input_ids"].shape[-1]:]
    decoded_text = processor.batch_decode(generated_tokens, skip_special_tokens=True)

    return decoded_text[0].split("--")[0]
        

In [7]:
import re

# Expand every distinct rule exactly once; the result is joined back onto the
# full table by rule text, so the model is not called again for repeated rules.
rule_df = data[['rule']].drop_duplicates().reset_index(drop=True)

# Collapse runs of three or more newlines to a single blank line. Replacing
# them with an empty string would glue the surrounding paragraphs together
# with no separator at all.
rule_df['expanded_rule'] = [
    re.sub(r'\n{3,}', "\n\n", expand_rules(rule, model, processor, device))
    for rule in rule_df.rule
]

W1004 10:35:29.348000 83 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode


In [8]:
# Attach the expanded rule text to every row by matching on the rule string.
# Any `expanded_rule` column left over from a previous run of this cell is
# dropped first: merging twice would otherwise produce `expanded_rule_x` /
# `expanded_rule_y` and the `expanded_rule` attribute read later would be gone.
data = data.drop(columns=["expanded_rule"], errors="ignore").merge(
    rule_df, on="rule", how="left"
)

In [None]:
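# Draft of the structured output the classifier could return (notes only, not executable code):
# {
#     violation: 1 or 0,
#     probability: <between 0 and 1, higher the better.>,
#     justification: <reason why the specific violation value was chosen.>
# }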

In [38]:
def classify_comment_with_rule(data_row, model, processor, device):
    """Ask the model whether a comment violates its (expanded) community rule.

    Parameters
    ----------
    data_row
        Object exposing ``expanded_rule``, ``body``, ``positive_example_1``,
        ``positive_example_2``, ``negative_example_1`` and
        ``negative_example_2`` as attributes (for instance a DataFrame row).
    model
        Object exposing ``generate(**inputs, ...)`` whose result has a
        ``sequences`` attribute (prompt tokens followed by generated tokens).
    processor
        Tokenizer: callable on the prompt text, and providing
        ``eos_token_id`` and ``batch_decode``.
    device
        Target passed to ``.to(device)`` on the tokenized inputs.

    Returns
    -------
    str
        The single decoded token, cut at the first ``"--"`` and stripped of
        surrounding whitespace. The prompt asks for ``"0"`` (no violation) or
        ``"1"`` (violation); the value is not validated here.
    """

    # Exactly one token is generated below, so the prompt must ask for a
    # one-character answer. Requesting a float would truncate an answer such
    # as "0.9" to "0" and flip its meaning.
    SYSTEM_PROMPT_CLASSIFICATION = """
    
    you are a community rule voilation inspector. 
    if you are give the rule with context and target comment along with negative and positive comments you can decide if the target comment is in voilation of the specific rule.
    you will use positive comment to understand how those comments voilates the rule and negative comments to understand why negative comments are not breaking the rules,
    you are required to answer only one of the 2 values, 0 or 1. 0 denotes "not voilation" and 1 denotes "voilation".
    
    output must be exactly one character: 0 or 1.
    
    """

    # Each example is interpolated on its own line. Formatting them together
    # as a tuple would insert the tuple's repr (quotes, parentheses and
    # escaped newlines) into the prompt instead of the raw comment text.
    USER_PROMPT_CLASSIFICATION = f"""
    Classify the below comment based on the rule given, 
    Rule : {data_row.expanded_rule}
    comment = {data_row.body}
    
    below are the positive comment (voilation) and negative comment (non voilation) related to the comment, 
    use these commnet to better understand the above comment: 
    
    positive comment 1 : {data_row.positive_example_1}
    positive comment 2 : {data_row.positive_example_2}
    negative comment 1 : {data_row.negative_example_1}
    negative comment 2 : {data_row.negative_example_2}
    
    """

    # Gemma's chat format only defines `user` and `model` turns; there is no
    # `system` role. System-level instructions therefore go at the start of
    # the user turn, separated from the request by a blank line.
    prompt_classify_text = (
        f"<start_of_turn>user\n{SYSTEM_PROMPT_CLASSIFICATION}\n\n"
        f"{USER_PROMPT_CLASSIFICATION}<end_of_turn>\n"
        f"<start_of_turn>model\n"
    )

    classifier_inputs = processor(prompt_classify_text, return_tensors="pt").to(device)

    # No repetition penalty: it is applied to tokens that already occur in the
    # sequence, and the prompt itself contains the answer tokens "0" and "1",
    # so a penalty would bias the model away from both valid answers.
    # Sampling is switched off so the same row always yields the same answer.
    output_classify = model.generate(
        **classifier_inputs,
        max_new_tokens=1,
        do_sample=False,
        eos_token_id=processor.eos_token_id,
        pad_token_id=processor.eos_token_id,
        return_dict_in_generate=True
    )

    # Drop the prompt tokens so that only the newly generated token is decoded.
    generated_tokens = output_classify.sequences[:, classifier_inputs["input_ids"].shape[-1]:]
    decoded_text = processor.batch_decode(generated_tokens, skip_special_tokens=True)
    return decoded_text[0].split("--")[0].strip()

In [39]:
# Classify every row in DataFrame order; `columns` holds one decoded answer per row.
columns = list(
    map(
        lambda indexed_row: classify_comment_with_rule(indexed_row[1], model, processor, device),
        data.iterrows(),
    )
)

In [42]:
# Build the submission table (row id plus the model's answer) and write it
# to submission.csv without the index column.
df = data.loc[:, ["row_id"]].assign(rule_violation=columns)
df.to_csv("submission.csv", index=False)