In [20]:
import pandas as pd

# 1) Load your dataset
#    (If file is in /content, adjust path accordingly)
df = pd.read_csv("cyber_law_super_dataset[1].csv")

print("Shape of raw dataset:", df.shape)
print("\nColumns:", df.columns.tolist())

print("\nSample rows:")
display(df.head(5))

# 2) Basic cleaning
# Strip extra spaces from string columns
str_cols = ["scenario", "law", "section_number", "section_title",
            "section_text", "punishment", "tags", "intent_label"]

for c in str_cols:
    if c in df.columns:
        df[c] = df[c].astype(str).str.strip()

# Drop rows where scenario or intent is missing (safety)
df = df[(df["scenario"] != "") & (df["intent_label"] != "")]
df = df.reset_index(drop=True)

print("\nAfter cleaning, shape:", df.shape)

print("\nUnique intent labels:")
print(df["intent_label"].value_counts())

# 3) Build LAW TABLE (unique legal sections for retriever)

law_cols = ["law", "section_number", "section_title", "section_text", "punishment"]

df_laws = (
    df[law_cols]
    .drop_duplicates()
    .reset_index(drop=True)
)

print("\nUnique law sections:", df_laws.shape[0])
display(df_laws.head(10))

# 4) Save cleaned files

df.to_csv("cyber_dataset_clean.csv", index=False)
df_laws.to_csv("cyber_law_sections_unique.csv", index=False)

print("\nSaved:")
print(" - cyber_dataset_clean.csv  (for intent training etc.)")
print(" - cyber_law_sections_unique.csv  (for FAISS / retriever)")


Shape of raw dataset: (2000, 8)

Columns: ['scenario', 'law', 'section_number', 'section_title', 'section_text', 'punishment', 'tags', 'intent_label']

Sample rows:


Unnamed: 0,scenario,law,section_number,section_title,section_text,punishment,tags,intent_label
0,UPI payment fraud happened through PhonePe,IPC,354D,Cyber stalking,Repeated online contact causing fear.,3 years jail,cyber_fraud,CYBER_FRAUD
1,I received a phishing link pretending to be Sn...,IT Rules 2021,Rule 4,Significant social media rules,Grievance officer and takedown timelines.,,general_cyber_query,GENERAL_CYBER_QUERY
2,A scammer impersonated a Facebook officer and ...,IPC,509,Insulting modesty of a woman,Online obscene messages.,3 years jail,cyber_fraud,CYBER_FRAUD
3,A fake profile was created using my photos on ...,DPDP Act 2023,9,Data principal rights,"Right to access, correction, erase data.",,cyber_identity_theft,CYBER_IDENTITY_THEFT
4,A loan app leaked my contact list and threaten...,IT Act 2000,67A,Sexually explicit content,Publishing sexually explicit content online.,5 years jail + fine,cyber_privacy_violation,CYBER_PRIVACY_VIOLATION



After cleaning, shape: (2000, 8)

Unique intent labels:
intent_label
CYBER_FRAUD                628
GENERAL_CYBER_QUERY        470
CYBER_PRIVACY_VIOLATION    367
CYBER_HACKING              197
CYBER_IDENTITY_THEFT       177
CYBER_HARASSMENT           161
Name: count, dtype: int64

Unique law sections: 25


Unnamed: 0,law,section_number,section_title,section_text,punishment
0,IPC,354D,Cyber stalking,Repeated online contact causing fear.,3 years jail
1,IT Rules 2021,Rule 4,Significant social media rules,Grievance officer and takedown timelines.,
2,IPC,509,Insulting modesty of a woman,Online obscene messages.,3 years jail
3,DPDP Act 2023,9,Data principal rights,"Right to access, correction, erase data.",
4,IT Act 2000,67A,Sexually explicit content,Publishing sexually explicit content online.,5 years jail + fine
5,IPC,354A,Sexual harassment,Online sexual harassment.,3 years jail
6,IT Act 2000,66D,Cheating by personation using computer resources,Online impersonation to cheat.,3 years jail + fine
7,IT Rules 2021,Rule 7,Non-observance penalties,Intermediaries lose safe-harbor protections.,
8,IPC,471,Using forged digital documents,Using forged digital IDs.,7 years jail
9,IT Act 2000,66E,Violation of privacy,Capturing or transmitting private images.,3 years jail + fine



Saved:
 - cyber_dataset_clean.csv  (for intent training etc.)
 - cyber_law_sections_unique.csv  (for FAISS / retriever)


In [None]:

# 1. IMPORTS

import os
os.environ["WANDB_DISABLED"] = "true"   # disable wandb

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)


# 2. LOAD & CLEAN DATASET
# Required columns: scenario, intent_label

df = pd.read_csv("cyber_dataset_clean.csv")

# Remove ANY unwanted columns
remove_cols = [
    "__index_level_0__", "law", "section_title", "section_text",
    "tags", "section_number", "punishment", "label_id"
]
for col in remove_cols:
    if col in df.columns:
        df = df.drop(columns=[col])

# Keep only useful columns
df = df[["scenario", "intent_label"]]

# Save cleaned file
df.to_csv("cyber_intent_clean.csv", index=False)

print("CLEANED DATA:")
print(df.head())

# 3. LABEL ENCODING
le = LabelEncoder()
df["label"] = le.fit_transform(df["intent_label"])

df.to_csv("cyber_intent_final.csv", index=False)

label_names = list(le.classes_)
print("\nLABELS:", label_names)

# 4. PREPARE DATASET (HuggingFace Dataset)
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

train_ds = dataset["train"]
test_ds = dataset["test"]

# 5. TOKENIZER + MODEL
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

num_labels = len(label_names)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label={i: l for i, l in enumerate(label_names)},
    label2id={l: i for i, l in enumerate(label_names)},
)

# 6. TOKENIZATION FUNCTION

def tokenize(batch):
    return tokenizer(
        batch["scenario"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

train_tok = train_ds.map(tokenize, batched=True)
test_tok = test_ds.map(tokenize, batched=True)

# Remove unused columns
train_tok = train_tok.remove_columns(["scenario", "intent_label"])
test_tok = test_tok.remove_columns(["scenario", "intent_label"])

train_tok.set_format("torch")
test_tok.set_format("torch")

# 7. TRAINING ARGUMENTS
training_args = TrainingArguments(
    output_dir="cyber_intent_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",
    remove_unused_columns=True,
)


# 8. TRAINER

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)


# 9. TRAIN THE MODEL
trainer.train()

# Save everything
trainer.save_model("cyber_intent_model")
tokenizer.save_pretrained("cyber_intent_model")

print("\nðŸŽ‰ TRAINING COMPLETE!")
print("Model saved to: cyber_intent_model")


CLEANED DATA:
                                            scenario             intent_label
0         UPI payment fraud happened through PhonePe              CYBER_FRAUD
1  I received a phishing link pretending to be Sn...      GENERAL_CYBER_QUERY
2  A scammer impersonated a Facebook officer and ...              CYBER_FRAUD
3  A fake profile was created using my photos on ...     CYBER_IDENTITY_THEFT
4  A loan app leaked my contact list and threaten...  CYBER_PRIVACY_VIOLATION

LABELS: ['CYBER_FRAUD', 'CYBER_HACKING', 'CYBER_HARASSMENT', 'CYBER_IDENTITY_THEFT', 'CYBER_PRIVACY_VIOLATION', 'GENERAL_CYBER_QUERY']


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.032,0.01775
2,0.0106,0.006678
3,0.0079,0.005193





ðŸŽ‰ TRAINING COMPLETE!
Model saved to: cyber_intent_model
