In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from fpdf import FPDF
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.cuda.amp import autocast, GradScaler


# Fetch the NLTK resources used for tokenization and stop-word filtering.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)


# Load the labelled dataset.
df = pd.read_csv("harassment_dataset_large.csv")

# Give every distinct label an integer id and store it alongside the
# original label so the classifier can be trained on numeric targets.
unique_labels = df["label"].unique()
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
df["label_encoded"] = df["label"].map(label_mapping)


# Lazily filled cache so the stop-word corpus is read from disk only once,
# no matter how many rows are preprocessed.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalize a raw message for the classifier.

    The text is lower-cased, every character that is not a letter, digit or
    whitespace is removed, the result is tokenized with NLTK and English
    stop words are dropped.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    # Set membership is O(1); the original re-read the stop-word list and
    # scanned it linearly for every single token.
    stop_words = _english_stop_words()
    return " ".join(
        word for word in nltk.word_tokenize(text) if word not in stop_words
    )

# Clean every message once up front; training and validation both draw
# from this column.
df['cleaned_text'] = df['text'].apply(preprocess_text)


# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
all_texts = df['cleaned_text'].tolist()
all_labels = df['label_encoded'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


class HarassmentDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors='pt')``.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label tensor for position ``idx``."""
        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_options)

        # squeeze(0) removes the leading dimension of the returned tensors
        # so each item is a flat sequence.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = HarassmentDataset(train_texts, train_labels, tokenizer)
val_dataset = HarassmentDataset(val_texts, val_labels, tokenizer)

# Both loaders share batch size and memory pinning; only the training
# loader shuffles.
loader_options = dict(batch_size=16, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# One output logit per distinct label in the dataset.
num_labels = len(df["label"].unique())
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)
model.to(device)


# Optimizer and gradient scaler used by the mixed-precision training loop.
optimizer = Adam(model.parameters(), lr=2e-5)
scaler = GradScaler()


def _evaluate(model, data_loader):
    """Return ``(average loss, accuracy)`` of ``model`` over ``data_loader``."""
    model.eval()
    loss_sum, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss_sum += outputs.loss.item()
            correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    return loss_sum / max(len(data_loader), 1), correct / max(seen, 1)


def train_model(model, train_loader, val_loader, epochs=3):
    """Fine-tune ``model`` with mixed precision and validate after each epoch.

    Uses the module-level ``optimizer``, ``scaler`` and ``device``.

    Args:
        model: Sequence-classification model whose forward pass returns an
            object with ``loss`` and ``logits`` when ``labels`` is given.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        val_loader: Iterable of batches in the same format, evaluated at the
            end of every epoch. Pass ``None`` to skip validation.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is printed; the model is updated in place and left in
        evaluation mode when training finishes.
    """
    for epoch in range(epochs):
        # Re-enter training mode every epoch: the validation pass below
        # switches the model to eval mode.
        model.train()
        total_loss = 0.0
        for step, batch in enumerate(train_loader):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            # Scale the loss before backward so small gradients survive the
            # reduced precision; the scaler unscales before the optimizer step.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Read the scalar once instead of calling .item() twice.
            loss_value = loss.item()
            total_loss += loss_value
            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}, Loss: {loss_value}")

        print(f"Epoch {epoch+1}: Avg Loss {total_loss / max(len(train_loader), 1)}")

        # The original accepted val_loader but never used it.
        if val_loader is not None:
            val_loss, val_accuracy = _evaluate(model, val_loader)
            print(f"Epoch {epoch+1}: Val Loss {val_loss}, Val Accuracy {val_accuracy}")

    # Leave the model ready for inference (dropout disabled).
    model.eval()

train_model(model, train_loader, val_loader)


# Map each encoded class id back to the IPC section stored for it in the
# dataset, so a predicted id can be turned into a section.
ipc_mapping = dict(zip(df["label_encoded"], df["ipc_section"]))

test_text = "Send me the money or I'll expose your secrets!"

# generate_legal_document takes (model, tokenizer, text, complainant_info,
# device); calling it with the text alone raised a TypeError. "name",
# "address" and "contact" are the keys it reads without a default, so
# placeholders are supplied for them here.
# NOTE: generate_legal_document is defined further down in this notebook;
# that cell must have been executed before this call runs.
complainant_info = {
    "name": "[Complainant Name]",
    "address": "[Complainant Address]",
    "contact": "[Contact Details]",
}
pdf_file = generate_legal_document(model, tokenizer, test_text, complainant_info, device)
print(f"Legal complaint saved as {pdf_file}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():  # Mixed precision for speed


Epoch 1, Step 0, Loss: 1.78497314453125
Epoch 1, Step 10, Loss: 1.72332763671875
Epoch 1, Step 20, Loss: 1.49578857421875
Epoch 1, Step 30, Loss: 1.208343505859375
Epoch 1, Step 40, Loss: 1.11883544921875
Epoch 1, Step 50, Loss: 0.584503173828125
Epoch 1, Step 60, Loss: 0.4856414794921875
Epoch 1, Step 70, Loss: 0.3196868896484375
Epoch 1, Step 80, Loss: 0.1832427978515625
Epoch 1, Step 90, Loss: 0.15970230102539062
Epoch 1, Step 100, Loss: 0.1082763671875
Epoch 1, Step 110, Loss: 0.1002960205078125
Epoch 1, Step 120, Loss: 0.07943153381347656
Epoch 1, Step 130, Loss: 0.06897163391113281
Epoch 1, Step 140, Loss: 0.0562896728515625
Epoch 1, Step 150, Loss: 0.0464019775390625
Epoch 1, Step 160, Loss: 0.04709053039550781
Epoch 1, Step 170, Loss: 0.03851890563964844
Epoch 1, Step 180, Loss: 0.036380767822265625
Epoch 1, Step 190, Loss: 0.03704547882080078
Epoch 1, Step 200, Loss: 0.03231334686279297
Epoch 1, Step 210, Loss: 0.028351783752441406
Epoch 1, Step 220, Loss: 0.024602890014648438

TypeError: generate_legal_document() missing 4 required positional arguments: 'tokenizer', 'text', 'complainant_info', and 'device'

In [None]:

# Reference text for each supported IPC section, keyed by the bare section
# number as a string (e.g. "354A"). Every entry holds:
#   "title"      - short name of the offence
#   "definition" - description of the offence
#   "punishment" - penalty text
#   "remedies"   - list of actions printed under "Requested Actions"
# generate_legal_document looks sections up here and falls back to the
# "506" entry when a section is not present.
# NOTE(review): the legal wording and penalties are hard-coded; verify them
# against the current statute before relying on generated documents.
IPC_DETAILS = {
    "384": {
        "title": "Extortion",
        "definition": "Whoever intentionally puts any person in fear of any injury to that person or any other, and thereby dishonestly induces the person so put in fear to deliver to any person any property or valuable security.",
        "punishment": "Imprisonment up to 3 years, or fine, or both.",
        "remedies": [
            "Immediate FIR registration",
            "Preservation of threat evidence",
            "Protection order from magistrate"
        ]
    },
    "354": {
        "title": "Assault or Criminal Force to Woman with Intent to Outrage her Modesty",
        "definition": "Whoever assaults or uses criminal force to any woman, intending to outrage or knowing it to be likely that he will thereby outrage her modesty.",
        "punishment": "Imprisonment up to 2 years, or fine, or both.",
        "remedies": [
            "Medical examination",
            "Statement recording by woman officer",
            "Protection order"
        ]
    },
    "354A": {
        "title": "Sexual Harassment",
        "definition": "Unwelcome physical contact and advances or demand or request for sexual favors or showing pornography against the will of a woman or making sexually colored remarks.",
        "punishment": "Up to 3 years imprisonment, or fine, or both.",
        "remedies": [
            "Complaint to Internal Committee",
            "Preserve digital evidence",
            "Right to transfer perpetrator"
        ]
    },
    "354D": {
        "title": "Stalking",
        "definition": "Following a woman or contacting or attempting to contact her despite clear indication of disinterest by her.",
        "punishment": "Up to 3 years imprisonment (first offense), up to 5 years (repeat offense).",
        "remedies": [
            "Restraining order",
            "Preserve call/SMS logs",
            "Cyber cell complaint"
        ]
    },
    "506": {
        "title": "Criminal Intimidation",
        "definition": "Whoever threatens another with injury to person, reputation or property with intent to cause alarm.",
        "punishment": "Up to 2 years imprisonment, or fine, or both.",
        "remedies": [
            "Preserve threat evidence",
            "Protection order",
            "Weapon seizure if applicable"
        ]
    },
    "509": {
        "title": "Word, Gesture or Act Intended to Insult Modesty",
        "definition": "Whoever intentionally uses words, sounds or gestures to intrude upon the privacy of a woman.",
        "punishment": "Simple imprisonment up to 1 year, or fine, or both.",
        "remedies": [
            "Audio/video evidence preservation",
            "Witness statements",
            "Community protection order"
        ]
    }
}

def predict_ipc_section(model, tokenizer, text, device):
    """Predict IPC section using the trained model.

    The text is cleaned with ``preprocess_text``, tokenized to a fixed length
    of 128 and classified; the predicted class id is translated through the
    module-level ``ipc_mapping``.

    Args:
        model: Classifier whose forward pass returns an object with
            ``logits``.
        tokenizer: Tokenizer callable returning ``input_ids`` and
            ``attention_mask`` tensors.
        text: Raw incident description.
        device: Torch device the input tensors are moved to.

    Returns:
        The IPC section stored in ``ipc_mapping`` for the predicted class,
        or ``"506"`` when the class id has no entry.
    """
    cleaned_text = preprocess_text(text)

    encoding = tokenizer(
        cleaned_text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # The model may still be in training mode after fine-tuning. Dropout must
    # be disabled for inference, otherwise predictions are noisy and can
    # differ between identical calls. The caller's mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            prediction = torch.argmax(outputs.logits, dim=1).item()
    finally:
        model.train(was_training)

    return ipc_mapping.get(prediction, "506")

def _normalise_ipc_section(section):
    """Reduce a label such as ``"IPC 503"`` to the bare section (``"503"``)."""
    cleaned = re.sub(
        r'^\s*(?:IPC)?\s*(?:Section)?\s*', '', str(section), flags=re.IGNORECASE
    ).strip()
    return cleaned or "506"


def _latin1_safe(value):
    """Replace characters the core Times font cannot encode with ``?``."""
    return str(value).encode('latin-1', 'replace').decode('latin-1')


def generate_legal_document(model, tokenizer, text, complainant_info, device):
    """Generate a professional legal complaint PDF.

    Args:
        model: Classifier passed through to ``predict_ipc_section``.
        tokenizer: Tokenizer passed through to ``predict_ipc_section``.
        text: Incident description; it is classified and quoted in the
            complaint.
        complainant_info: Mapping with the complainant's details. ``name``,
            ``address`` and ``contact`` are required. ``police_station``,
            ``city``, ``state``, ``pin_code``, ``incident_datetime``,
            ``location``, ``perpetrator``, ``witnesses`` and ``evidence``
            are optional and replaced by placeholders when absent.
        device: Torch device passed through to ``predict_ipc_section``.

    Returns:
        The name of the PDF file that was written.

    Raises:
        KeyError: If a required ``complainant_info`` key is missing.
    """
    # Imported here because the visible module imports never bring
    # ``datetime`` into scope.
    from datetime import datetime

    # Fail before running the model or building the PDF if a mandatory
    # field is absent.
    missing = [key for key in ("name", "address", "contact") if key not in complainant_info]
    if missing:
        raise KeyError(f"complainant_info is missing required key(s): {', '.join(missing)}")

    # Dataset labels can carry an "IPC " prefix (e.g. "IPC 503") while
    # IPC_DETAILS is keyed by the bare number, so normalise before the lookup.
    ipc_section = _normalise_ipc_section(
        predict_ipc_section(model, tokenizer, text, device)
    )
    ipc_info = IPC_DETAILS.get(ipc_section.upper(), IPC_DETAILS["506"])

    # One timestamp for the whole document so the dates and the file name
    # cannot disagree across a minute boundary.
    now = datetime.now()

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)

    def write_line(content, height=7):
        # Single-line cell followed by a line break (ln=1).
        pdf.cell(0, height, _latin1_safe(content), 0, 1)

    def write_paragraph(content):
        # multi_cell's fifth positional parameter is ``align`` (it has no
        # ``ln``); the original passed 1 there. 'L' states the left
        # alignment explicitly.
        pdf.multi_cell(0, 7, _latin1_safe(content), 0, 'L')

    def write_heading(title):
        pdf.set_font("Times", 'B', 12)
        write_line(title, height=10)
        pdf.set_font("Times", size=12)

    # Addressee
    write_line("To,", height=10)
    write_line("The Station House Officer")
    write_line(complainant_info.get("police_station", "[Local Police Station Name]"))
    write_line(f"{complainant_info.get('city', '[City]')}, {complainant_info.get('state', '[State]')}")
    write_line(f"PIN: {complainant_info.get('pin_code', '[PIN]')}")
    pdf.ln(10)

    # Subject
    pdf.set_font("Times", 'B', 14)
    write_line(f"Subject: Complaint under IPC Section {ipc_section} ({ipc_info['title']})", height=10)
    pdf.set_font("Times", size=12)
    pdf.ln(10)

    # Salutation and introduction
    write_paragraph("Respected Madam/Sir,")
    pdf.ln(5)

    write_paragraph(
        f"I, {complainant_info['name']}, resident of {complainant_info['address']}, hereby lodge this formal complaint regarding the following incident:"
    )
    pdf.ln(5)

    # 1. Incident details
    write_heading("1. Incident Details:")

    incident_time = complainant_info.get('incident_datetime', now.strftime("%d/%m/%Y %H:%M"))
    write_paragraph(f"On {incident_time}, at {complainant_info.get('location', '[Location]')}:")
    write_paragraph(f'"{text}"')
    pdf.ln(3)

    for detail in (
        f"Perpetrator: {complainant_info.get('perpetrator', '[Unknown]')}",
        f"Witnesses: {complainant_info.get('witnesses', 'None')}",
        f"Evidence: {complainant_info.get('evidence', 'Available upon request')}",
    ):
        write_paragraph(detail)
    pdf.ln(5)

    # 2. Legal provisions
    write_heading("2. Legal Provisions:")
    for legal_text in (
        f"IPC Section {ipc_section}: {ipc_info['title']}",
        f"Definition: {ipc_info['definition']}",
        f"Punishment: {ipc_info['punishment']}",
    ):
        write_paragraph(legal_text)
    pdf.ln(5)

    # 3. Requested actions
    write_heading("3. Requested Actions:")
    for remedy in ipc_info['remedies']:
        write_paragraph(f"- {remedy}")
    pdf.ln(5)

    # 4. Declaration
    write_heading("4. Declaration:")
    write_paragraph("I solemnly declare that the information provided above is true to the best of my knowledge and belief. I understand that false complaints are punishable under law.")
    pdf.ln(10)

    # Date, place and signature block
    write_line(f"Date: {now.strftime('%d/%m/%Y')}")
    write_line(f"Place: {complainant_info.get('city', '[City]')}")
    pdf.ln(15)

    write_line("Yours faithfully,")
    pdf.ln(10)
    pdf.set_font("Times", 'B', 12)
    write_line(complainant_info['name'])
    pdf.set_font("Times", size=12)
    write_paragraph(complainant_info['address'])
    write_line(f"Contact: {complainant_info['contact']}")

    filename = f"Complaint_IPC_{ipc_section}_{now.strftime('%Y%m%d_%H%M')}.pdf"
    pdf.output(filename)

    return filename

# Demo: classify one sample incident and write the matching complaint PDF.
if __name__ == "__main__":

    test_text = "My boss has been making inappropriate sexual comments and threatened to fire me when I objected"

    # Sample complainant record covering every field the generator reads.
    complainant_info = dict(
        name="Priya Sharma",
        address="A-204, Sunshine Apartments, Mumbai - 400001",
        contact="Phone: 9876543210 | Email: priya@example.com",
        police_station="Women's Police Station",
        city="Mumbai",
        state="Maharashtra",
        pin_code="400001",
        incident_datetime="29/03/2025 14:30",
        location="XYZ Company Office, Mumbai",
        perpetrator="Mr. Ajay Malhotra (Manager at XYZ Company)",
        witnesses="Ms. Meena (Receptionist), Office CCTV",
        evidence="WhatsApp messages, voice recordings",
    )

    pdf_file = generate_legal_document(
        model,
        tokenizer,
        test_text,
        complainant_info,
        device,
    )
    print(f"Legal complaint saved as {pdf_file}")

Legal complaint saved as Complaint_IPC_IPC 503_20250329_1804.pdf
