In [9]:
import pandas as pd
import re
import os

# --- Configuration ---
# File locations used by this cell, kept together so they can be changed in
# one place. Both are relative paths, so they resolve against the kernel's
# current working directory.
# INPUT_CSV_PATH  - raw dataset loaded with pd.read_csv further down.
# OUTPUT_CSV_PATH - destination of the cleaned frame written with df.to_csv.
INPUT_CSV_PATH = 'IPC_Sections_Final.csv'
OUTPUT_CSV_PATH = 'IPC_Sections_cleaned.csv'

# --- Helper Function ---
def clean_text(text):
    """
    Normalise a piece of text to lowercase ASCII letters, digits and single spaces.

    Processing order:
      1. every literal backslash-n sequence (the two characters ``\\`` + ``n``)
         becomes a space; real newline characters are whitespace and are
         handled by step 3,
      2. every character that is not an ASCII letter, a digit or whitespace
         is deleted,
      3. runs of whitespace collapse to one space and the ends are trimmed,
      4. the result is lowercased.

    Anything that is not a ``str`` (None, NaN, numbers, ...) yields ``""``.
    """
    if isinstance(text, str):
        without_escaped_breaks = re.sub(r'\\n', ' ', text)
        letters_digits_spaces = re.sub(r'[^a-zA-Z0-9\s]', '', without_escaped_breaks)
        single_spaced = re.sub(r'\s+', ' ', letters_digits_spaces).strip()
        return single_spaced.lower()

    # Non-string input: nothing sensible to clean.
    return ""

# --- Main Script ---

# The cleaning workflow only runs when the input CSV is present; otherwise the
# cell reports the problem and does nothing else.
dataset_available = os.path.exists(INPUT_CSV_PATH)
if dataset_available:
    # 1. Load the dataset
    df = pd.read_csv(INPUT_CSV_PATH)
    loaded_rows, loaded_columns = df.shape
    print("✅ Dataset loaded successfully.")
    print(f"Original shape: {loaded_rows} rows, {loaded_columns} columns")

    # 2. Handle missing values: a row is removed if ANY of its columns is null
    original_rows = len(df)
    df.dropna(inplace=True)
    new_rows = len(df)
    remaining_rows, remaining_columns = df.shape
    print("\n✅ Rows with any missing data have been removed.")
    print(f"   - Rows removed: {original_rows - new_rows}")
    print(f"   - Shape after dropping nulls: {remaining_rows} rows, {remaining_columns} columns")

    # 3. Derive 'cleaned_text' from 'full_legal_text' (the source column is kept)
    print("\n✅ Cleaning the 'full_legal_text' column...")
    df['cleaned_text'] = df['full_legal_text'].apply(clean_text)
    print("   - Text cleaning complete.")

    # 4. Persist the cleaned frame, without the index column
    df.to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"\n✅ Cleaned data has been saved to '{OUTPUT_CSV_PATH}'.")

    # 5. Show the first rows of the key columns as a quick sanity check
    preview_columns = ['cleaned_text', 'mapped_category', 'urgency_label']
    print("\n--- Preview of Cleaned Data ---")
    print(df[preview_columns].head())
    print("-----------------------------\n")
else:
    print(f"Error: The file '{INPUT_CSV_PATH}' was not found in the ml_workspace.")
    print("Please make sure the CSV file is in the same directory as this notebook.")

✅ Dataset loaded successfully.
Original shape: 562 rows, 11 columns

✅ Rows with any missing data have been removed.
   - Rows removed: 32
   - Shape after dropping nulls: 530 rows, 11 columns

✅ Cleaning the 'full_legal_text' column...
   - Text cleaning complete.

✅ Cleaned data has been saved to 'IPC_Sections_cleaned.csv'.

--- Preview of Cleaned Data ---
                                        cleaned_text mapped_category  \
0  this act shall be called the indian penal code...    Introduction   
1  every person shall be liable to punishment und...    Introduction   
2  any person liable by any indian law to be trie...    Introduction   
3  the provisions of this code apply also to any ...    Introduction   
4  nothing in this act is intended to repeal vary...    Introduction   

  urgency_label  
0           Low  
1           Low  
2           Low  
3           Low  
4           Low  
-----------------------------



In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib
import os

# --- Configuration ---
# Cleaned dataset that this cell loads with pd.read_csv.
CLEANED_CSV_PATH = 'IPC_Sections_cleaned.csv'
# Directory both classifier pipelines are written to with joblib.dump.
MODEL_SAVE_DIR = '../backend/apps/mlengine/saved_models/'
# joblib.dump does not create missing parent directories. Create the target up
# front so a fresh checkout fails here (if at all) rather than after training.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

print("--- Step 1: Loading Cleaned Data ---")
df = pd.read_csv(CLEANED_CSV_PATH)

# Remove every category that occurs fewer than two times; the split below is
# stratified on this column.
category_counts = df['mapped_category'].value_counts()
rare_categories = [name for name, count in category_counts.items() if count < 2]
if rare_categories:
    keep_row = ~df['mapped_category'].isin(rare_categories)
    df = df[keep_row]
print(f"✅ Loaded and filtered data with {len(df)} rows.")

# --- Feature column (X) and the two label columns (y) ---
X, y_urgency, y_category = df['cleaned_text'], df['urgency_label'], df['mapped_category']

# --- One shared 80/20 split, stratified on category, reused by both models ---
(
    X_train, X_test,
    y_urgency_train, y_urgency_test,
    y_category_train, y_category_test,
) = train_test_split(
    X, y_urgency, y_category,
    test_size=0.2, random_state=42, stratify=y_category,
)
print("\n--- Step 2: Training and Saving Final Classifier Pipelines ---")

# --- Urgency Classifier Pipeline ---
# Vectorizer and model are bundled so they are saved and loaded as one object.
urgency_steps = [
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')),
]
urgency_pipeline = Pipeline(urgency_steps)
urgency_pipeline.fit(X_train, y_urgency_train)
urgency_model_path = os.path.join(MODEL_SAVE_DIR, 'urgency_classifier.joblib')
joblib.dump(urgency_pipeline, urgency_model_path)
print("✅ Urgency Classifier Pipeline saved successfully.")

# --- Category Classifier Pipeline ---
# Same architecture, trained on the category labels instead.
category_steps = [
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')),
]
category_pipeline = Pipeline(category_steps)
category_pipeline.fit(X_train, y_category_train)
category_model_path = os.path.join(MODEL_SAVE_DIR, 'category_classifier.joblib')
joblib.dump(category_pipeline, category_model_path)
print("✅ Category Classifier Pipeline saved successfully.")

print("\n✅✅✅ Mission Accomplished! The final, robust classifiers have been built and saved.")

--- Step 1: Loading Cleaned Data ---
✅ Loaded and filtered data with 526 rows.

--- Step 2: Training and Saving Final Classifier Pipelines ---
✅ Urgency Classifier Pipeline saved successfully.
✅ Category Classifier Pipeline saved successfully.

✅✅✅ Mission Accomplished! The final, robust classifiers have been built and saved.


In [11]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import torch

# --- Configuration ---
# Cleaned dataset that this cell loads with pd.read_csv.
CLEANED_CSV_PATH = 'IPC_Sections_cleaned.csv'
# Directory the FAISS index and the lookup table are written to.
MODEL_SAVE_DIR = '../backend/apps/mlengine/saved_models/'
# Neither faiss.write_index nor DataFrame.to_pickle creates missing parent
# directories, so make sure the target exists before any expensive work runs.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
# Pickled DataFrame saved next to the index at the end of this cell.
DATA_LOOKUP_PATH = os.path.join(MODEL_SAVE_DIR, 'ipc_data_for_index.pkl')

# ==============================================================================
# STEP 1: ENRICH DATA WITH SPECIALIST KNOWLEDGE
# ==============================================================================
print("--- Step 1: Enriching Data ---")

df = pd.read_csv(CLEANED_CSV_PATH)

# Drop every category that occurs fewer than two times in the dataset.
category_counts = df['mapped_category'].value_counts()
is_rare = category_counts < 2
rare_categories = category_counts[is_rare].index.tolist()
if rare_categories:
    in_rare_category = df['mapped_category'].isin(rare_categories)
    df = df[~in_rare_category]
print(f"✅ Loaded and filtered data with {len(df)} rows.")

# --- Everyday vocabulary appended to selected categories further down ---
# Each string is a space-separated list of terms for one category.
theft_keywords = "theft stole stolen snatching robbery pickpocket"
nuisance_keywords = "public annoyance disturbance loud music noise party fighting argument"
fraud_keywords = "fraud cheat scam online bank account money"
harassment_keywords = "harassment stalking threatening messages bother safety intimidate"

# --- Working column for the enriched text ---
# It starts out identical to 'cleaned_text'; keywords are appended afterwards.
df['enriched_text'] = df['cleaned_text']

def add_keywords_to_category(df, category, keywords):
    """Append ``" " + keywords`` to 'enriched_text' for every row of one category.

    Rows are selected where 'mapped_category' equals ``category``. The frame is
    modified in place and the same object is returned, so the call can be
    written as ``df = add_keywords_to_category(df, ...)``.
    """
    in_category = df['mapped_category'].eq(category)
    suffix = " " + keywords
    df.loc[in_category, 'enriched_text'] = df.loc[in_category, 'enriched_text'] + suffix
    return df

# --- Teach the model by enriching categories ---
# Category label -> extra vocabulary appended to that category's rows.
keywords_by_category = {
    'Theft': theft_keywords,
    'Public Nuisance': nuisance_keywords,
    'Fraud': fraud_keywords,
    'Criminal Intimidation': harassment_keywords,
}
for category_name, category_keywords in keywords_by_category.items():
    df = add_keywords_to_category(df, category_name, category_keywords)

print("✅ Data has been enriched with specialist keywords.")

# ==============================================================================
# STEP 2: BUILD THE FINAL SEMANTIC MODEL
# ==============================================================================
print("\n--- Step 2: Building the Final, Enriched Model ---")

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"✅ Using device: {device.upper()}")

model_name = "sentence-transformers/all-MiniLM-L6-v2"
print(f"   - Loading model: '{model_name}'...")
semantic_model = SentenceTransformer(model_name, device=device)

# Embed the enriched text, one vector per row, in DataFrame order.
print("   - Generating new embeddings from enriched text...")
enriched_corpus = df['enriched_text'].tolist()
corpus_embeddings = semantic_model.encode(
    enriched_corpus,
    convert_to_tensor=True,
    show_progress_bar=True,
)
# Move to host memory and cast to float32 before handing the array to FAISS.
corpus_embeddings_np = corpus_embeddings.cpu().numpy().astype('float32')

# Build the flat L2 index over all embeddings.
print("   - Building final FAISS index...")
embedding_dimension = corpus_embeddings_np.shape[1]
final_faiss_index = faiss.IndexFlatL2(embedding_dimension)
final_faiss_index.add(corpus_embeddings_np)

# Save the index, then the lookup table. The lookup index is reset to 0..n-1
# so its row positions line up with the order the vectors were added in.
index_path = os.path.join(MODEL_SAVE_DIR, 'faiss_index.index')
faiss.write_index(final_faiss_index, index_path)
lookup_columns = ['section_number', 'title', 'short_description']
df_for_lookup = df[lookup_columns].reset_index(drop=True)
df_for_lookup.to_pickle(DATA_LOOKUP_PATH)

print("\n✅✅✅ Mission Accomplished! The final, intelligent model has been built and saved.")

--- Step 1: Enriching Data ---
✅ Loaded and filtered data with 526 rows.
✅ Data has been enriched with specialist keywords.

--- Step 2: Building the Final, Enriched Model ---
✅ Using device: CUDA
   - Loading model: 'sentence-transformers/all-MiniLM-L6-v2'...
   - Generating new embeddings from enriched text...


Batches: 100%|██████████| 17/17 [00:00<00:00, 18.30it/s]

   - Building final FAISS index...

✅✅✅ Mission Accomplished! The final, intelligent model has been built and saved.





In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import joblib
import os
import torch

# --- Configuration ---
# Cleaned dataset that this cell loads with pd.read_csv.
CLEANED_CSV_PATH = 'IPC_Sections_cleaned.csv'
# Directory the classifiers, the FAISS index and the lookup table are written to.
MODEL_SAVE_DIR = '../backend/apps/mlengine/saved_models/'
# Pickled lookup DataFrame written at the end of this cell. It is defined here
# so the cell runs on a fresh kernel instead of relying on a value left behind
# by another cell.
DATA_LOOKUP_PATH = os.path.join(MODEL_SAVE_DIR, 'ipc_data_for_index.pkl')
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# ==============================================================================
# PART 1: LOAD AND ENRICH THE DATASET
# ==============================================================================
print("--- Part 1: Loading and Enriching Data ---")
df = pd.read_csv(CLEANED_CSV_PATH)

# Remove every category that occurs fewer than two times in the dataset.
category_counts = df['mapped_category'].value_counts()
rare_categories = [label for label, occurrences in category_counts.items() if occurrences < 2]
if rare_categories:
    belongs_to_rare = df['mapped_category'].isin(rare_categories)
    df = df[~belongs_to_rare]
print(f"✅ Loaded and filtered data with {len(df)} rows.")

# --- Everyday vocabulary appended to selected categories further down ---
# Each string is a space-separated list of terms for one category.
theft_keywords = "theft stole stolen snatching robbery pickpocket chain"
nuisance_keywords = "public annoyance disturbance loud music noise party fighting argument"
fraud_keywords = "fraud cheat scam online bank account money"
harassment_keywords = "harassment stalking threatening messages bother safety intimidate"

# --- Working column for the recommendation model ---
# It starts out identical to 'cleaned_text'; keywords are appended afterwards.
df['enriched_text'] = df['cleaned_text']
def add_keywords_to_category(df, category, keywords):
    """Append a space plus ``keywords`` to 'enriched_text' for rows of ``category``.

    Matching is an exact comparison against 'mapped_category'. ``df`` is
    updated in place and also returned.
    """
    matches = df['mapped_category'] == category
    current_text = df.loc[matches, 'enriched_text']
    df.loc[matches, 'enriched_text'] = current_text + (" " + keywords)
    return df

# Append the specialist vocabulary to each targeted category in turn.
for target_category, extra_terms in [
    ('Theft', theft_keywords),
    ('Public Nuisance', nuisance_keywords),
    ('Fraud', fraud_keywords),
    ('Criminal Intimidation', harassment_keywords),
]:
    df = add_keywords_to_category(df, target_category, extra_terms)
print("✅ Data has been enriched with specialist keywords.")

# ==============================================================================
# PART 2: TRAIN AND SAVE BULLETPROOF CLASSIFIER PIPELINES
# ==============================================================================
print("\n--- Part 2: Training Final Classifier Pipelines ---")
# The classifiers are trained on 'cleaned_text', not on the enriched column.
X = df['cleaned_text']
y_urgency = df['urgency_label']
y_category = df['mapped_category']

# One shared 80/20 split, stratified on category, reused by both models.
(
    X_train, X_test,
    y_urgency_train, y_urgency_test,
    y_category_train, y_category_test,
) = train_test_split(
    X, y_urgency, y_category,
    test_size=0.2, random_state=42, stratify=y_category,
)

# --- Urgency Classifier Pipeline ---
urgency_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')),
])
urgency_pipeline.fit(X_train, y_urgency_train)
urgency_model_path = os.path.join(MODEL_SAVE_DIR, 'urgency_classifier.joblib')
joblib.dump(urgency_pipeline, urgency_model_path)
print("✅ Urgency Classifier Pipeline saved.")

# --- Category Classifier Pipeline ---
category_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')),
])
category_pipeline.fit(X_train, y_category_train)
category_model_path = os.path.join(MODEL_SAVE_DIR, 'category_classifier.joblib')
joblib.dump(category_pipeline, category_model_path)
print("✅ Category Classifier Pipeline saved.")


# ==============================================================================
# PART 3: BUILD THE FINAL RECOMMENDATION MODEL
# ==============================================================================
print("\n--- Part 3: Building Final Recommendation Model ---")
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"✅ Using device: {device.upper()}")

model_name = "sentence-transformers/all-MiniLM-L6-v2"
semantic_model = SentenceTransformer(model_name, device=device)

# Embed the enriched text, one vector per row, in DataFrame order.
print("   - Generating new embeddings from enriched text...")
enriched_corpus = df['enriched_text'].tolist()
corpus_embeddings = semantic_model.encode(
    enriched_corpus,
    convert_to_tensor=True,
    show_progress_bar=True,
)
# Move to host memory and cast to float32 before handing the array to FAISS.
corpus_embeddings_np = corpus_embeddings.cpu().numpy().astype('float32')

print("   - Building final FAISS index...")
embedding_width = corpus_embeddings_np.shape[1]
final_faiss_index = faiss.IndexFlatL2(embedding_width)
final_faiss_index.add(corpus_embeddings_np)

# Save the index, then the lookup table. The lookup index is reset to 0..n-1
# so its row positions line up with the order the vectors were added in.
index_path = os.path.join(MODEL_SAVE_DIR, 'faiss_index.index')
faiss.write_index(final_faiss_index, index_path)
lookup_columns = ['section_number', 'title', 'short_description']
df_for_lookup = df[lookup_columns].reset_index(drop=True)
df_for_lookup.to_pickle(DATA_LOOKUP_PATH)

print("\n✅✅✅ DEFINITIVE MODELS BUILT AND SAVED SUCCESSFULLY! ✅✅✅")

--- Part 1: Loading and Enriching Data ---
✅ Loaded and filtered data with 526 rows.
✅ Data has been enriched with specialist keywords.

--- Part 2: Training Final Classifier Pipelines ---
✅ Urgency Classifier Pipeline saved.
✅ Category Classifier Pipeline saved.

--- Part 3: Building Final Recommendation Model ---
✅ Using device: CUDA
   - Generating new embeddings from enriched text...


Batches: 100%|██████████| 17/17 [00:00<00:00, 24.64it/s]

   - Building final FAISS index...

✅✅✅ DEFINITIVE MODELS BUILT AND SAVED SUCCESSFULLY! ✅✅✅





In [12]:
import pandas as pd
import re

# Columns that are not wanted in the exploration copy.
columns_to_remove = ["urgency_label", "keywords"]

# Load the CSV, then:
#   1. drop every row containing at least one null value (evaluated while the
#      unwanted columns are still present, so nulls in them count too),
#   2. drop the unwanted columns; errors="ignore" tolerates absent ones.
df = (
    pd.read_csv("IPC_Sections_Final.csv")
    .dropna()
    .drop(columns=columns_to_remove, errors="ignore")
)

# 3. Clean special characters from 'full_legal_text' column
def clean_text(text):
    """Delete every character except ASCII letters, digits, whitespace and ``. , ! ?``.

    The value is converted with ``str()`` first, so non-string input is cleaned
    as its string representation. Case and whitespace are left untouched.
    """
    disallowed_characters = re.compile(r"[^a-zA-Z0-9\s.,!?]")
    as_string = str(text)
    return disallowed_characters.sub("", as_string)

# Clean the legal text in place, but only when the column is actually present.
text_column = "full_legal_text"
has_text_column = text_column in df.columns
if has_text_column:
    df[text_column] = df[text_column].apply(clean_text)

# 4. Save the cleaned CSV (index column omitted)
explore_csv_path = "IPC_Sections_Explore.csv"
df.to_csv(explore_csv_path, index=False)

print("✅ Cleaning complete! Saved as IPC_Sections_Explore.csv")


✅ Cleaning complete! Saved as IPC_Sections_Explore.csv


In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib
import os

# --- Configuration ---
# This assumes your notebook is in the 'ml_workspace' directory
# Both paths are relative, so they resolve against the kernel's working directory.
# CSV_PATH       - raw dataset loaded below with pd.read_csv.
# MODEL_SAVE_DIR - directory the trained pipelines are saved to; it is created
#                  here (including parents) if it does not exist yet.
CSV_PATH = 'IPC_Sections_Final.csv'
MODEL_SAVE_DIR = '../backend/apps/mlengine/saved_models/'
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# ==============================================================================
# PART 1: LOAD AND CLEAN THE DATASET
# ==============================================================================
print("--- Part 1: Loading and Cleaning Data ---")
# Load the raw (uncleaned) CSV; cleaning happens later in this cell.
df = pd.read_csv(CSV_PATH)

# --- Define a robust text cleaning function ---
def clean_text(text):
    """Normalise text to lowercase ASCII letters, digits and single spaces.

    Literal backslash-n sequences and carriage returns become spaces, every
    other character that is not an ASCII letter, digit or whitespace is
    removed, whitespace runs collapse to one space, and the result is trimmed
    and lowercased. Non-string input (None, NaN, numbers) returns ``""``.
    """
    if not isinstance(text, str):
        return ""

    # '\\n' is the two-character sequence backslash + n, not a real newline.
    for line_break_marker in ('\\n', '\r'):
        text = text.replace(line_break_marker, ' ')

    letters_digits_spaces = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_digits_spaces)
    return single_spaced.strip().lower()

# Apply cleaning and drop rows with missing essential data
df['cleaned_text'] = df['full_legal_text'].apply(clean_text)
df.dropna(subset=['cleaned_text', 'mapped_category', 'urgency_label'], inplace=True)
# clean_text() returns "" (never NaN) for missing or non-string input, so the
# dropna above cannot remove rows that have no usable text. Filter them out
# explicitly so the classifiers are not trained on empty documents.
# .copy() gives later steps an independent frame instead of a filtered view.
df = df[df['cleaned_text'] != ''].copy()

# Filter out rare categories for model stability: the split below is stratified
# on category, so categories occurring fewer than two times are removed.
category_counts = df['mapped_category'].value_counts()
rare_categories = category_counts[category_counts < 2].index.tolist()
if rare_categories:
    df = df[~df['mapped_category'].isin(rare_categories)]
print(f"✅ Loaded and cleaned data with {len(df)} rows.")

# ==============================================================================
# PART 2: TRAIN AND SAVE BULLETPROOF CLASSIFIER PIPELINES
# ==============================================================================
print("\n--- Part 2: Training and Saving Classifier Pipelines ---")
X = df['cleaned_text']
y_urgency = df['urgency_label']
y_category = df['mapped_category']

# Split data for training and testing: one 80/20 split, stratified on category,
# shared by both classifiers.
X_train, X_test, y_urgency_train, y_urgency_test, y_category_train, y_category_test = train_test_split(
    X, y_urgency, y_category, test_size=0.2, random_state=42, stratify=y_category
)
print("✅ Data split into training and testing sets.")

# --- Urgency Classifier Pipeline ---
print("\n--- Training Urgency Classifier ---")
# This pipeline bundles the vectorizer and the model together. It does NOT
# include clean_text(), so it expects text that has already been cleaned.
urgency_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])
urgency_pipeline.fit(X_train, y_urgency_train)
# Save the entire pipeline object
joblib.dump(urgency_pipeline, os.path.join(MODEL_SAVE_DIR, 'urgency_classifier.joblib'))
print("✅ Urgency Classifier Pipeline saved successfully.")

# Evaluate the model on the held-out 20%.
# zero_division=0 reports 0 for metrics that are undefined (a class that is
# never predicted) - the same value the default yields - without emitting an
# UndefinedMetricWarning for each of them.
y_urgency_pred = urgency_pipeline.predict(X_test)
print("\nUrgency Model Performance:")
print(classification_report(y_urgency_test, y_urgency_pred, zero_division=0))


# --- Category Classifier Pipeline ---
print("\n--- Training Category Classifier ---")
category_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])
category_pipeline.fit(X_train, y_category_train)
# Save the entire pipeline object
joblib.dump(category_pipeline, os.path.join(MODEL_SAVE_DIR, 'category_classifier.joblib'))
print("✅ Category Classifier Pipeline saved successfully.")

# Evaluate the model on the held-out 20% (see the zero_division note above).
y_category_pred = category_pipeline.predict(X_test)
print("\nCategory Model Performance:")
print(classification_report(y_category_test, y_category_pred, zero_division=0))


print("\n✅✅✅ DEFINITIVE CLASSIFIERS BUILT AND SAVED SUCCESSFULLY! ✅✅✅")

--- Part 1: Loading and Cleaning Data ---
✅ Loaded and cleaned data with 558 rows.

--- Part 2: Training and Saving Classifier Pipelines ---
✅ Data split into training and testing sets.

--- Training Urgency Classifier ---
✅ Urgency Classifier Pipeline saved successfully.

Urgency Model Performance:
              precision    recall  f1-score   support

        High       0.76      0.72      0.74        18
         Low       0.93      0.89      0.91        83
      Medium       0.60      0.82      0.69        11

    accuracy                           0.86       112
   macro avg       0.76      0.81      0.78       112
weighted avg       0.87      0.86      0.86       112


--- Training Category Classifier ---
✅ Category Classifier Pipeline saved successfully.

Category Model Performance:
                                                      precision    recall  f1-score   support

                                            Abetment       0.75      1.00      0.86         3
Contempts o

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib
import os

# --- Configuration ---
# Both paths are relative, so they resolve against the kernel's working directory.
# CSV_PATH       - raw dataset loaded below with pd.read_csv.
# MODEL_SAVE_DIR - directory the trained pipelines are saved to; it is created
#                  here (including parents) if it does not exist yet.
CSV_PATH = 'IPC_Sections_Final.csv'
MODEL_SAVE_DIR = '../backend/apps/mlengine/saved_models/'
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# ==============================================================================
# PART 1: LOAD AND CLEAN THE DATASET
# ==============================================================================
print("--- Part 1: Loading and Cleaning Data ---")
# Load the raw (uncleaned) CSV; cleaning happens later in this cell.
df = pd.read_csv(CSV_PATH)

# --- Define the single, authoritative text cleaning function ---
def clean_text(text):
    """Return ``text`` reduced to lowercase ASCII letters, digits and single spaces.

    Steps, in order: literal backslash-n sequences and carriage returns are
    replaced by spaces; characters other than ASCII letters, digits and
    whitespace are deleted; whitespace runs collapse to one space; the ends
    are trimmed; the result is lowercased. Non-string input gives ``""``.
    """
    if isinstance(text, str):
        # '\\n' is backslash followed by 'n', i.e. an escaped newline in the data.
        flattened = text.replace('\\n', ' ')
        flattened = flattened.replace('\r', ' ')
        stripped_of_symbols = re.sub(r'[^a-zA-Z0-9\s]', '', flattened)
        normalised = re.sub(r'\s+', ' ', stripped_of_symbols).strip()
        return normalised.lower()
    return ""

# Apply cleaning and prepare the DataFrame
df['cleaned_text'] = df['full_legal_text'].apply(clean_text)
df.dropna(subset=['cleaned_text', 'mapped_category', 'urgency_label'], inplace=True)
# clean_text() returns "" (never NaN) for missing or non-string input, so the
# dropna above cannot remove rows that have no usable text. Filter them out
# explicitly so the classifiers are not trained on empty documents.
# .copy() gives later steps an independent frame instead of a filtered view.
df = df[df['cleaned_text'] != ''].copy()

# Filter out rare categories for model stability: the split below is stratified
# on category, so categories occurring fewer than two times are removed.
category_counts = df['mapped_category'].value_counts()
rare_categories = category_counts[category_counts < 2].index.tolist()
if rare_categories:
    df = df[~df['mapped_category'].isin(rare_categories)]
print(f"✅ Loaded and cleaned data with {len(df)} rows.")

# ==============================================================================
# PART 2: TRAIN AND SAVE THE DEFINITIVE CLASSIFIER PIPELINES
# ==============================================================================
print("\n--- Part 2: Training Definitive Classifier Pipelines ---")
X = df['cleaned_text']
y_urgency = df['urgency_label']
y_category = df['mapped_category']

# Split data for training and testing: one 80/20 split, stratified on category,
# shared by both classifiers.
X_train, X_test, y_urgency_train, y_urgency_test, y_category_train, y_category_test = train_test_split(
    X, y_urgency, y_category, test_size=0.2, random_state=42, stratify=y_category
)
print("✅ Data split into training and testing sets.")

# --- Urgency Classifier Pipeline ---
# The pipeline holds only the TF-IDF vectorizer and the model; clean_text() is
# not part of it, so it expects text that has already been cleaned.
print("\n--- Training Urgency Classifier ---")
urgency_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])
urgency_pipeline.fit(X_train, y_urgency_train)
joblib.dump(urgency_pipeline, os.path.join(MODEL_SAVE_DIR, 'urgency_classifier.joblib'))
print("✅ Urgency Classifier Pipeline saved.")

# --- Category Classifier Pipeline ---
print("\n--- Training Category Classifier ---")
category_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])
category_pipeline.fit(X_train, y_category_train)
joblib.dump(category_pipeline, os.path.join(MODEL_SAVE_DIR, 'category_classifier.joblib'))
print("✅ Category Classifier Pipeline saved.")


# ==============================================================================
# PART 3: VERIFY THE MODELS WITH A REAL-WORLD TEST
# ==============================================================================
print("\n--- Part 3: Verifying the Trained Classifiers ---")

test_complaints = [
    "While I was walking in the market, someone on a motorcycle snatched my gold chain and sped away.",
    "I was tricked into installing a fake payment app and they withdrew 50,000 rupees from my account.",
    "A person from my old office has been sending me threatening messages on social media and has shown up outside my house.",
    "Someone broke the side mirror of my car while it was parked overnight.",
    "My neighbors are having a loud party with music blaring at 2 AM and it's a major disturbance."
]

# NOTE: the pipelines do NOT clean their input. Each raw complaint therefore
# goes through the same clean_text() that produced the training column before
# it is passed to predict(); otherwise tokens such as "50,000" or "it's" are
# split differently from the training data.
# NOTE(review): any other consumer of the saved .joblib files presumably has
# to apply the same cleaning step - confirm on the backend side.
print("\n--- Prediction Results ---")
for i, complaint in enumerate(test_complaints):
    cleaned_complaint = clean_text(complaint)
    predicted_urgency = urgency_pipeline.predict([cleaned_complaint])[0]
    predicted_category = category_pipeline.predict([cleaned_complaint])[0]

    # The raw complaint is what gets echoed, so the output stays readable.
    print(f"\nComplaint #{i+1}: '{complaint[:70]}...'")
    print(f"   - Predicted Urgency:  {predicted_urgency}")
    print(f"   - Predicted Category: {predicted_category}")
print("\n--------------------------")

print("\n✅✅✅ DEFINITIVE CLASSIFIERS BUILT, SAVED, AND VERIFIED! ✅✅✅")

--- Part 1: Loading and Cleaning Data ---
✅ Loaded and cleaned data with 558 rows.

--- Part 2: Training Definitive Classifier Pipelines ---
✅ Data split into training and testing sets.

--- Training Urgency Classifier ---
✅ Urgency Classifier Pipeline saved.

--- Training Category Classifier ---
✅ Category Classifier Pipeline saved.

--- Part 3: Verifying the Trained Classifiers ---

--- Prediction Results ---

Complaint #1: 'While I was walking in the market, someone on a motorcycle snatched my...'
   - Predicted Urgency:  Low
   - Predicted Category: General Explanations

Complaint #2: 'I was tricked into installing a fake payment app and they withdrew 50,...'
   - Predicted Urgency:  Low
   - Predicted Category: General Explanations

Complaint #3: 'A person from my old office has been sending me threatening messages o...'
   - Predicted Urgency:  Low
   - Predicted Category: General Explanations

Complaint #4: 'Someone broke the side mirror of my car while it was parked overnight..