# MuRIL-Based Intent Classifier for UPSIDA Complaints

This notebook trains a multilingual intent classifier using MuRIL to handle English, Hindi, Hinglish, and code-mixed complaints. It includes evaluation and a custom inference function matching the required JSON output format.

In [1]:
!pip install -q transformers datasets scikit-learn pandas

In [2]:
from google.colab import files
uploaded = files.upload()

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load CSV
df = pd.read_csv("upsida_multilingual_complaints_500.csv")

Saving upsida_multilingual_complaints_500.csv to upsida_multilingual_complaints_500 (2).csv


In [3]:
import pandas as pd
import re

# Load your CSV (after uploading to Colab or Kaggle)
df = pd.read_csv("upsida_multilingual_complaints_500.csv")

# Clean the text using traditional NLP
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\u0900-\u097F\s]', '', text)  # remove special chars except Devanagari
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["complaint_text"] = df["complaint_text"].apply(clean_text)

# Label encoding
label2id = {label: idx for idx, label in enumerate(df["intent_label"].unique())}
id2label = {v: k for k, v in label2id.items()}
df["label"] = df["intent_label"].map(label2id)

In [4]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer

# No stratify to test real-world generalization
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df[["complaint_text", "label"]])
test_dataset = Dataset.from_pandas(test_df[["complaint_text", "label"]])

# Load tokenizer
model_name = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize
def tokenize(batch):
    return tokenizer(batch["complaint_text"], padding=True, truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score
import numpy as np

# Load model with label mappings
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Training config (targeting high confidence)
training_args = TrainingArguments(
    output_dir="./muril_nlp_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    report_to="none"
)

# Metrics
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.603183,0.19


In [None]:
import torch

def detect_language(text):
    devanagari = sum('\u0900' <= ch <= '\u097F' for ch in text)
    latin = sum('a' <= ch.lower() <= 'z' for ch in text)
    if devanagari > 0 and latin > 0:
        return "Hinglish"
    elif devanagari > 0:
        return "Hindi"
    elif latin > 0:
        return "English"
    return "Unknown"

def predict_intent(text):
    cleaned = clean_text(text)
    inputs = tokenizer(cleaned, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)

        # Print debug info for trainer if needed
        for i, score in enumerate(probs[0]):
            print(f"{id2label[i]}: {round(score.item(), 4)}")

        conf, pred = torch.max(probs, dim=1)

    return {
        "text_input": text,
        "language_detected": detect_language(text),
        "predicted_intent": id2label[pred.item()],
        "confidence_score": round(conf.item(), 2)
    }

In [None]:
print(predict_intent("Hamare factory mein power cut hai"))
print(predict_intent("पानी की पाइपलाइन फटी हुई है"))
print(predict_intent("Road par potholes hain, gadi chalana mushkil hai"))

In [18]:
df['label'].head()

Unnamed: 0,label
0,0
1,1
2,1
3,2
4,3


In [None]:
# 2. Split dataset
from sklearn.model_selection import train_test_split
from datasets import Dataset

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df[['complaint_text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['complaint_text', 'label']])

In [None]:
# 3. Load MuRIL model and tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "google/muril-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

In [None]:
# 4. Tokenize the datasets
def tokenize(batch):
    return tokenizer(batch["complaint_text"], padding=True, truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# 5. Training with Hugging Face Trainer
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
import numpy as np

# Re-initializing TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir="./muril_intent_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.0,
    load_best_model_at_end=True,
    logging_dir="./logs",
    push_to_hub=False
)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

trainer = Trainer(
    model=model, # Ensure model is also loaded/available
    args=training_args,
    train_dataset=train_dataset, # Ensure train_dataset is tokenized and available
    eval_dataset=test_dataset,   # Ensure test_dataset is tokenized and available
    compute_metrics=compute_metrics
)

# Train again
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.58702,0.91
2,No log,1.4875,1.0
3,No log,1.42678,1.0
4,No log,1.392563,1.0
5,No log,1.381433,1.0


TrainOutput(global_step=250, training_loss=1.489930419921875, metrics={'train_runtime': 1462.1407, 'train_samples_per_second': 1.368, 'train_steps_per_second': 0.171, 'total_flos': 18500494392000.0, 'train_loss': 1.489930419921875, 'epoch': 5.0})

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)
from sklearn.metrics import accuracy_score
print("Overall Accuracy:", round(accuracy_score(y_true, y_pred) * 100, 2), "%")
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=id2label.values()))
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

Overall Accuracy: 100.0 %

Classification Report:
                                   precision    recall  f1-score   support

    Infrastructure_Road_Condition       1.00      1.00      1.00        25
         Waste_Management_Concern       1.00      1.00      1.00        18
             Land_Allotment_Query       1.00      1.00      1.00        23
Infrastructure_Water_Supply_Issue       1.00      1.00      1.00        19
      Infrastructure_Power_Outage       1.00      1.00      1.00        15

                         accuracy                           1.00       100
                        macro avg       1.00      1.00      1.00       100
                     weighted avg       1.00      1.00      1.00       100


Confusion Matrix:
[[25  0  0  0  0]
 [ 0 18  0  0  0]
 [ 0  0 23  0  0]
 [ 0  0  0 19  0]
 [ 0  0  0  0 15]]


In [None]:
# 7. Custom inference function
import torch

def predict_intent(text):
    """
    Predicts the intent of a given complaint text using the trained model.

    Args:
        text (str): The complaint text.

    Returns:
        str: The predicted intent label.
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Move inputs to the same device as the model
    device = model.device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted label
    predictions = torch.argmax(outputs.logits, dim=1).item()
    predicted_label = id2label[predictions]

    return predicted_label

In [None]:
print(predict_intent("Bijli nahi hai since morning"))
print(predict_intent("Kab milega plot ka status?"))
print(predict_intent("Gutter ka paani road par aagaya hai"))
print(predict_intent("Road bohot kharab hai factory ke paas"))
print(predict_intent("Water pipeline damage ho gaya hai"))
print(predict_intent("factory mein बिजली नहीं aa rahi hai"))
print(predict_intent("safai karne wala nahi aata colony mein"))
print(predict_intent("There is no power in my area"))
print(predict_intent("हमारे यहाँ 2 दिन से पानी नहीं आ रहा है"))

Infrastructure_Power_Outage
Land_Allotment_Query
Infrastructure_Water_Supply_Issue
Infrastructure_Road_Condition
Infrastructure_Water_Supply_Issue
Infrastructure_Power_Outage
Waste_Management_Concern
Infrastructure_Power_Outage
Infrastructure_Water_Supply_Issue


In [None]:
print(predict_intent("I have been facing continuous issues with the water supply in our industrial area There has been no water coming through the pipeline for the past three days, and our production has completely stopped because of this. We have already raised the issue multiple times, but no action has been taken. Please resolve the water supply problem on priority as it is affecting business operations."))

Infrastructure_Water_Supply_Issue


In [None]:
import torch
import re

# Clean + detect language helpers
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\u0900-\u097F\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def detect_language(text):
    devanagari = sum('\u0900' <= ch <= '\u097F' for ch in text)
    latin = sum('a' <= ch.lower() <= 'z' for ch in text)
    if devanagari > 0 and latin > 0:
        return "Hinglish"
    elif devanagari > 0:
        return "Hindi"
    elif latin > 0:
        return "English"
    return "Unknown"

# Final JSON output predictor
def predict_intent(text):
    cleaned = clean_text(text)
    inputs = tokenizer(cleaned, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        conf, pred = torch.max(probs, dim=1)
    return {
        "text_input": text,
        "language_detected": detect_language(text),
        "predicted_intent": id2label[pred.item()],
        "confidence_score": round(conf.item(), 2)
    }

# ✅ Example
print(predict_intent("Hamare factory mein power cut hai"))

{'text_input': 'Hamare factory mein power cut hai', 'language_detected': 'English', 'predicted_intent': 'Infrastructure_Power_Outage', 'confidence_score': 0.25}
