In [None]:
!pip install fasttext
!pip install transformers==4.36.2 tokenizers==0.15.2

In [None]:
!git clone https://github.com/AI4Bharat/IndicLID.git

In [None]:
%cd "/content/IndicLID/Inference"

In [None]:
%mkdir models
%cd "/content/IndicLID/Inference/models"

In [None]:
!wget https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/indiclid-bert.zip
!wget https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/indiclid-ftn.zip
!wget https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/indiclid-ftr.zip

In [None]:
!unzip indiclid-bert.zip
!unzip indiclid-ftn.zip
!unzip indiclid-ftr.zip

In [None]:
# %cd "/content/IndicLID/"
%cd "/content/IndicLID/Inference"

In [None]:
import torch
from transformers.models.bert.modeling_bert import (BertForSequenceClassification, BertModel, BertEmbeddings, BertEncoder,
BertLayer, BertAttention, BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertPooler)
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules import dropout, container, linear, activation
from transformers.activations import GELUActivation
from transformers.models.bert.configuration_bert import BertConfig


torch.serialization.add_safe_globals([BertForSequenceClassification, BertModel, BertEmbeddings,
Embedding, LayerNorm, dropout.Dropout, BertEncoder, container.ModuleList, BertLayer, BertAttention,
BertSelfAttention, linear.Linear, BertSelfOutput, BertIntermediate, GELUActivation, torch._C._nn.gelu,
BertOutput, BertConfig, BertPooler, activation.Tanh])

from ai4bharat.IndicLID import IndicLID

IndicLID_model = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)

In [None]:
import pandas as pd
roman_test = pd.read_csv("/content/roman_test.csv")

In [None]:
batch_size = 32
test_samples = list(roman_test["processed"])
import time
start = time.time()
outputs = IndicLID_model.batch_predict(test_samples, batch_size)
end = time.time()

In [None]:
suffixes = ["_Latn", "_Deva", "_Arab"]
pred_label = []
for _, label, _, _ in outputs:
    for suf in suffixes:
        label = label.replace(suf, "")
    pred_label.append(label)

In [None]:
code_to_lang = {
    'asm': 'Assamese',
    'ben': 'Bangla',
    'brx': 'Bodo',
    'guj': 'Gujarati',
    'hin': 'Hindi',
    'kan': 'Kannada',
    'kas': 'Kashmiri',
    'kok': 'Konkani',
    'mai': 'Maithili',
    'mal': 'Malayalam',
    'mni': 'Manipuri',
    'mar': 'Marathi',
    'nep': 'Nepali',
    'ori': 'Oriya',
    'pan': 'Punjabi',
    'san': 'Sanskrit',
    'snd': 'Sindhi',
    'tam': 'Tamil',
    'tel': 'Telugu',
    'urd': 'Urdu',
    'eng': 'English',
    'other': 'Other'
}

In [None]:
pred = []
for label in pred_label:
  label = code_to_lang.get(label)
  pred.append(label)

In [None]:
result = pd.DataFrame()
result["text"] = list(roman_test["processed"])
result["true"] = list(roman_test["label"])
result["pred"] = pred

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_true = result['true'].tolist()
preds_final = result['pred'].tolist()

accuracy = accuracy_score(y_true, preds_final)
precision = precision_score(y_true, preds_final, average='macro', zero_division=0)
recall = recall_score(y_true, preds_final, average='macro', zero_division=0)
f1 = f1_score(y_true, preds_final, average='macro', zero_division=0)

print("Roman Accuracy:", accuracy*100)
print("Roman Precision (macro):", precision*100)
print("Roman Recall (macro):", recall*100)
print("Roman F1-score (macro):", f1*100)
print(f"Throughput: {len(roman_test)/(end - start)} sentence/seconds")

In [None]:
print("============= After filtering out those texts which are predicted as English and Others ==============")

result = result[~result["pred"].isin(["English", "Other"])]
y_true = result['true'].tolist()
preds_final = result['pred'].tolist()

accuracy = accuracy_score(y_true, preds_final)
precision = precision_score(y_true, preds_final, average='macro', zero_division=0)
recall = recall_score(y_true, preds_final, average='macro', zero_division=0)
f1 = f1_score(y_true, preds_final, average='macro', zero_division=0)

print("Roman Accuracy:", accuracy*100)
print("Roman Precision (macro):", precision*100)
print("Roman Recall (macro):", recall*100)
print("Roman F1-score (macro):", f1*100)
print(f"Throughput: {len(roman_test)/(end - start)} sentence/seconds")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(df_results, title):
    # Get unique labels from both true and pred columns
    labels = sorted(set(df_results['true'].unique()) | set(df_results['pred'].unique()))

    cm = confusion_matrix(df_results['true'], df_results['pred'], labels=labels)

    plt.figure(figsize=(15, 12))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(title)
    plt.show()

plot_confusion_matrix(result, "Confusion Matrix")

In [None]:
cm_raw = confusion_matrix(y_true, preds_final, labels=np.unique(y_true))

cm_normalized = cm_raw.astype('float') / cm_raw.sum(axis=1)[:, np.newaxis]
cm_df = pd.DataFrame(cm_normalized, index=np.unique(y_true), columns=np.unique(y_true))

plt.figure(figsize=(8, 6))
sns.set(font_scale=0.8)

# Create the heatmap
sns.heatmap(
    cm_df,
    cmap="Reds",
    cbar_kws={'label': 'Proportion Predicted Correctly (Normalized)'}
)

# Customization
plt.title('Confusion Matrix(IndicLID on roman testset)', fontsize=16)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import re

token_limits = range(1, 31)
accuracies = []

true_labels = roman_test["label"].tolist()

for n in token_limits:
    truncated_samples = [" ".join(text.split()[:n]) for text in roman_test["processed"]]

    outputs = IndicLID_model.batch_predict(truncated_samples, batch_size=32)

    suffixes = ["_Latn", "_Deva", "_Arab"]
    pred_label = []
    for _, label, _, _ in outputs:
        for suf in suffixes:
            label = label.replace(suf, "")
        pred_label.append(label)

    preds = [code_to_lang.get(label, label) for label in pred_label]

    result_temp = pd.DataFrame({
        "true": true_labels,
        "pred": preds
    })
    result_filtered = result_temp[~result_temp["pred"].isin(["English", "Other"])].reset_index(drop=True)

    y_true = result_filtered["true"].tolist()
    y_pred = result_filtered["pred"].tolist()
    acc = accuracy_score(y_true, y_pred) * 100
    accuracies.append(acc)

    print(f"n={n} → accuracy={acc:.4f}")

plt.figure(figsize=(8, 5))
plt.plot(token_limits, accuracies, marker='o', linewidth=2)
plt.title("Accuracy vs Number of Tokens")
plt.xlabel("Number of tokens (n)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()
