In [None]:
import kagglehub
import numpy as np
import pandas as pd
import os
import torch
import torchaudio
import torchaudio.transforms as T
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import WhisperProcessor, WhisperModel
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Download the "sanskarhim/data-audio" dataset through kagglehub and keep the
# value it returns in `path`; a later cell lists `path + '/preprocessed_dataset'`.
path = kagglehub.dataset_download("sanskarhim/data-audio")
print(path)

In [None]:
import os

# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [None]:
# Show the entries of the dataset's "preprocessed_dataset" folder, one per
# line, followed by how many entries there are.
audio_files = os.listdir(os.path.join(path, 'preprocessed_dataset'))
for entry in audio_files:
    print(entry)
print(len(audio_files))

In [None]:
# Language name -> integer class id; the id is the language's position in the
# tuple below.
label_map = {
    language: class_id
    for class_id, language in enumerate((
        "bengali", "gujarati", "hindi", "kannada", "malayalam",
        "marathi", "urdu", "tamil", "telugu",
    ))
}
# Reverse lookup: integer class id -> language name.
id2label = dict(zip(label_map.values(), label_map.keys()))

In [None]:
def get_filepaths_and_labels(data_dir):
    """Collect audio file paths and integer language labels from ``data_dir``.

    An entry is kept when its name ends in ``.mp3`` or ``.wav``
    (case-insensitive) and the text before the first underscore, lower-cased,
    is a key of the module-level ``label_map``. Every other entry is skipped
    silently. Sub-directories are not descended into.

    Args:
        data_dir: Directory whose direct entries are scanned.

    Returns:
        A ``(filepaths, labels)`` tuple of two parallel lists: the joined
        paths and the matching ``label_map`` values, ordered by file name.
    """
    filepaths, labels = [], []
    # os.listdir() returns entries in arbitrary order; sorting keeps any
    # seeded split built from this list identical across runs and machines.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.lower().endswith((".mp3", ".wav")):
            continue
        lang_prefix = filename.split("_")[0].lower()
        if lang_prefix in label_map:
            filepaths.append(os.path.join(data_dir, filename))
            labels.append(label_map[lang_prefix])
    return filepaths, labels

In [None]:
class LIDDataset(Dataset):
    """Dataset that turns audio files into processor features plus a label.

    Each item is loaded with ``torchaudio.load``, resampled to 16 kHz when the
    file's own sample rate differs, and passed (first channel only) to
    ``processor``.

    Args:
        filepaths: Sequence of audio file paths.
        labels: Sequence of labels, parallel to ``filepaths``.
        processor: Callable invoked as
            ``processor(waveform, sampling_rate=16000, return_tensors="pt")``
            whose result exposes ``input_features`` (``main`` passes a
            ``WhisperProcessor``).
    """

    def __init__(self, filepaths, labels, processor):
        self.filepaths = filepaths
        self.labels = labels
        self.processor = processor
        # Kept as a public attribute for backward compatibility (48 kHz sources).
        self.resampler = T.Resample(orig_freq=48000, new_freq=16000)
        # One resampler per source rate actually met, built on first use, so
        # files that are not 48 kHz are converted with the correct ratio.
        self._resamplers = {48000: self.resampler}

    def __len__(self):
        """Return the number of labelled items."""
        return len(self.labels)

    def _resample_to_16k(self, waveform, sr):
        """Resample ``waveform`` from ``sr`` Hz to 16 kHz, caching the transform."""
        resampler = self._resamplers.get(sr)
        if resampler is None:
            resampler = T.Resample(orig_freq=sr, new_freq=16000)
            self._resamplers[sr] = resampler
        return resampler(waveform)

    def __getitem__(self, idx):
        """Return ``(input_features, label)`` for the item at ``idx``.

        ``input_features`` is the processor output with its leading dimension
        squeezed out (presumably the batch axis -- TODO confirm for other
        processors).
        """
        waveform, sr = torchaudio.load(self.filepaths[idx])
        if sr != 16000:
            waveform = self._resample_to_16k(waveform, sr)
        # Only channel 0 is used; any further channels are ignored.
        inputs = self.processor(waveform[0], sampling_rate=16000, return_tensors="pt")
        return inputs.input_features.squeeze(0), self.labels[idx]

In [None]:
class WhisperClassifier(nn.Module):
    """Classification head on top of a Whisper encoder.

    The encoder's ``last_hidden_state`` is averaged over its second axis and
    fed to a single linear layer that produces ``num_classes`` logits.

    Args:
        whisper_model: Object exposing ``encoder`` (called as
            ``encoder(input_features=...)``) and ``config.d_model``.
        num_classes: Number of output logits.
        freeze_encoder: When ``True`` the encoder runs under
            ``torch.no_grad()`` so only the linear head receives gradients.
            The default ``False`` lets gradients reach trainable encoder
            parameters (e.g. LoRA adapters); an unconditional ``no_grad``
            would make such adapters impossible to train.
    """

    def __init__(self, whisper_model, num_classes, freeze_encoder=False):
        super().__init__()
        self.encoder = whisper_model.encoder
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Linear(whisper_model.config.d_model, num_classes)
        # Plain attribute (not a parameter/buffer), so state_dict keys are unchanged.
        self.freeze_encoder = freeze_encoder

    def forward(self, input_features):
        """Return class logits for a batch of ``input_features``."""
        if self.freeze_encoder:
            with torch.no_grad():
                encoder_outputs = self.encoder(input_features=input_features)
        else:
            encoder_outputs = self.encoder(input_features=input_features)
        # presumably (batch, frames, d_model) -- TODO confirm against the encoder
        x = encoder_outputs.last_hidden_state
        # AdaptiveAvgPool1d pools the last axis, so move axis 1 there first.
        x = x.permute(0, 2, 1)
        x = self.pool(x).squeeze(2)
        return self.classifier(x)

In [None]:
def train(model, dataloader, optimizer, criterion, device):
    """Run one training pass over ``dataloader``.

    Args:
        model: Module called as ``model(features)`` that returns logits.
        dataloader: Iterable of ``(features, labels)`` batches.
        optimizer: Optimizer already bound to the model's parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the features and labels are moved to.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses and the
        fraction of correctly classified samples. ``(0.0, 0.0)`` when the
        dataloader yields no batch.
    """
    model.train()
    total_loss, correct, total = 0.0, 0, 0
    num_batches = 0
    for features, labels in dataloader:
        features = features.to(device)
        # as_tensor accepts both tensors and plain sequences without the
        # copy-construct warning torch.tensor() raises for tensor input.
        labels = torch.as_tensor(labels, dtype=torch.long, device=device)

        # Clear gradients *before* backward so leftovers from earlier calls
        # cannot leak into the first update of this pass.
        optimizer.zero_grad()
        logits = model(features)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        num_batches += 1

    if num_batches == 0:
        return 0.0, 0.0
    avg_loss = total_loss / num_batches
    accuracy = correct / total if total else 0.0
    return avg_loss, accuracy

In [None]:
def evaluate(model, dataloader, device):
    """Predict every batch of ``dataloader`` without tracking gradients.

    The model is switched to eval mode and left in that mode.

    Args:
        model: Module called as ``model(features)`` that returns logits.
        dataloader: Iterable of ``(features, labels)`` batches.
        device: Device the features are moved to.

    Returns:
        ``(preds, targets, accuracy)``: two parallel lists of plain Python
        ints (argmax class and true label per sample) and the fraction of
        matches. Accuracy is ``0.0`` when the dataloader yields no sample.
    """
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            logits = model(features)
            batch_preds = torch.argmax(logits, dim=1).cpu()
            # as_tensor avoids the copy-construct warning for tensor labels.
            batch_targets = torch.as_tensor(labels, dtype=torch.long).cpu()
            # tolist() yields plain ints instead of numpy scalars / 0-d tensors.
            preds.extend(batch_preds.tolist())
            targets.extend(batch_targets.tolist())

    total = len(targets)
    correct = sum(1 for p, t in zip(preds, targets) if p == t)
    accuracy = correct / total if total else 0.0
    return preds, targets, accuracy

In [None]:
def plot_confusion_matrix(y_true, y_pred):
    """Draw an annotated confusion-matrix heatmap for integer class ids.

    The matrix always has one row/column per id in the module-level
    ``id2label`` (in ascending id order), so the tick labels line up with the
    cells even when some class is missing from ``y_true``/``y_pred``. Ids that
    occur in the data but not in ``id2label`` are appended and shown as
    ``"class <id>"``.

    Args:
        y_true: Iterable of true class ids.
        y_pred: Iterable of predicted class ids, parallel to ``y_true``.
    """
    known_ids = sorted(id2label)
    observed_ids = {int(v) for v in y_true} | {int(v) for v in y_pred}
    extra_ids = sorted(observed_ids - set(known_ids))
    class_ids = known_ids + extra_ids
    class_names = [id2label.get(i, f"class {i}") for i in class_ids]

    # Without an explicit `labels` list the matrix only covers ids present in
    # the data, which would not match a fixed set of tick labels.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()

In [None]:
def plot_accuracy(train_acc, val_acc):
    """Plot training and validation accuracy against the epoch number.

    Args:
        train_acc: Per-epoch training accuracies; its length sets the x axis
            (epochs are numbered from 1).
        val_acc: Per-epoch validation accuracies, parallel to ``train_acc``.
    """
    epoch_numbers = range(1, len(train_acc) + 1)
    fig, ax = plt.subplots()
    curves = (
        (train_acc, 'Train Accuracy'),
        (val_acc, 'Validation Accuracy'),
    )
    for values, legend_name in curves:
        ax.plot(epoch_numbers, values, label=legend_name)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.set_title('Train vs Validation Accuracy')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
def save_model(model, processor, model_path="whisper_classifier.pt", processor_path="processor"):
    """Persist the model weights and the processor, then report both locations.

    Args:
        model: Object whose ``state_dict()`` is written with ``torch.save``.
        processor: Object whose ``save_pretrained`` is called with
            ``processor_path``.
        model_path: Destination file for the weights.
        processor_path: Destination passed to ``processor.save_pretrained``.
    """
    weights = model.state_dict()
    torch.save(weights, model_path)
    processor.save_pretrained(processor_path)
    confirmations = (
        f"Model saved to {model_path}",
        f"Processor saved to {processor_path}",
    )
    for message in confirmations:
        print(message)

In [None]:
def load_model(model_path, processor_path, base_model_name="openai/whisper-base", num_classes=10, device="cpu"):
    """Rebuild the LoRA-wrapped classifier and load saved weights into it.

    Args:
        model_path: File holding the classifier ``state_dict``.
        processor_path: Location given to ``WhisperProcessor.from_pretrained``.
        base_model_name: Name given to ``WhisperModel.from_pretrained``.
        num_classes: Size of the classifier's output layer; it has to match
            the value used when the weights were saved.
            NOTE(review): ``main`` builds the model with 10 classes while
            ``label_map`` lists 9 languages -- confirm which is intended.
        device: Target for ``map_location`` and ``model.to``.

    Returns:
        ``(model, processor)`` with the model on ``device`` in eval mode.
    """
    processor = WhisperProcessor.from_pretrained(processor_path)
    backbone = WhisperModel.from_pretrained(base_model_name)

    # Same adapter layout as at training time, but flagged for inference.
    lora_settings = dict(
        task_type=TaskType.FEATURE_EXTRACTION,
        inference_mode=True,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    )
    backbone = get_peft_model(backbone, LoraConfig(**lora_settings))

    classifier = WhisperClassifier(backbone, num_classes=num_classes)
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source.
    saved_weights = torch.load(model_path, map_location=device)
    classifier.load_state_dict(saved_weights)
    classifier.to(device)
    classifier.eval()

    return classifier, processor

In [None]:
def main(data_dir="/kaggle/input/data-audio/preprocessed_dataset", num_epochs=7,
         batch_size=8, learning_rate=1e-4, num_classes=10, seed=42):
    """Run the whole pipeline: split, train, evaluate, plot and save.

    The labelled files are split 80/10/10 (stratified) into train, validation
    and test sets, ``openai/whisper-base`` is wrapped with LoRA adapters and
    topped with ``WhisperClassifier``, the model is optimised with Adam, and
    test metrics, a confusion matrix and accuracy curves are produced before
    the model and processor are saved.

    Args:
        data_dir: Directory scanned by ``get_filepaths_and_labels``.
        num_epochs: Number of training passes.
        batch_size: Batch size of all three data loaders.
        learning_rate: Adam learning rate.
        num_classes: Size of the classifier output; ``load_model`` must be
            called with the same value to reload the saved weights.
        seed: Seed for the torch RNG and for both stratified splits.

    Raises:
        ValueError: If ``data_dir`` contains no labelled audio file.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed torch so head initialisation and batch shuffling are repeatable.
    torch.manual_seed(seed)

    filepaths, labels = get_filepaths_and_labels(data_dir)
    if not filepaths:
        # Fail early with a clear message instead of inside train_test_split.
        raise ValueError(f"No labelled .mp3/.wav files found in {data_dir!r}")

    # 80% train, then the remaining 20% is halved into validation and test.
    train_fp, temp_fp, train_lb, temp_lb = train_test_split(
        filepaths, labels, test_size=0.2, stratify=labels, random_state=seed)
    val_fp, test_fp, val_lb, test_lb = train_test_split(
        temp_fp, temp_lb, test_size=0.5, stratify=temp_lb, random_state=seed)

    model_name = "openai/whisper-base"
    processor = WhisperProcessor.from_pretrained(model_name)
    whisper_model = WhisperModel.from_pretrained(model_name)

    peft_config = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,
        inference_mode=False,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "out_proj"]
    )
    whisper_model = get_peft_model(whisper_model, peft_config)

    model = WhisperClassifier(whisper_model, num_classes=num_classes).to(device)

    train_ds = LIDDataset(train_fp, train_lb, processor)
    val_ds = LIDDataset(val_fp, val_lb, processor)
    test_ds = LIDDataset(test_fp, test_lb, processor)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size)
    test_dl = DataLoader(test_ds, batch_size=batch_size)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_dl, optimizer, criterion, device)
        _, _, val_acc = evaluate(model, val_dl, device)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)
        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Train Acc = {train_acc:.4f}, Val Acc = {val_acc:.4f}")

    y_pred, y_true, _ = evaluate(model, test_dl, device)
    print("Test Accuracy:", accuracy_score(y_true, y_pred))

    # Report every known class, even one absent from the test split.
    labels_list = list(id2label.keys())
    target_names_list = [id2label[i] for i in labels_list]
    print("Classification Report:\n", classification_report(
        y_true, y_pred,
        labels=labels_list,
        target_names=target_names_list,
        zero_division=0
    ))

    plot_confusion_matrix(y_true, y_pred)
    plot_accuracy(train_accuracies, val_accuracies)

    # Save the trained model and processor
    save_model(model, processor)

In [None]:
# Start the full pipeline when this code is executed as the top-level script.
if __name__ == "__main__":
    main()