# Assignment

## Imports

In [None]:
# used in the previous experiments
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

# for training
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer

# for experiment tracking
import wandb


# common packages
import pandas as pd
from pprint import pprint
import json
import numpy as np
from matplotlib import pyplot as plt

In [None]:
from dotenv import load_dotenv
from huggingface_hub import login
import os

load_dotenv() 
login(token=os.getenv("HUGGINGFACE_HUB_TOKEN"))

## Dataset and Setup Information

In [None]:
from datasets import load_dataset, Dataset
import random
import itertools

# Set random seed for reproducibility
random.seed(42)

# Helper function to fetch first 500 and add language ID
def get_subset(dataset_name, lang_id, count=500):
    ds = load_dataset(dataset_name, split="train", streaming=True)
    # Use itertools.islice to take the first `count` examples
    subset = list(itertools.islice(ds, count))
    for example in subset:
        example["lang"] = lang_id
    return subset

# Load subsets with streaming
tamil_subset = get_subset("SPRINGLab/IndicVoices-R_Tamil", lang_id=0)
hindi_subset = get_subset("SPRINGLab/IndicVoices-R_Hindi", lang_id=1)
bengali_subset = get_subset("SPRINGLab/IndicVoices-R_Bengali", lang_id=2)

# Combine all subsets
combined = tamil_subset + hindi_subset + bengali_subset

# Shuffle combined list
random.shuffle(combined)

# Convert to HuggingFace Dataset
final_dataset = Dataset.from_list(combined)

# Optional: Print an example
print(final_dataset[0])


## 1.

In [None]:
ran_index = random.randint(0, 1499)
final_dataset[ran_index]['audio']['sampling_rate']

## 2.

In [None]:
hindi_subset[14]['speaker_id']

## 3.

In [None]:
#loading wav2vec model to extract features from audio files.
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from tqdm import tqdm
# Load the Wav2Vec2 processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
call_count=0
# def extract_wav2vec_features(batch):
#     global call_count
#     call_count += 1
#     print(f"Processing batch {call_count}")

#     # Get all audio arrays in the batch
#     audio_arrays = [sample["array"] for sample in batch["audio"]]
#     sampling_rate = 16000  # adjust if your audio isn't actually at 48kHz

#     # Tokenize with padding
#     inputs = processor(audio_arrays, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

#     with torch.no_grad():
#         outputs = model(**inputs, output_hidden_states=True)

#     # Extract 9th and 11th layer hidden states and average
#     hs_9 = outputs.hidden_states[9]
#     hs_11 = outputs.hidden_states[11]
#     averaged_features = ((hs_9 + hs_11) / 2).mean(dim=1)  # mean over time (seq length)

#     return {
#         "features": [feat.numpy() for feat in averaged_features],  # one per sample
#         "label": batch["lang"]
#     }

In [None]:
# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)  # move model to GPU

def extract_wav2vec_features(batch):
    global call_count
    call_count += 1
    print(f"Processing batch {call_count}")

    # Get audio arrays
    audio_arrays = [sample["array"] for sample in batch["audio"]]
    sampling_rate = 16000

    # Tokenize
    inputs = processor(audio_arrays, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # Move input tensors to GPU
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Get 9th and 11th layer hidden states
    hs_9 = outputs.hidden_states[9]
    hs_11 = outputs.hidden_states[11]

    # Average over time (dimension 1)
    averaged_features = ((hs_9 + hs_11) / 2).mean(dim=1)

    return {
        "features": [feat.cpu().numpy() for feat in averaged_features],  # move back to CPU before converting to NumPy
        "label": batch["lang"]
    }



In [None]:
# Apply feature extraction
dataset = final_dataset.map(extract_wav2vec_features, batched=True, batch_size=4)

## 3.

In [None]:
ran_index = random.randint(0, 1499)
len(dataset[ran_index]['features'])

## 4.

In [None]:
dataset['features']
feature_matrix = np.vstack(dataset['features'])
feature_matrix.shape

## 5.

In [None]:
total_params = sum(p.numel() for p in model.parameters())
print(f'Total parameters: {total_params/1e6:0.2f} million')

## 6.

In [None]:
model.encoder

In [None]:
def extract_wav2vec_featuresM(batch):
    global call_count
    call_count += 1
    print(f"Processing batch {call_count}")

    # Get audio arrays
    audio_arrays = [sample["array"] for sample in batch["audio"]]
    sampling_rate = 16000

    # Tokenize
    inputs = processor(audio_arrays, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # Move input tensors to GPU
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Get 9th and 11th layer hidden states
    hs_7 = outputs.hidden_states[7]
    hs_12 = outputs.hidden_states[12]

    # Average over time (dimension 1)
    averaged_features = ((hs_7 + hs_12) / 2).mean(dim=1)

    return {
        "features": [feat.cpu().numpy() for feat in averaged_features],  # move back to CPU before converting to NumPy
        "label": batch["lang"]
    }

## 7.

In [None]:
datasetM = final_dataset.map(extract_wav2vec_featuresM, batched=True, batch_size=4)

In [None]:
print(f"The answer is {datasetM[0]['features'][0]:0.4f}")

## 8.

In [None]:
tamil_subset[24]['gender']

## 9.

In [28]:
datasetM

Dataset({
    features: ['text', 'lang', 'samples', 'verbatim', 'normalized', 'speaker_id', 'scenario', 'task_name', 'gender', 'age_group', 'job_type', 'qualification', 'area', 'district', 'state', 'occupation', 'utterance_pitch_mean', 'utterance_pitch_std', 'snr', 'c50', 'speaking_rate', 'cer', 'duration', 'audio', 'features', 'label'],
    num_rows: 1500
})

In [29]:
datasetMF = dataset.select_columns(['features', 'label'])

In [30]:
datasetMF

Dataset({
    features: ['features', 'label'],
    num_rows: 1500
})

In [31]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Extract features and labels from the list of dictionaries
X = np.array([example["features"] for example in datasetMF])
y = np.array([example["label"] for example in datasetMF])

# Step 1: Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: Further split training set into train/validation (80/20 split)
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Step 3: Train logistic regression model
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train_final, y_train_final)

# Step 4: Evaluate on validation set
y_pred_val = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val) * 100  # convert to percentage

print(f"Validation Accuracy: {val_accuracy:.2f}%")


Validation Accuracy: 68.75%


## 10.

In [32]:
y_test_pred = clf.predict(X_test)

In [34]:
test_accuracy = accuracy_score(y_test, y_test_pred) * 100  # convert to percentage

print(f"Test Accuracy: {test_accuracy:.2f}%")

Test Accuracy: 64.00%


# Tutorial