In [1]:
!pip install lightgbm xgboost numpy datasets librosa pandas  torchaudio torch scikit-learn torchvision tqdm evaluate
%pip install -U transformers accelerate bitsandbytes peft trl wandb

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import librosa
import torch
import random
import torchaudio
import torch.nn.functional as F
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
from torch.utils.data import DataLoader
from datasets import load_dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments
from scipy.special import softmax

In [3]:
dataset = load_dataset("SherryT997/IndicTTS-Deepfake-Challenge-Data")
train_data, test_data = dataset["train"], dataset["test"]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/35 [00:00<?, ?it/s]

In [4]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Define Mel Spectrogram transformation
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000, n_fft=1024, hop_length=512, n_mels=64
).to(device)

# Define constants
MAX_TIME_STEPS = 90  # For Mel Spectrogram
MAX_AUDIO_DURATION = 6  # Max duration in seconds for Wav2Vec2
TARGET_SAMPLE_RATE = 16000
MAX_LENGTH = MAX_AUDIO_DURATION * TARGET_SAMPLE_RATE

# Load Wav2Vec2 processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Preprocess function
def preprocess_audio(sample):
    audio = torch.tensor(sample["audio"]["array"], dtype=torch.float16).to(device)

    # Compute Mel Spectrogram
    mel_spec = mel_transform(audio.unsqueeze(0)).squeeze(0).to(torch.float16)
    mel_spec = F.pad(mel_spec, (0, MAX_TIME_STEPS - mel_spec.shape[1])) if mel_spec.shape[1] < MAX_TIME_STEPS else mel_spec[:, :MAX_TIME_STEPS]
    
    # Extract Wav2Vec2 Features
    wav2vec_inputs = processor(
        sample["audio"]["array"],
        sampling_rate=sample["audio"]["sampling_rate"],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    
    return {
        "mel_spec": mel_spec.cpu().numpy().astype(np.float16),
        "input_values": wav2vec_inputs["input_values"].squeeze(0).to(torch.float16),
        "labels": torch.tensor(sample["is_tts"], dtype=torch.long)
    }

# Apply preprocessing
preprocessed_train = train_data.map(preprocess_audio, remove_columns=["audio"])
preprocessed_test = test_data.map(preprocess_audio, remove_columns=["audio"])

In [6]:
def batch_extract_features(batch):
    # Here, batch is a dictionary of lists
    return {"X": batch["mel_spec"], "y": batch["labels"]}

# Process in batches:
extracted = preprocessed_train.map(batch_extract_features, batched=True, batch_size=64)
X = np.array(extracted["X"], dtype=np.float16)
y = np.array(extracted["y"], dtype=np.int8)

extracted_test = preprocessed_test.map(batch_extract_features, batched=True, batch_size=64)
X_test = np.array(extracted_test["X"], dtype=np.float16)


In [7]:
def stratified_split_indices_by_percent(dataset, train_percent=0.8, seed=42):
    """
    Compute stratified indices based on 'language' and 'is_tts' columns.
    For each group, allocate the first train_percent fraction for training
    and the rest for validation.
    """
    groups = {}
    # Build dictionary of indices for each group.
    for idx, sample in enumerate(tqdm(dataset, desc="Indexing dataset")):
        group = f"{sample['language']}_{sample['is_tts']}"
        groups.setdefault(group, []).append(idx)
    
    random.seed(seed)
    train_indices = []
    valid_indices = []
    
    # For each group, shuffle and split by percentage.
    for group, indices in groups.items():
        random.shuffle(indices)
        n = len(indices)
        # Determine training count based on percentage.
        train_count = int(n * train_percent)
        # Ensure at least one sample goes to training if possible.
        if n > 0 and train_count == 0:
            train_count = 1
        # Also ensure that if the group is large, at least one sample goes to validation.
        if train_count == n and n > 1:
            train_count = n - 1
        
        train_indices.extend(indices[:train_count])
        valid_indices.extend(indices[train_count:])
    
    random.shuffle(train_indices)
    random.shuffle(valid_indices)
    return train_indices, valid_indices

# Example usage:
# Splitting each group with 80% for training and 20% for validation.
train_indices, valid_indices = stratified_split_indices_by_percent(train_data, train_percent=0.8, seed=42)
filtered_train_data = preprocessed_train.select(train_indices)
filtered_valid_data = preprocessed_train.select(valid_indices)

print(f"Subsampled train dataset size: {len(filtered_train_data)} samples")
print(f"Subsampled valid dataset size: {len(filtered_valid_data)} samples")


Indexing dataset: 100%|██████████| 31102/31102 [04:45<00:00, 108.85it/s]


Subsampled train dataset size: 24871 samples
Subsampled valid dataset size: 6231 samples


In [8]:
import wandb

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

wb_token = user_secrets.get_secret("wandb_api_token")

wandb.login(key=wb_token)
run = wandb.init(
    project='NPPE 2', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m22f1001410[0m ([33m22f1001410-iitm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
wav2vec_model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base-960h", 
    num_labels=2, 
    #torch_dtype=torch.float16, 
    gradient_checkpointing=True  # ✅ Saves memory, speeds up training
).to(device)

# Freeze feature extractor layers (first 7 conv layers)
for param in wav2vec_model.wav2vec2.feature_extractor.parameters():
    param.requires_grad = False

# Optionally freeze lower transformer layers (first N layers)
N = 8  # Adjust based on available memory
for layer in wav2vec_model.wav2vec2.encoder.layers[:N]:
    for param in layer.parameters():
        param.requires_grad = False
        
# Optimized Training Arguments
training_args = TrainingArguments(
    output_dir="./results", 
    eval_strategy="epoch", 
    save_strategy="epoch", 
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  
    weight_decay=0.01, 
    save_total_limit=2, 
    fp16=True,
    logging_dir="./logs", 
    logging_steps=10, 
    load_best_model_at_end=True, 
    metric_for_best_model="roc_auc", 
    save_safetensors=True,
    disable_tqdm=False,
    report_to="none",
    remove_unused_columns=False
)

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    try:
        auc = roc_auc_score(labels, logits[:, 1])
    except ValueError:
        auc = float("nan")
    return {"accuracy": acc, "f1": f1, "roc_auc": auc}

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
print(filtered_train_data.column_names)
# For training dataset
filtered_train_data = filtered_train_data.remove_columns(["text", "id", "language", "is_tts", "mel_spec"])
# For test dataset
filtered_valid_data = filtered_valid_data.remove_columns(["text", "id", "language", "is_tts", "mel_spec"])
print(filtered_train_data.column_names)


['text', 'id', 'language', 'is_tts', 'mel_spec', 'input_values', 'labels']
['input_values', 'labels']


In [11]:
from transformers.trainer_utils import get_last_checkpoint

trainer = Trainer(
    model=wav2vec_model, 
    args=training_args,
    train_dataset=filtered_train_data, 
    eval_dataset=filtered_valid_data,
    processing_class=processor,
    compute_metrics=compute_metrics
)

last_checkpoint = get_last_checkpoint(training_args.output_dir)

if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("No checkpoint found, starting training from scratch.")
    trainer.train()

Resuming training from checkpoint: ./results/checkpoint-1555


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Epoch,Training Loss,Validation Loss


In [12]:
eval_results = trainer.evaluate()
print(f"Evaluation Metrics: {eval_results}")

Evaluation Metrics: {'eval_loss': 0.19093576073646545, 'eval_accuracy': 0.936286310383566, 'eval_f1': 0.9362496120338508, 'eval_roc_auc': 0.9856324319980548, 'eval_runtime': 564.7611, 'eval_samples_per_second': 11.033, 'eval_steps_per_second': 0.691, 'epoch': 1.0}


In [13]:
X_lxg = X.reshape(X.shape[0], -1)
X_test_lxg = X_test.reshape(X_test.shape[0], -1)

# LightGBM model on mel-spectrogram features
lgb_model = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=7)
lgb_model.fit(X_lxg, y)
lgb_train_preds = lgb_model.predict_proba(X_lxg)[:, 1]
lgb_test_preds = lgb_model.predict_proba(X_test_lxg)[:, 1]

# XGBoost model on mel-spectrogram features
xgb_model = xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=7, eval_metric='logloss')
xgb_model.fit(X_lxg, y)
xgb_train_preds = xgb_model.predict_proba(X_lxg)[:, 1]
xgb_test_preds = xgb_model.predict_proba(X_test_lxg)[:, 1]

[LightGBM] [Info] Number of positive: 15544, number of negative: 15558
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.553420 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1468800
[LightGBM] [Info] Number of data points in the train set: 31102, number of used features: 5760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499775 -> initscore=-0.000900
[LightGBM] [Info] Start training from score -0.000900


In [14]:
# For training dataset
preprocessed_train = preprocessed_train.remove_columns(["text", "id", "language", "is_tts", "mel_spec", "labels"])
# For test dataset
preprocessed_test = preprocessed_test.remove_columns(["text", "id", "language", "is_tts", "mel_spec", "labels"])

In [15]:
# Predictions
train_preds_output = trainer.predict(preprocessed_train)
test_preds_output = trainer.predict(preprocessed_test)

wav2vec_train_preds = torch.nn.functional.softmax(torch.tensor(train_preds_output.predictions), dim=1)[:, 1].numpy()
wav2vec_test_preds = torch.nn.functional.softmax(torch.tensor(test_preds_output.predictions), dim=1)[:, 1].numpy()

In [16]:
# Stack predictions
stacked_train = np.column_stack([wav2vec_train_preds, lgb_train_preds, xgb_train_preds])
stacked_test = np.column_stack([wav2vec_test_preds, lgb_test_preds, xgb_test_preds])

# Scale features
scaler = StandardScaler()
stacked_train_scaled = scaler.fit_transform(stacked_train)
stacked_test_scaled = scaler.transform(stacked_test)

# Train meta-model (LightGBM)
lgb_train = lgb.Dataset(stacked_train_scaled, label=y)
meta_model = lgb.train({"objective": "binary", "metric": "auc", "learning_rate": 0.05}, lgb_train, num_boost_round=100)

# Predict final probabilities
meta_test_preds = meta_model.predict(stacked_test_scaled)

# Save submission
submission = pd.DataFrame({"id": [sample["id"] for sample in test_data], "is_tts": meta_test_preds})
submission.to_csv("submission.csv", index=False)
print("Submission file saved: submission.csv")

[LightGBM] [Info] Number of positive: 15544, number of negative: 15558
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 31102, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499775 -> initscore=-0.000900
[LightGBM] [Info] Start training from score -0.000900
Submission file saved: submission.csv
