In [None]:
!pip install datasets librosa torchaudio transformers lightgbm xgboost scikit-learn numpy pandas tqdm

In [None]:
import librosa
import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from datasets import load_dataset
from scipy.special import softmax
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

In [None]:
# Load the challenge dataset and keep handles to its "train" and "test" splits.
dataset = load_dataset("SherryT997/IndicTTS-Deepfake-Challenge-Data")
train_data, test_data = dataset["train"], dataset["test"]

In [None]:
def extract_mel_spectrogram(audio_array, sr, n_mels=128, fmax=8000):
    """Summarise one audio clip as its time-averaged log-mel spectrum.

    Args:
        audio_array: Audio samples; anything ``np.asarray`` accepts.
        sr: Sampling rate of ``audio_array``, passed through to librosa.
        n_mels: Number of mel bands to compute (default 128, the value the
            notebook has always used).
        fmax: Upper frequency bound passed to the mel filter bank
            (default 8000).

    Returns:
        The dB-scaled mel spectrogram averaged over axis 1, i.e. one value per
        mel band. dB values are relative to the clip's own maximum power
        (``ref=np.max``), so absolute loudness is not preserved.
    """
    # asarray avoids copying when the input is already an ndarray.
    signal = np.asarray(audio_array)
    mel_power = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels, fmax=fmax)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return mel_db.mean(axis=1)

In [None]:
# Training split: collect one mel feature vector and one `is_tts` label per clip.
train_rows, train_labels = [], []
for example in tqdm(train_data, desc="Extracting mel features (train)"):
    clip = example['audio']
    train_rows.append(extract_mel_spectrogram(clip['array'], clip['sampling_rate']))
    train_labels.append(example['is_tts'])
X, y = np.array(train_rows), np.array(train_labels)

# Test split: same features, but keep each clip's `id` instead of a label so
# predictions can be matched back to clips later.
test_rows, test_ids = [], []
for example in tqdm(test_data, desc="Extracting mel features (test)"):
    clip = example['audio']
    test_rows.append(extract_mel_spectrogram(clip['array'], clip['sampling_rate']))
    test_ids.append(example['id'])
X_test = np.array(test_rows)

In [None]:
# Base learners on the mel-spectrogram features.
#
# The train-side predictions are later used as inputs to a stacking
# meta-model, so they must be out-of-fold: every training row is scored by a
# model that was not fit on that row. Scoring X with a model fit on X gives
# predictions that largely reproduce the labels, which is target leakage.
base_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# LightGBM model on mel-spectrogram features
lgb_model = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=7)
# cross_val_predict fits clones of the estimator; lgb_model itself stays unfitted here.
lgb_train_preds = cross_val_predict(lgb_model, X, y, cv=base_cv, method="predict_proba")[:, 1]
# Refit on every training row to score the test set.
lgb_model.fit(X, y)
lgb_test_preds = lgb_model.predict_proba(X_test)[:, 1]

# XGBoost model on mel-spectrogram features
xgb_model = xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=7, eval_metric='logloss')
xgb_train_preds = cross_val_predict(xgb_model, X, y, cv=base_cv, method="predict_proba")[:, 1]
xgb_model.fit(X, y)
xgb_test_preds = xgb_model.predict_proba(X_test)[:, 1]

# Pick the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the pretrained Wav2Vec2 preprocessing config and encoder from the same
# checkpoint, then move the encoder onto the chosen device.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec_model = wav2vec_model.to(device)

def extract_wav2vec_features(data, desc="Extracting Wav2Vec2 features"):
    """Extracts one mean-pooled Wav2Vec2 embedding for each sample in the data.

    Uses the module-level ``feature_extractor``, ``wav2vec_model`` and
    ``device``.

    Args:
        data: Iterable of samples; each must provide
            ``sample['audio']['array']`` and
            ``sample['audio']['sampling_rate']``.
        desc: Label shown on the tqdm progress bar.

    Returns:
        ``np.ndarray`` built from the per-sample embedding vectors (the
        encoder's last hidden state averaged over the time axis), in
        iteration order.
    """
    # The feature extractor rejects audio whose rate differs from the one it
    # was configured with, so mismatching clips are resampled first.
    target_sr = feature_extractor.sampling_rate
    features = []
    for sample in tqdm(data, desc=desc):
        audio = sample['audio']
        # Preprocessing runs on CPU/numpy, so the raw waveform never needs to
        # visit the GPU; only the prepared model inputs do.
        waveform = np.asarray(audio['array'], dtype=np.float32)
        if audio['sampling_rate'] != target_sr:
            waveform = librosa.resample(
                waveform, orig_sr=audio['sampling_rate'], target_sr=target_sr
            )

        inputs = feature_extractor(
            waveform,
            return_tensors="pt",
            sampling_rate=target_sr,
        )
        # Move the prepared tensors to the same device as the model.
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        with torch.no_grad():
            hidden_states = wav2vec_model(**inputs).last_hidden_state
        # Average over time (dim=1) and drop only the batch dimension, so the
        # result stays a vector.
        features.append(hidden_states.mean(dim=1).squeeze(0).cpu().numpy())
    return np.array(features)

# Extract Wav2Vec2 embeddings for train and test datasets
wav2vec_features_train = extract_wav2vec_features(train_data, desc="Extracting Wav2Vec2 features (train)")
wav2vec_features_test  = extract_wav2vec_features(test_data,  desc="Extracting Wav2Vec2 features (test)")

# Turn the embeddings into a per-clip probability with a classifier.
# Averaging across the embedding dimensions would collapse every clip to a
# single scalar that is not a prediction and discards almost all of the
# information in the embedding.
# Train-side scores are out-of-fold so they can feed a stacking meta-model
# without leaking the labels.
wav2vec_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
wav2vec_clf = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=7)
wav2vec_train_preds = cross_val_predict(
    wav2vec_clf, wav2vec_features_train, y, cv=wav2vec_cv, method="predict_proba"
)[:, 1]
# cross_val_predict works on clones; fit the classifier itself on all rows for the test set.
wav2vec_clf.fit(wav2vec_features_train, y)
wav2vec_test_preds = wav2vec_clf.predict_proba(wav2vec_features_test)[:, 1]

In [None]:
# Build the meta-features: one column per base-model score, one row per clip.
stacked_train = np.column_stack([lgb_train_preds, xgb_train_preds, wav2vec_train_preds])
stacked_test  = np.column_stack([lgb_test_preds,  xgb_test_preds,  wav2vec_test_preds])

# Scale the stacked features (statistics are fit on the training rows only)
scaler = StandardScaler()
stacked_train_scaled = scaler.fit_transform(stacked_train)
stacked_test_scaled  = scaler.transform(stacked_test)

# Train meta-model using the stacked predictions from the training set
meta_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05)
meta_model.fit(stacked_train_scaled, y)

# Predict on the test set using the meta-model.
# predict_proba already returns normalised class probabilities, so column 1 is
# used directly. Passing them through softmax again would squash every score
# into roughly [0.27, 0.73] and the submitted values would no longer be the
# model's probabilities.
raw_meta_preds = meta_model.predict_proba(stacked_test_scaled)
meta_preds = raw_meta_preds[:, 1]

In [None]:
# Pair each test clip id with its meta-model score and write the table to CSV
# without the DataFrame index.
submission_columns = {"id": test_ids, "is_tts": meta_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("Submission file saved: submission.csv")