In [1]:
%%capture
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import librosa
import warnings
import torchaudio.transforms as T

warnings.filterwarnings('ignore')

In [3]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Set random seeds for reproducibility
SEED = 93
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [5]:
# Define parameters
SAMPLE_RATE = 16000
MAX_AUDIO_LENGTH = 8  # in seconds
FEATURE_DIM = 256
BATCH_SIZE = 16
NUM_EPOCHS = 10 #Keep it low for now
LEARNING_RATE = 0.0001

In [6]:
# Load dataset from CSV files
print("Loading dataset...")
train_df = pd.read_csv("/kaggle/input/multilingual-indian-speech-data/metadata/train.csv")
test_df = pd.read_csv("/kaggle/input/multilingual-indian-speech-data/metadata/test.csv")
print(f"Train samples: {len(train_df)}, Test samples: {len(test_df)}")

Loading dataset...
Train samples: 31102, Test samples: 2635


In [7]:
# Define audio paths
AUDIO_DIR = "/kaggle/input/multilingual-indian-speech-data/audio"

# Augmentation Module
class SpectrogramAugmentation(nn.Module):
    def __init__(self, time_mask_param=40, freq_mask_param=20, noise_factor=0.02):
        super(SpectrogramAugmentation, self).__init__()
        self.time_mask = T.TimeMasking(time_mask_param)
        self.freq_mask = T.FrequencyMasking(freq_mask_param)
        self.noise_factor = noise_factor

    def forward(self, spec):
        if self.training:
            #Time Masking
            spec = self.time_mask(spec)

            #Frequency Masking
            spec = self.freq_mask(spec)
            
            #Gaussian Noise
            noise = torch.randn_like(spec) * self.noise_factor
            spec = spec + noise
        return spec

# Function to load audio file
def load_audio(file_id):
    file_path = os.path.join(AUDIO_DIR, f"{file_id}.wav")
    try:
        audio, sr = librosa.load(file_path, sr=None)
        return audio, sr
    except Exception as e:
        print(f"Error loading audio file {file_id}: {e}")
        return None, None

# Function to extract audio features
def extract_features(audio, sr=SAMPLE_RATE, max_length=MAX_AUDIO_LENGTH):
    if audio is None:
        return None
    
    # Resample if needed
    if sr != SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
    
    # Pad or trim to fixed length
    target_length = max_length * SAMPLE_RATE
    if len(audio) > target_length:
        audio = audio[:target_length]
    else:
        audio = np.pad(audio, (0, max(0, target_length - len(audio))), mode='constant')
    
    # Extract Mel spectrogram features
    mel_spec = librosa.feature.melspectrogram(
        y=audio, 
        sr=SAMPLE_RATE,
        n_mels=FEATURE_DIM,
        hop_length=512
    )
    
    # Convert to log scale
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    
    # Normalize
    log_mel_spec = (log_mel_spec - log_mel_spec.mean()) / (log_mel_spec.std() + 1e-8)
    
    return log_mel_spec

# Feature extraction for training data
def extract_and_save_features(df, output_dir, is_train=True):
    os.makedirs(output_dir, exist_ok=True)
    
    features_list = []
    labels_list = []
    ids_list = []
    
    for i, row in enumerate(tqdm(df.itertuples(), total=len(df), desc="Extracting features")):
        file_id = row.id
        audio, sr = load_audio(file_id)
        
        if audio is None:
            continue
        
        try:
            features = extract_features(audio, sr)
            if features is None:
                continue
            
            features_list.append(features)
            
            if is_train:
                labels_list.append(row.is_tts)
            else:
                labels_list.append(-1)  # For test set
            
            ids_list.append(file_id)
            
            # Save in batches to avoid memory issues
            if (i + 1) % 1000 == 0 or i == len(df) - 1:
                batch_idx = i // 1000
                np.save(f"{output_dir}/features_batch_{batch_idx}.npy", np.array(features_list))
                np.save(f"{output_dir}/labels_batch_{batch_idx}.npy", np.array(labels_list))
                np.save(f"{output_dir}/ids_batch_{batch_idx}.npy", np.array(ids_list))
                
                features_list = []
                labels_list = []
                ids_list = []
        
        except Exception as e:
            print(f"Error processing audio {file_id}: {e}")
            continue


In [8]:
# Create directories for features
os.makedirs("features", exist_ok=True)
train_features_dir = "features/train"
test_features_dir = "features/test"

# Extract and save features
print("Extracting features for training data...")
extract_and_save_features(train_df, train_features_dir, is_train=True)
print("Extracting features for test data...")
extract_and_save_features(test_df, test_features_dir, is_train=False)

# Function to load saved features
def load_features(features_dir):
    all_features = []
    all_labels = []
    all_ids = []
    
    feature_files = [f for f in os.listdir(features_dir) if f.startswith('features_batch_')]
    feature_files.sort()
    
    for batch_idx in range(len(feature_files)):
        features = np.load(f"{features_dir}/features_batch_{batch_idx}.npy")
        labels = np.load(f"{features_dir}/labels_batch_{batch_idx}.npy")
        ids = np.load(f"{features_dir}/ids_batch_{batch_idx}.npy")
        
        all_features.append(features)
        all_labels.append(labels)
        all_ids.append(ids)
    
    return np.vstack(all_features), np.concatenate(all_labels), np.concatenate(all_ids)

# Load features
print("Loading saved features...")
train_features, train_labels, train_ids = load_features(train_features_dir)
test_features, _, test_ids = load_features(test_features_dir)

# Split training data for validation
train_features, val_features, train_labels, val_labels = train_test_split(
    train_features, train_labels, test_size=0.1, random_state=SEED, stratify=train_labels
)

print(f"Training features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")
print(f"Test features shape: {test_features.shape}")

Extracting features for training data...


Extracting features:   0%|          | 0/31102 [00:00<?, ?it/s]

Extracting features for test data...


Extracting features:   0%|          | 0/2635 [00:00<?, ?it/s]

Loading saved features...
Training features shape: (27991, 256, 251)
Validation features shape: (3111, 256, 251)
Test features shape: (2635, 256, 251)


In [9]:
# Custom Dataset
class AudioFeatureDataset(Dataset):
    def __init__(self, features, labels=None, augment=False):
        self.features = features
        self.labels = labels
        self.augment = augment
        self.augmenter = SpectrogramAugmentation()

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = torch.FloatTensor(self.features[idx])
        if self.augment:
            feature = self.augmenter(feature)

        if self.labels is not None:
            label = torch.FloatTensor([self.labels[idx]])
            return feature, label
        return feature

# Create data loaders
train_dataset = AudioFeatureDataset(train_features, train_labels, augment=True) #Augment training data
val_dataset = AudioFeatureDataset(val_features, val_labels)
test_dataset = AudioFeatureDataset(test_features)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [10]:
# Model Definition (using a pretrained ResNet18)
class DeepfakeDetector(nn.Module):
    def __init__(self, pretrained=True):
        super(DeepfakeDetector, self).__init__()
        self.resnet = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=pretrained)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        num_features = self.resnet.fc.in_features
        self.resnet.fc = nn.Sequential(
            nn.Linear(num_features, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        # Add channel dimension
        if x.ndim == 3:
            x = x.unsqueeze(1)
        return self.resnet(x)

# Initialize model, loss, and optimizer
model = DeepfakeDetector().to(device)
criterion = nn.BCELoss()

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

# Optional: Create a learning rate scheduler to improve training
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.005,
    steps_per_epoch=len(train_loader),
    epochs=NUM_EPOCHS,
    pct_start=0.2,
    anneal_strategy='cos',
    div_factor=10.0,
    final_div_factor=1000.0
)

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 343MB/s]


In [11]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
import numpy as np

# Training function
def train_epoch(model, dataloader, criterion, optimizer, device, scheduler=None):
    model.train()
    running_loss = 0.0
    all_predictions = []
    all_targets = []
    all_pred_classes = []
    
    for features, labels in tqdm(dataloader, desc="Training"):
        features, labels = features.to(device), labels.to(device)
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()
        
        running_loss += loss.item() * features.size(0)
        all_predictions.extend(outputs.detach().cpu().numpy())
        all_targets.extend(labels.cpu().numpy())
        
        # For accuracy, precision, and recall, we need class predictions (0 or 1)
        pred_classes = (outputs > 0.5).float().cpu().numpy()
        all_pred_classes.extend(pred_classes)
    
    epoch_loss = running_loss / len(dataloader.dataset)
    
    # Convert lists to numpy arrays for metric calculation
    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)
    all_pred_classes = np.array(all_pred_classes)
    
    # Calculate metrics
    auc_score = roc_auc_score(all_targets, all_predictions)
    accuracy = accuracy_score(all_targets, all_pred_classes)
    precision = precision_score(all_targets, all_pred_classes, average='weighted', zero_division=0)
    recall = recall_score(all_targets, all_pred_classes, average='weighted', zero_division=0)
    
    return epoch_loss, auc_score, accuracy, precision, recall

In [12]:
# Validation function
def validate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_predictions = []
    all_targets = []
    all_pred_classes = []
    
    with torch.no_grad():
        for features, labels in tqdm(dataloader, desc="Validating"):
            features, labels = features.to(device), labels.to(device)
            
            outputs = model(features)
            loss = criterion(outputs, labels)
            
            running_loss += loss.item() * features.size(0)
            all_predictions.extend(outputs.cpu().numpy())
            all_targets.extend(labels.cpu().numpy())
            
            # For accuracy, precision, and recall
            pred_classes = (outputs > 0.5).float().cpu().numpy()
            all_pred_classes.extend(pred_classes)
    
    val_loss = running_loss / len(dataloader.dataset)
    
    # Convert lists to numpy arrays for metric calculation
    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)
    all_pred_classes = np.array(all_pred_classes)
    
    # Calculate metrics
    val_auc = roc_auc_score(all_targets, all_predictions)
    val_accuracy = accuracy_score(all_targets, all_pred_classes)
    val_precision = precision_score(all_targets, all_pred_classes, average='weighted', zero_division=0)
    val_recall = recall_score(all_targets, all_pred_classes, average='weighted', zero_division=0)
    
    return val_loss, val_auc, val_accuracy, val_precision, val_recall

# Training loop
print("Starting training...")
best_val_auc = 0.0
best_val_loss= 1.0

for epoch in range(NUM_EPOCHS):
    # Train one epoch
    train_loss, train_auc, train_acc, train_precision, train_recall = train_epoch(
        model, train_loader, criterion, optimizer, device, scheduler
    )
    
    # Validate
    val_loss, val_auc, val_acc, val_precision, val_recall = validate(
        model, val_loader, criterion, device
    )
    
    # Print metrics
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}")
    print(f"Train - Loss: {train_loss:.4f}, AUC: {train_auc:.4f}, Acc: {train_acc:.4f}, Prec: {train_precision:.4f}, Recall: {train_recall:.4f}")
    print(f"Val   - Loss: {val_loss:.4f}, AUC: {val_auc:.4f}, Acc: {val_acc:.4f}, Prec: {val_precision:.4f}, Recall: {val_recall:.4f}")
    
    # Save the best model
    # if val_auc > best_val_auc:
    #     best_val_auc = val_auc
    #     torch.save(model.state_dict(), "best_model.pth")
    #     print(f"Model saved with validation AUC: {val_auc:.4f}")
    #temp by loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")
        print(f"Model saved with validation loss: {val_loss:.4f}")

# Load the best model
model.load_state_dict(torch.load("best_model.pth"))

Starting training...


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/10
Train - Loss: 0.1978, AUC: 0.9766, Acc: 0.9176, Prec: 0.9176, Recall: 0.9176
Val   - Loss: 0.0619, AUC: 0.9981, Acc: 0.9814, Prec: 0.9816, Recall: 0.9814
Model saved with validation loss: 0.0619


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/10
Train - Loss: 0.1215, AUC: 0.9914, Acc: 0.9601, Prec: 0.9601, Recall: 0.9601
Val   - Loss: 0.0274, AUC: 0.9998, Acc: 0.9920, Prec: 0.9921, Recall: 0.9920
Model saved with validation loss: 0.0274


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/10
Train - Loss: 0.0893, AUC: 0.9952, Acc: 0.9717, Prec: 0.9717, Recall: 0.9717
Val   - Loss: 0.0122, AUC: 0.9999, Acc: 0.9965, Prec: 0.9965, Recall: 0.9965
Model saved with validation loss: 0.0122


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 4/10
Train - Loss: 0.0483, AUC: 0.9985, Acc: 0.9831, Prec: 0.9831, Recall: 0.9831
Val   - Loss: 0.0099, AUC: 1.0000, Acc: 0.9977, Prec: 0.9978, Recall: 0.9977
Model saved with validation loss: 0.0099


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 5/10
Train - Loss: 0.0350, AUC: 0.9992, Acc: 0.9889, Prec: 0.9889, Recall: 0.9889
Val   - Loss: 0.0313, AUC: 1.0000, Acc: 0.9923, Prec: 0.9924, Recall: 0.9923


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 6/10
Train - Loss: 0.0250, AUC: 0.9996, Acc: 0.9927, Prec: 0.9927, Recall: 0.9927
Val   - Loss: 0.0115, AUC: 1.0000, Acc: 0.9952, Prec: 0.9952, Recall: 0.9952


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 7/10
Train - Loss: 0.0136, AUC: 0.9999, Acc: 0.9954, Prec: 0.9954, Recall: 0.9954
Val   - Loss: 0.0025, AUC: 1.0000, Acc: 0.9994, Prec: 0.9994, Recall: 0.9994
Model saved with validation loss: 0.0025


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 8/10
Train - Loss: 0.0108, AUC: 0.9999, Acc: 0.9970, Prec: 0.9970, Recall: 0.9970
Val   - Loss: 0.0017, AUC: 1.0000, Acc: 0.9997, Prec: 0.9997, Recall: 0.9997
Model saved with validation loss: 0.0017


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 9/10
Train - Loss: 0.0037, AUC: 1.0000, Acc: 0.9988, Prec: 0.9988, Recall: 0.9988
Val   - Loss: 0.0013, AUC: 1.0000, Acc: 0.9997, Prec: 0.9997, Recall: 0.9997
Model saved with validation loss: 0.0013


Training:   0%|          | 0/1750 [00:00<?, ?it/s]

Validating:   0%|          | 0/195 [00:00<?, ?it/s]

Epoch 10/10
Train - Loss: 0.0022, AUC: 1.0000, Acc: 0.9992, Prec: 0.9992, Recall: 0.9992
Val   - Loss: 0.0013, AUC: 1.0000, Acc: 0.9997, Prec: 0.9997, Recall: 0.9997


<All keys matched successfully>

In [13]:
# Predict on test data
model.eval()
test_predictions = []

with torch.no_grad():
    for features in tqdm(test_loader, desc="Predicting on test data"):
        features = features.to(device)
        outputs = model(features)
        test_predictions.extend(outputs.cpu().numpy())

# Create submission file
submission = pd.DataFrame({
    'id': test_ids,
    'is_tts': np.array(test_predictions).flatten()
})

# Ensure predictions are probabilities between 0 and 1
submission['is_tts'] = submission['is_tts'].clip(0, 1)

# Save submission file
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Predicting on test data:   0%|          | 0/165 [00:00<?, ?it/s]

Submission file created: submission.csv
