In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/audio-clustering1/Dataset/Audio_Features.pdf
/kaggle/input/audio-clustering1/Dataset/train_labels.csv
/kaggle/input/audio-clustering1/Dataset/train_folder/611.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/364.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/367.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/116.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/1490.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/374.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/1359.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/485.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/456.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/626.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/590.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/296.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/1180.wav
/kaggle/input/audio-clustering1/Dataset/train_folder/1312.wav
/kaggle/input/audio-cluste

In [None]:
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from transformers import ASTFeatureExtractor, ASTModel
from huggingface_hub import login

# ==== SETUP ====
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Login to HuggingFace (replace with your own token if needed)
login("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")

# Load AST model and feature extractor
feature_extractor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
ast_model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(device)
ast_model.eval()

# ==== FEATURE EXTRACTION ====
def extract_ast_embedding(file_path, chunk_duration=3.0):
    waveform, sr = torchaudio.load(file_path)

    # === RESAMPLE if not 16kHz ===
    target_sr = 16000
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
        waveform = resampler(waveform)
        sr = target_sr

    chunk_size = int(sr * chunk_duration)
    embeddings = []

    for start in tqdm(range(0, waveform.shape[1], chunk_size), desc=f"Chunks in {os.path.basename(file_path)}", leave=False):
        end = min(start + chunk_size, waveform.shape[1])
        chunk = waveform[:, start:end]

        if chunk.shape[1] < chunk_size:
            pad = torch.zeros((1, chunk_size - chunk.shape[1]))
            chunk = torch.cat([chunk, pad], dim=1)

        inputs = feature_extractor(chunk.numpy(), sampling_rate=sr, return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = ast_model(**{k: v.to(device) for k, v in inputs.items()})
            embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten())

    return np.mean(embeddings, axis=0)


def load_features(audio_dir, label_df=None, save_path=None):
    features_file = f"{save_path}_features.csv" if save_path else None
    labels_file = f"{save_path}_labels.csv" if save_path else None
    filenames_file = f"{save_path}_filenames.csv" if save_path else None
    
    # Check if features already exist
    if save_path and os.path.exists(features_file) and os.path.exists(filenames_file):
        print(f"🔄 Loading precomputed features from {features_file}")
        features = pd.read_csv(features_file).values
        filenames = pd.read_csv(filenames_file)['filename'].values.tolist()
        
        labels = None
        if os.path.exists(labels_file):
            labels = pd.read_csv(labels_file)['label'].values
            
        return features, labels, filenames
        
    features, labels, filenames = [], [], []
    files = os.listdir(audio_dir)

    for file in tqdm(files, desc=f"Processing {audio_dir}"):
        if not file.endswith(".wav"):
            continue
        try:
            emb = extract_ast_embedding(os.path.join(audio_dir, file))
            features.append(emb)
            filenames.append(file)
            if label_df is not None:
                label = label_df[label_df['filename'] == file]['category'].values[0]
                labels.append(label)
        except Exception as e:
            print(f"⚠ Error with {file}: {e}")

    # Save features to CSV if path provided
    if save_path:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        
        # Create DataFrames and save
        features_df = pd.DataFrame(features)
        features_df.to_csv(features_file, index=False)
        
        filenames_df = pd.DataFrame({'filename': filenames})
        filenames_df.to_csv(filenames_file, index=False)
        
        if labels:
            labels_df = pd.DataFrame({'label': labels})
            labels_df.to_csv(labels_file, index=False)
            
        print(f"✅ Saved features to {features_file}")

    return np.array(features), np.array(labels) if labels else None, filenames

# ==== TEST PREDICTION ====
def predict_on_test_set(model, model_name, pca, test_audio_path, label_encoder=None, features_path=None, output_dir="predictions"):
    os.makedirs(output_dir, exist_ok=True)
    
    # Load or extract features
    X_test, _, test_filenames = load_features(
        test_audio_path, 
        save_path=features_path
    )
    
    # Apply PCA transformation
    X_test_pca = pca.transform(X_test)
    
    # Make predictions
    y_pred_encoded = model.predict(X_test_pca)
    
    # Convert numeric predictions back to original labels if encoder provided
    y_pred = label_encoder.inverse_transform(y_pred_encoded) if label_encoder else y_pred_encoded

    # Save predictions
    df = pd.DataFrame({
        "filename": test_filenames,
        "predicted_label": y_pred
    })
    csv_path = os.path.join(output_dir, f"{model_name}_predictions.csv")
    df.to_csv(csv_path, index=False)
    print(f"✅ Saved predictions to {csv_path}")
    
    return y_pred, test_filenames

# ==== MODEL TRAINING ====
def run_models(X_train, y_train, X_val, y_val, pca, test_audio_path, label_encoder=None, features_path=None):
    models = {
        'RandomForest': (RandomForestClassifier(), {
            'n_estimators': [50, 100],
            'max_depth': [10, None]
        }),
        'LogisticRegression': (LogisticRegression(max_iter=1000), {
            'C': [0.1, 1, 10]
        }),
        # 'SVM': (SVC(probability=True), {  # Enable probability for ensemble
        #     'C': [0.1, 1, 10],
        #     'kernel': ['linear', 'rbf']
        # }),
        'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), {
            'n_estimators': [50, 100],
            'max_depth': [3, 6]
        }),
        # 'GradientBoosting': (GradientBoostingClassifier(), {
        #     'n_estimators': [50, 100],
        #     'learning_rate': [0.01, 0.1]
        # }),
    }

    best_models = {}
    for name, (model, params) in models.items():
        print(f"\n🔍 Training {name}...")
        grid = GridSearchCV(model, params, cv=3, scoring='accuracy', n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_models[name] = best_model
        
        y_pred_encoded = best_model.predict(X_val)
        
        # Convert back to original labels for the classification report if encoder provided
        if label_encoder:
            y_val_original = label_encoder.inverse_transform(y_val)
            y_pred_original = label_encoder.inverse_transform(y_pred_encoded)
        else:
            y_val_original = y_val
            y_pred_original = y_pred_encoded

        print(f"✅ {name} Best Params: {grid.best_params_}")
        print(classification_report(y_val_original, y_pred_original))

        # Predict on test and save CSV
        predict_on_test_set(best_model, name, pca, test_audio_path, label_encoder, features_path)
    
    # Create ensemble model
    print("\n🔄 Creating Ensemble Model...")
    estimators = [(name, model) for name, model in best_models.items()]
    ensemble = VotingClassifier(estimators=estimators, voting='soft')
    ensemble.fit(X_train, y_train)
    
    # Evaluate ensemble on validation set
    y_pred_ensemble = ensemble.predict(X_val)
    if label_encoder:
        y_val_original = label_encoder.inverse_transform(y_val)
        y_pred_original = label_encoder.inverse_transform(y_pred_ensemble)
    else:
        y_val_original = y_val
        y_pred_original = y_pred_ensemble
    
    print("\n✨ Ensemble Model Performance:")
    print(classification_report(y_val_original, y_pred_original))
    
    # Predict with ensemble on test set
    predict_on_test_set(ensemble, "Ensemble", pca, test_audio_path, label_encoder, features_path)
    
    # Create weighted ensemble based on validation performance
    print("\n🔄 Creating Weighted Ensemble...")
    
    # Get validation accuracy for each model to use as weights
    weights = {}
    for name, model in best_models.items():
        y_pred = model.predict(X_val)
        accuracy = np.mean(y_pred == y_val)
        weights[name] = accuracy
    
    # Normalize weights
    total = sum(weights.values())
    weights = {k: v/total for k, v in weights.items()}
    
    print("Model weights based on validation performance:")
    for name, weight in weights.items():
        print(f"  - {name}: {weight:.4f}")
    
    weighted_ensemble = VotingClassifier(
        estimators=estimators,
        voting='soft',
        weights=[weights[name] for name, _ in estimators]
    )
    weighted_ensemble.fit(X_train, y_train)
    
    # Evaluate weighted ensemble
    y_pred_w_ensemble = weighted_ensemble.predict(X_val)
    if label_encoder:
        y_pred_w_original = label_encoder.inverse_transform(y_pred_w_ensemble)
    else:
        y_pred_w_original = y_pred_w_ensemble
    
    print("\n✨ Weighted Ensemble Performance:")
    print(classification_report(y_val_original, y_pred_w_original))
    
    # Predict with weighted ensemble on test set
    predict_on_test_set(weighted_ensemble, "WeightedEnsemble", pca, test_audio_path, label_encoder, features_path)

    return best_models, ensemble, weighted_ensemble

# ==== MAIN PIPELINE ====
if __name__ == "__main__":
    # === Define paths ===
    train_audio_path = "/kaggle/input/audio-clustering1/Dataset/train_folder"
    test_audio_path = "/kaggle/input/audio-clustering1/Dataset/test_folder"
    labels_csv_path = "/kaggle/input/audio-clustering1/Dataset/train_labels.csv"
    features_base_path = "/kaggle/working/features"
    
    # Create features directory
    os.makedirs(features_base_path, exist_ok=True)

    # === Load label file ===
    labels_df = pd.read_csv(labels_csv_path)

    # === Extract features from training data ===
    print("🔄 Extracting training features...")
    train_features_path = os.path.join(features_base_path, "train")
    X_train_full, y_train_full, _ = load_features(
        train_audio_path, 
        labels_df, 
        save_path=train_features_path
    )

    # === Extract features from test data ===
    print("🔄 Extracting test features...")
    test_features_path = os.path.join(features_base_path, "test")
    X_test, _, _ = load_features(
        test_audio_path,
        save_path=test_features_path
    )

    # === Encode labels ===
    print("🏷️ Encoding labels...")
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train_full)
    
    # === PCA ===
    print("📉 Applying PCA...")
    pca = PCA(n_components=128)
    X_train_pca = pca.fit_transform(X_train_full)
    
    # Save PCA model and transformed features
    import joblib
    pca_path = os.path.join(features_base_path, "pca_model.pkl")
    joblib.dump(pca, pca_path)
    print(f"✅ Saved PCA model to {pca_path}")
    
    # Save PCA transformed features
    pca_train_df = pd.DataFrame(X_train_pca)
    pca_train_path = os.path.join(features_base_path, "train_pca_features.csv")
    pca_train_df.to_csv(pca_train_path, index=False)
    print(f"✅ Saved PCA-transformed training features to {pca_train_path}")
    
    # === Train/Validation split ===
    print("✂ Splitting train/val...")
    X_train, X_val, y_train, y_val = train_test_split(X_train_pca, y_train_encoded, test_size=0.2, random_state=42)

    # === Train + Predict ===
    best_models, ensemble, weighted_ensemble = run_models(
        X_train, y_train, X_val, y_val, pca, test_audio_path, 
        label_encoder=label_encoder,
        features_path=test_features_path
    )
    
    # Save final models
    print("\n💾 Saving final models...")
    models_dir = os.path.join(features_base_path, "models")
    os.makedirs(models_dir, exist_ok=True)
    
    # Save individual models
    for name, model in best_models.items():
        model_path = os.path.join(models_dir, f"{name}_model.pkl")
        joblib.dump(model, model_path)
        print(f"✅ Saved {name} model to {model_path}")
    
    # Save ensemble models
    ensemble_path = os.path.join(models_dir, "ensemble_model.pkl")
    joblib.dump(ensemble, ensemble_path)
    print(f"✅ Saved Ensemble model to {ensemble_path}")
    
    weighted_ensemble_path = os.path.join(models_dir, "weighted_ensemble_model.pkl")
    joblib.dump(weighted_ensemble, weighted_ensemble_path)
    print(f"✅ Saved Weighted Ensemble model to {weighted_ensemble_path}")
    
    # Save label encoder
    encoder_path = os.path.join(models_dir, "label_encoder.pkl")
    joblib.dump(label_encoder, encoder_path)
    print(f"✅ Saved Label Encoder to {encoder_path}")
    
    print("\n🎉 Processing complete!")

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

🔄 Extracting training features...


Processing /kaggle/input/audio-clustering1/Dataset/train_folder:   0%|          | 0/1500 [00:00<?, ?it/s]
Chunks in 611.wav:   0%|          | 0/2 [00:00<?, ?it/s][A
Chunks in 611.wav:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A
Chunks in 611.wav: 100%|██████████| 2/2 [00:00<00:00,  2.41it/s][A
Processing /kaggle/input/audio-clustering1/Dataset/train_folder:   0%|          | 1/1500 [00:01<30:45,  1.23s/it]
Chunks in 364.wav:   0%|          | 0/2 [00:00<?, ?it/s][A
Chunks in 364.wav: 100%|██████████| 2/2 [00:00<00:00, 13.54it/s][A
Processing /kaggle/input/audio-clustering1/Dataset/train_folder:   0%|          | 2/1500 [00:01<15:31,  1.61it/s]
Chunks in 367.wav:   0%|          | 0/2 [00:00<?, ?it/s][A
Chunks in 367.wav: 100%|██████████| 2/2 [00:00<00:00, 13.70it/s][A
Processing /kaggle/input/audio-clustering1/Dataset/train_folder:   0%|          | 3/1500 [00:01<10:38,  2.35it/s]
Chunks in 116.wav:   0%|          | 0/2 [00:00<?, ?it/s][A
Chunks in 116.wav: 100%|██████████| 2/2 