In [2]:
import os

SAVE_MODEL_DIR = "/content/drive/MyDrive/sono_ai"
os.makedirs(SAVE_MODEL_DIR, exist_ok=True)


In [21]:
import os
import librosa
import numpy as np
import tensorflow_hub as hub
import numpy as np
import joblib

DATASET_PATH = "/content/drive/MyDrive/Dataset"
SAMPLE_RATE = 16000
DURATION = 40

vggish = hub.load("https://tfhub.dev/google/vggish/1")

def extract_vggish_embeddings(file_path):
    try:
        y, sr = librosa.load(file_path, sr=SAMPLE_RATE, mono=True, duration=DURATION)
        waveform = y.astype(np.float32)
        embedding = vggish(waveform)
        return np.mean(embedding, axis=0)
    except Exception as e:
        print(f"Error in {file_path}: {e}")
        return None

X, genres, langs = [], [], []

if not os.path.exists(DATASET_PATH):
    print("Dataset path does not exist:", DATASET_PATH)

genre_folders = [g for g in os.listdir(DATASET_PATH) if os.path.isdir(os.path.join(DATASET_PATH, g))]
print("Genres found:", genre_folders)

for genre in genre_folders:
    genre_path = os.path.join(DATASET_PATH, genre)
    subfolders = [os.path.join(genre_path, sf) for sf in os.listdir(genre_path)
                  if os.path.isdir(os.path.join(genre_path, sf))]

    if subfolders:
        print(f"Genre {genre} has language subfolders: {[os.path.basename(sf) for sf in subfolders]}")
        for lang_path in subfolders:
            lang = os.path.basename(lang_path)
            files = [f for f in os.listdir(lang_path) if f.lower().endswith(('.mp3', '.wav'))]
            print(f"Processing {len(files)} files in {genre}/{lang}")
            for file in files:
                fpath = os.path.join(lang_path, file)
                feat = extract_vggish_embeddings(fpath)
                if feat is not None:
                    X.append(feat)
                    genres.append(genre)
                    langs.append(lang)
    else:
        files = [f for f in os.listdir(genre_path) if f.lower().endswith(('.mp3', '.wav'))]
        print(f"Genre {genre} has {len(files)} files directly")
        for file in files:
            fpath = os.path.join(genre_path, file)
            feat = extract_vggish_embeddings(fpath)
            if feat is not None:
                X.append(feat)
                genres.append(genre)
                langs.append("Unknown")

print(f"Extracted features count: {len(X)}")


np.save("features.npy", np.array(X))
np.save("genres.npy", np.array(genres))
np.save("langs.npy", np.array(langs))


Genres found: ['Hip-Hop', 'film', 'Pop', 'metal', 'Folk', 'Devotional', 'EDM', 'Rock', 'Classical']
Genre Hip-Hop has language subfolders: ['Tamil', 'Hindi', 'Malayalam', 'English', 'Spanish']
Processing 54 files in Hip-Hop/Tamil
Processing 53 files in Hip-Hop/Hindi
Processing 48 files in Hip-Hop/Malayalam
Processing 25 files in Hip-Hop/English
Processing 25 files in Hip-Hop/Spanish
Genre film has language subfolders: ['Hindi', 'Tamil', 'English', 'Telugu']
Processing 44 files in film/Hindi
Processing 43 files in film/Tamil
Processing 31 files in film/English
Processing 48 files in film/Telugu
Genre Pop has language subfolders: ['English', 'Korean']
Processing 113 files in Pop/English
Processing 75 files in Pop/Korean
Genre metal has 533 files directly
Genre Folk has language subfolders: ['Telugu', 'English', 'Gujarati', 'Punjabi', 'Rajasthani']
Processing 69 files in Folk/Telugu
Processing 32 files in Folk/English
Processing 43 files in Folk/Gujarati
Processing 33 files in Folk/Punjab

In [29]:
import joblib

model_path = '/content/drive/MyDrive/sono_ai/music_multi_model_smote.pkl'
joblib.dump(clf, model_path)


['/content/drive/MyDrive/sono_ai/music_multi_model_smote.pkl']

In [31]:
import numpy as np

np.save('/content/drive/MyDrive/sono_ai/genre_labels.npy', np.unique(genres))
np.save('/content/drive/MyDrive/sono_ai/lang_labels.npy', np.unique(langs))


In [39]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
import joblib

# --- Load Data ---
X = np.load("features.npy")
genres = np.load("genres.npy")
langs = np.load("langs.npy")

# --- Encode Labels ---
genre_encoder = LabelEncoder()
lang_encoder = LabelEncoder()

y_genre = genre_encoder.fit_transform(genres)
y_lang = lang_encoder.fit_transform(langs)

# --- Split Data (with stratify) ---
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
    X, y_genre, test_size=0.2, random_state=42, stratify=y_genre
)
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X, y_lang, test_size=0.2, random_state=42, stratify=y_lang
)

# --- Train Genre Model ---
print("üéµ Training Genre Model...")
clf_genre = RandomForestClassifier(n_estimators=150, random_state=42)
clf_genre.fit(X_train_g, y_train_g)
y_pred_g = clf_genre.predict(X_test_g)

genre_acc = accuracy_score(y_test_g, y_pred_g)
print(f"‚úÖ Genre Accuracy: {genre_acc:.2f}")
labels_g = unique_labels(y_test_g, y_pred_g)
print(
    classification_report(
        y_test_g,
        y_pred_g,
        labels=labels_g,
        target_names=genre_encoder.inverse_transform(labels_g),
        zero_division=0
    )
)

# --- Train Language Model ---
print("\nüó£Ô∏è Training Language Model...")
clf_lang = RandomForestClassifier(n_estimators=150, random_state=42)
clf_lang.fit(X_train_l, y_train_l)
y_pred_l = clf_lang.predict(X_test_l)

lang_acc = accuracy_score(y_test_l, y_pred_l)
print(f"‚úÖ Language Accuracy: {lang_acc:.2f}")
labels_l = unique_labels(y_test_l, y_pred_l)
print(
    classification_report(
        y_test_l,
        y_pred_l,
        labels=labels_l,
        target_names=lang_encoder.inverse_transform(labels_l),
        zero_division=0
    )
)

# --- Save Models + Encoders ---
joblib.dump(clf_genre, "/content/drive/MyDrive/sono_ai/genre_classifier.pkl")
joblib.dump(clf_lang, "/content/drive/MyDrive/sono_ai/lang_classifier.pkl")
joblib.dump(genre_encoder, "/content/drive/MyDrive/sono_ai/genre_encoder.pkl")
joblib.dump(lang_encoder, "/content/drive/MyDrive/sono_ai/lang_encoder.pkl")

print("\nüéâ Models and encoders saved successfully in Google Drive!")


üéµ Training Genre Model...
‚úÖ Genre Accuracy: 0.71
              precision    recall  f1-score   support

   Classical       0.00      0.00      0.00         6
  Devotional       0.85      0.48      0.61        23
         EDM       0.70      0.70      0.70        56
        Folk       0.52      0.56      0.54        41
     Hip-Hop       0.74      0.83      0.78        41
         Pop       0.81      0.68      0.74        38
        Rock       1.00      0.20      0.33         5
        film       0.35      0.42      0.38        33
       metal       0.86      0.95      0.91       107

    accuracy                           0.71       350
   macro avg       0.65      0.54      0.56       350
weighted avg       0.71      0.71      0.71       350


üó£Ô∏è Training Language Model...
‚úÖ Language Accuracy: 0.70
                           precision    recall  f1-score   support

 Bengali devotional songs       0.00      0.00      0.00         2
Bhojpuri devotional songs       0.00      

In [40]:
import numpy as np
unique, counts = np.unique(langs, return_counts=True)
print(dict(zip(unique, counts)))


{np.str_('Bengali devotional songs'): np.int64(10), np.str_('Bhojpuri devotional songs'): np.int64(5), np.str_('English'): np.int64(441), np.str_('Gujarati'): np.int64(43), np.str_('Gujarati devotional songs'): np.int64(6), np.str_('Hindi'): np.int64(172), np.str_('Hindi devotional songs'): np.int64(18), np.str_('Korean'): np.int64(75), np.str_('Malayalam'): np.int64(48), np.str_('Marathi devotional songs'): np.int64(3), np.str_('Punjabi'): np.int64(33), np.str_('Punjabi devotional songs'): np.int64(4), np.str_('Rajasthani'): np.int64(28), np.str_('Spanish'): np.int64(25), np.str_('Tamil'): np.int64(97), np.str_('Telugu'): np.int64(117), np.str_('Unknown'): np.int64(533), np.str_('kannada'): np.int64(22), np.str_('tamil devotional songs'): np.int64(17), np.str_('telugu devotional songs'): np.int64(50)}


In [41]:
langs_cleaned = []
for l in langs:
    l = l.lower().replace(" devotional songs", "")  # remove devotional suffix
    l = l.strip().capitalize()  # normalize case
    langs_cleaned.append(l)


In [42]:
unique, counts = np.unique(langs_cleaned, return_counts=True)
print(dict(zip(unique, counts)))


{np.str_('Bengali'): np.int64(10), np.str_('Bhojpuri'): np.int64(5), np.str_('English'): np.int64(441), np.str_('Gujarati'): np.int64(49), np.str_('Hindi'): np.int64(190), np.str_('Kannada'): np.int64(22), np.str_('Korean'): np.int64(75), np.str_('Malayalam'): np.int64(48), np.str_('Marathi'): np.int64(3), np.str_('Punjabi'): np.int64(37), np.str_('Rajasthani'): np.int64(28), np.str_('Spanish'): np.int64(25), np.str_('Tamil'): np.int64(114), np.str_('Telugu'): np.int64(167), np.str_('Unknown'): np.int64(533)}


In [43]:
clf_lang = RandomForestClassifier(class_weight='balanced')


In [44]:
lang_encoder.fit(langs_cleaned)
y_lang = lang_encoder.transform(langs_cleaned)


In [46]:
import os

base_path = "/content/drive/MyDrive/sono_ai"
for f in os.listdir(base_path):
    print(f)


genre_labels.npy
music_multi_model_smote.pkl
lang_labels.npy
genre_classifier.pkl
lang_classifier.pkl
genre_encoder.pkl
lang_encoder.pkl


In [59]:
import joblib
import tensorflow_hub as hub
import tensorflow as tf
import librosa
import numpy as np

# === Load trained models & encoders ===
genre_model = joblib.load("/content/drive/MyDrive/sono_ai/genre_classifier.pkl")
lang_model = joblib.load("/content/drive/MyDrive/sono_ai/lang_classifier.pkl")
genre_encoder = joblib.load("/content/drive/MyDrive/sono_ai/genre_encoder.pkl")
lang_encoder = joblib.load("/content/drive/MyDrive/sono_ai/lang_encoder.pkl")

# === Load VGGish feature extractor ===
vggish = hub.load("https://tfhub.dev/google/vggish/1")

SAMPLE_RATE = 16000
DURATION = 40

# === Extract mean VGGish embedding ===
def extract_vggish_features(audio_path):
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, duration=DURATION)
    y = np.pad(y, (0, max(0, SAMPLE_RATE * DURATION - len(y))), mode="constant")
    waveform = tf.convert_to_tensor(y, dtype=tf.float32)
    features = vggish(waveform).numpy()
    return np.mean(features, axis=0).reshape(1, -1)

# === Prediction function ===
def predict_genre_language(audio_path):
    feats = extract_vggish_features(audio_path)

    # Predict genre
    genre_pred = genre_model.predict(feats)
    genre_label = genre_encoder.inverse_transform(genre_pred)[0]

    # Predict language
    lang_pred = lang_model.predict(feats)
    lang_label = lang_encoder.inverse_transform(lang_pred)[0]

    return genre_label, lang_label

# === Test with a sample song ===
test_audio = "/content/drive/MyDrive/Dataset/Classical/kannada/Ashraya_Ne_Nede_Amma-128kbps_01.mp3"  # change as needed
genre, lang = predict_genre_language(test_audio)

print("üéß Predicted Genre:", genre)
print("üó£Ô∏è Predicted Language:", lang)


üéß Predicted Genre: Classical
üó£Ô∏è Predicted Language: kannada
