## Libraries

In [15]:
pip install python-magic -q

In [16]:
import os
import ast
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
import warnings
import magic
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import VotingClassifier

## Loading Data

In [12]:
# Mount Google Drive into the Colab filesystem; the dataset used by the following cells is
# read from a path under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Use the magic library to identify the file type.
# The path carries no file extension, so the format is checked here before choosing a loader.
file_path = "/content/drive/MyDrive/model_input_002"
file_type = magic.from_file(file_path)

# Report the detected type
print(f"File type: {file_type}")

File type: Apache Parquet


In [14]:
# Load the parquet file into the notebook-wide DataFrame `df`
df = pd.read_parquet(file_path, engine='pyarrow')

# Display the first few rows
print(df.head())

# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
df.info()

  track_id dataset                                         audio_path   label  \
0        2     fma  /project_data_source/free_music_archive/fma_sm...  hiphop   
1        5     fma  /project_data_source/free_music_archive/fma_sm...  hiphop   
2       10     fma  /project_data_source/free_music_archive/fma_sm...     pop   
3      140     fma  /project_data_source/free_music_archive/fma_sm...    folk   
4      141     fma  /project_data_source/free_music_archive/fma_sm...    folk   

  fma_genre_top fma_genres fma_genres_all  sampling_rate  \
0       Hip-Hop       [21]           [21]        44100.0   
1       Hip-Hop       [21]           [21]        44100.0   
2           Pop       [10]           [10]        44100.0   
3          Folk       [17]           [17]        44100.0   
4          Folk       [17]           [17]        44100.0   

                                            features  
0  [3683.9976, 1.2325847, -0.45687148, 3495.6704,...  
1  [2928.3987, 0.6809109, 0.022613911, 338

## Models

### Grid Search for the Best Random Forest and XGBoost Hyperparameters


In [None]:
# Hyperparameter search for Random Forest and XGBoost on the genre-classification features.
#
# SMOTE is applied *inside* each cross-validation split (via an imbalanced-learn Pipeline).
# Oversampling once before GridSearchCV would let synthetic points interpolated from
# validation-fold samples end up in the training folds, which inflates the CV scores and can
# bias the choice of hyperparameters.

# Samplers are only supported by imbalanced-learn's own Pipeline. It is aliased because the name
# `Pipeline` is already bound to sklearn.pipeline.Pipeline by the notebook's import cell.
from imblearn.pipeline import Pipeline as ImbPipeline

CLF_STEP = 'clf'  # name of the classifier step inside each search pipeline


def build_balanced_search(classifier, param_grid):
    """Return an unfitted 3-fold accuracy GridSearchCV over a SMOTE -> classifier pipeline.

    Parameters
    ----------
    classifier : estimator
        Unfitted classifier to tune.
    param_grid : dict
        Grid keyed by the classifier's own parameter names; the pipeline step prefix
        is added here so the grids can stay in their plain form.

    Returns
    -------
    GridSearchCV
        Search object whose estimator re-runs SMOTE on the training part of every split.
    """
    pipeline = ImbPipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        (CLF_STEP, classifier),
    ])
    prefixed_grid = {f"{CLF_STEP}__{name}": values for name, values in param_grid.items()}
    return GridSearchCV(
        estimator=pipeline,
        param_grid=prefixed_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )


def classifier_params(search):
    """Return a fitted search's best parameters with the pipeline step prefix removed."""
    prefix = f"{CLF_STEP}__"
    return {name[len(prefix):]: value for name, value in search.best_params_.items()}


# Step 1: Safely evaluate the 'features' column.
# String entries are parsed with ast.literal_eval (literals only); anything else passes through.
df['features'] = df['features'].apply(lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x)

# Step 2: Drop rows with missing features
df = df.dropna(subset=['features'])

# Step 3: Prepare features and labels
X = np.stack(df['features'].values)  # Convert to numpy matrix (one row per sample)
y = df['label'].values               # Extract labels

# Step 4: Encode labels into numerical values
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Convert genres to numeric values

# Check the label encoding
print(f"Encoded Labels: {le.classes_}")

# Step 5: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Step 6: Balanced copy of the full training split.
# Kept so the names stay available to anything that expects them; the grid searches below are
# fit on the raw training split and re-balance inside each CV split instead.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Step 7: Define the parameter grids for Grid Search

# Random Forest parameter grid
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# XGBoost parameter grid
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0]
}

# Step 8: Perform Grid Search on Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_grid_search = build_balanced_search(rf_model, rf_param_grid)
rf_grid_search.fit(X_train, y_train)

# The refit pipeline has already trained the classifier on the SMOTE-balanced training split;
# pull that fitted classifier out so `best_rf` is a plain RandomForestClassifier.
best_rf = rf_grid_search.best_estimator_.named_steps[CLF_STEP]
print("Best Random Forest Parameters:", classifier_params(rf_grid_search))

# Step 9: Perform Grid Search on XGBoost
# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost, the labels are
# already integer-encoded above, and the later cells construct XGBClassifier without it.
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_grid_search = build_balanced_search(xgb_model, xgb_param_grid)
xgb_grid_search.fit(X_train, y_train)

# Get the best (already refit) XGBoost model and parameters
best_xgb = xgb_grid_search.best_estimator_.named_steps[CLF_STEP]
print("Best XGBoost Parameters:", classifier_params(xgb_grid_search))

# Step 10: Evaluate both models on the untouched test set

# Random Forest evaluation
y_pred_rf = best_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("\nBest Random Forest Accuracy:", accuracy_rf)
print("Classification Report for Random Forest:\n", classification_report(y_test, y_pred_rf, target_names=le.classes_))

# XGBoost evaluation
y_pred_xgb = best_xgb.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("\nBest XGBoost Accuracy:", accuracy_xgb)
print("Classification Report for XGBoost:\n", classification_report(y_test, y_pred_xgb, target_names=le.classes_))

Encoded Labels: ['blues' 'classical' 'country' 'electronic' 'experimental' 'folk' 'hiphop'
 'instrumental' 'international' 'jazz' 'pop' 'rock' 'soulrnb']
Training set size: (7196, 15)
Test set size: (1800, 15)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best XGBoost Parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}

Best Random Forest Accuracy: 0.515
Classification Report for Random Forest:
                precision    recall  f1-score   support

        blues       0.60      0.94      0.73        16
    classical       0.73      0.89      0.80        18
      country       0.40      0.70      0.51        23
   electronic       0.50      0.44      0.47       193
 experimental       0.47      0.38      0.42       206
         folk       0.52      0.59      0.55       196
    

### Weighted Soft Voting Ensemble with SMOTE for Improved Classification Performance (Random Forest and XGBoost)

In [None]:
# Weighted soft-voting ensemble (Random Forest + XGBoost) trained on a standardized,
# SMOTE-balanced training split and scored on the held-out test split.


def _as_feature_array(raw):
    """Parse a stringified feature list into an array; return any other value unchanged."""
    if isinstance(raw, str):
        return np.array(ast.literal_eval(raw))
    return raw


# --- Data preparation ---
df['features'] = df['features'].apply(_as_feature_array)
df = df.dropna(subset=['features'])  # rows without features cannot be stacked

X = np.stack(df['features'].values)
y = df['label'].values

le = LabelEncoder()
y_encoded = le.fit_transform(y)  # genre names -> integer ids

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# --- Standardization: statistics come from the training split only ---
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Class balancing: oversample the training split, never the test split ---
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# --- Base learners, using the hyperparameters hard-coded for this cell ---
rf_settings = dict(
    n_estimators=300,
    max_depth=None,  # trees grow without a depth limit
    min_samples_split=2,
    random_state=42,
)
xgb_settings = dict(
    n_estimators=300,
    max_depth=7,
    learning_rate=0.2,
    subsample=0.8,
    eval_metric='mlogloss',
    random_state=42,
)
best_rf = RandomForestClassifier(**rf_settings)
best_xgb = XGBClassifier(**xgb_settings)

# --- Ensemble: (name, model, vote weight); XGBoost gets the larger weight ---
weighted_members = [
    ('rf', best_rf, 2),
    ('xgb', best_xgb, 3),
]
ensemble_model_weighted = VotingClassifier(
    estimators=[(name, model) for name, model, _ in weighted_members],
    voting='soft',
    weights=[weight for _, _, weight in weighted_members],
)
ensemble_model_weighted.fit(X_train_balanced, y_train_balanced)

# --- Evaluation on the scaled test split ---
y_pred_weighted = ensemble_model_weighted.predict(X_test_scaled)
accuracy_weighted = accuracy_score(y_test, y_pred_weighted)
classification_report_weighted = classification_report(
    y_test, y_pred_weighted, target_names=le.classes_
)

print("Accuracy with Optimized Soft Voting Ensemble (RF + XGBoost):", accuracy_weighted)
print("Classification Report:\n", classification_report_weighted)

Accuracy with Optimized Soft Voting Ensemble (RF + XGBoost): 0.5244444444444445
Classification Report:
                precision    recall  f1-score   support

        blues       0.54      0.88      0.67        16
    classical       0.75      0.83      0.79        18
      country       0.48      0.70      0.57        23
   electronic       0.53      0.45      0.49       193
 experimental       0.45      0.39      0.42       206
         folk       0.52      0.58      0.55       196
       hiphop       0.68      0.67      0.68       249
 instrumental       0.55      0.51      0.53       200
international       0.50      0.53      0.52       228
         jazz       0.55      0.75      0.63        16
          pop       0.33      0.34      0.34       198
         rock       0.59      0.56      0.58       242
      soulrnb       0.38      0.67      0.49        15

     accuracy                           0.52      1800
    macro avg       0.53      0.61      0.56      1800
 weighted avg 

### Weighted Soft Voting Ensemble with SMOTE for Improved Classification Performance (Random Forest, XGBoost, KNN, SVM)

In [None]:
# Extended weighted soft-voting ensemble: Random Forest, XGBoost, KNN and an RBF SVM.
# Relies on X_train_scaled, y_train, X_test_scaled, y_test and le produced by the previous cell.

# --- Class balancing on the scaled training split ---
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# --- Base learners ---
best_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
)
best_xgb = XGBClassifier(
    n_estimators=300,
    max_depth=7,
    learning_rate=0.2,
    subsample=0.8,
    eval_metric='mlogloss',
    random_state=42,
)
knn_model = KNeighborsClassifier(n_neighbors=5)
# probability=True is required so the SVM can contribute class probabilities to soft voting
svm_model = SVC(probability=True, kernel='rbf', random_state=42)

# --- Ensemble: (name, model, vote weight) ---
weighted_members = [
    ('rf', best_rf, 2),
    ('xgb', best_xgb, 3),
    ('knn', knn_model, 1),
    ('svm', svm_model, 1),
]
ensemble_model_weighted = VotingClassifier(
    estimators=[(name, model) for name, model, _ in weighted_members],
    voting='soft',
    weights=[weight for _, _, weight in weighted_members],
)
ensemble_model_weighted.fit(X_train_balanced, y_train_balanced)

# --- Evaluation on the scaled test split ---
y_pred_weighted = ensemble_model_weighted.predict(X_test_scaled)
accuracy_weighted = accuracy_score(y_test, y_pred_weighted)
classification_report_weighted = classification_report(
    y_test, y_pred_weighted, target_names=le.classes_
)

print("Accuracy with Extended Soft Voting Ensemble:", accuracy_weighted)
print("Classification Report:\n", classification_report_weighted)


Accuracy with Extended Soft Voting Ensemble: 0.5361111111111111
Classification Report:
                precision    recall  f1-score   support

        blues       0.60      0.94      0.73        16
    classical       0.71      0.83      0.77        18
      country       0.48      0.65      0.56        23
   electronic       0.56      0.47      0.51       193
 experimental       0.47      0.40      0.43       206
         folk       0.53      0.60      0.56       196
       hiphop       0.69      0.69      0.69       249
 instrumental       0.55      0.53      0.54       200
international       0.51      0.55      0.53       228
         jazz       0.57      0.81      0.67        16
          pop       0.34      0.32      0.33       198
         rock       0.58      0.57      0.58       242
      soulrnb       0.44      0.73      0.55        15

     accuracy                           0.54      1800
    macro avg       0.54      0.62      0.57      1800
 weighted avg       0.53      