In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Metrics for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv("/home/asmaa/music-genre-prediction/artifacts/cleaned_dataset.csv")

In [3]:
df.head(5)

Unnamed: 0,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.792,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,Thievery Corporation,The Shining Path,31.0,0.0127,0.622,218293.0,0.89,0.95,D,0.124,-7.043,Minor,0.03,115.002,0.531,Electronic
2,Dillon Francis,Hurricane,28.0,0.00306,0.62,215613.0,0.755,0.0118,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,Dubloadz,Nitro,34.0,0.0254,0.774,166875.0,0.7,0.00253,C#,0.157,-4.498,Major,0.239,128.014,0.27,Electronic
4,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.909,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic


In [4]:
df.shape

(50000, 16)

In [5]:
X = df.drop(columns=['music_genre','duration_ms'],axis=1)
X

Unnamed: 0,artist_name,track_name,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759
1,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.002,0.531
2,Dillon Francis,Hurricane,28.0,0.00306,0.620,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333
3,Dubloadz,Nitro,34.0,0.02540,0.774,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270
4,What So Not,Divide & Conquer,32.0,0.00465,0.638,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,BEXEY,GO GETTA,59.0,0.03340,0.913,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.028,0.330
49996,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.043,0.113
49997,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395
49998,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.886,0.354


In [6]:
print("Categories in 'key' variable:     ",end=" " )
print(df['key'].unique())


print("Categories in 'mode' variable:     ",end=" " )
print(df['mode'].unique())

Categories in 'key' variable:      ['A#' 'D' 'G#' 'C#' 'F#' 'B' 'G' 'F' 'A' 'C' 'E' 'D#']
Categories in 'mode' variable:      ['Minor' 'Major']


In [7]:
y = df['music_genre']
y

0        Electronic
1        Electronic
2        Electronic
3        Electronic
4        Electronic
            ...    
49995       Hip-Hop
49996       Hip-Hop
49997       Hip-Hop
49998       Hip-Hop
49999       Hip-Hop
Name: music_genre, Length: 50000, dtype: object

## Columns transformation

In [8]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude='object').columns
cat_features = ['key','mode']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [9]:
X = preprocessor.fit_transform(X)

In [10]:
X.shape

(50000, 24)

## Splitting the Dataset

In [11]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((40000, 24), (10000, 24))

## Function for Model Evaluation

In [12]:
def evaluate_model(true_values, predicted_values):
    # Calculate accuracy
    accuracy = accuracy_score(true_values, predicted_values)

    # Calculate precision, recall, and F1-score for each class (in multiclass cases)
    precision = precision_score(true_values, predicted_values, average=None)
    recall = recall_score(true_values, predicted_values, average=None)
    f1 = f1_score(true_values, predicted_values, average=None)

    # Calculate the confusion matrix
    confusion = confusion_matrix(true_values, predicted_values)

    # Generate a classification report with precision, recall, F1-score, and support for each class
    class_report = classification_report(true_values, predicted_values)

    return accuracy, precision, recall, f1, 
    

## Instantiate RFC

In [18]:
model = RandomForestClassifier()

# Hyperparameter Tuning for Random Forest Classifier

In [14]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [19]:
# Hyperparameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None],  # None means using all features
}


# Perform Grid Search or Randomized Search with cross-validation
grid_search_rf = GridSearchCV(model, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Best hyperparameters and model
best_params_rf = grid_search_rf.best_params_
best_model_rf = grid_search_rf.best_estimator_





TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

# Trainig and Testing

In [None]:
'''
Accuracy
Precision
Recall
F1
Confusion Matrix
Classification Report

'''
model.fit(X_train, y_train) # Train model

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluate Train and Test dataset
accuracy, prec, recall, f1Score = evaluate_model(y_train, y_train_pred)

model_test_accuracy, model_test_prec , model_test_recall, model_test_f1Score = evaluate_model(y_test, y_test_pred)




print('Model performance for Training set')
print("- Accuracy: {}".format(accuracy))
print("- Precision: {}".format(prec))
print("- Recall: {}".format(recall))
print("- f1 Score: {}".format(f1Score))

print('----------------------------------')

print('Model performance for Test set')
print("- Accuracy: {:.4f}".format(model_test_accuracy))
print("- Precision: {}".format(model_test_prec))
print("- Recall: {}".format(model_test_recall))
print("- f1 Score: {}".format(model_test_f1Score))


