In [2]:
# Standard library
import os

# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Metrics for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [3]:
# Location of the cleaned dataset. Set the MUSIC_GENRE_DATA environment variable
# to run the notebook on another machine; when it is unset the original
# location is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "MUSIC_GENRE_DATA",
    "/home/asmaa/music-genre-prediction/artifacts/cleaned_dataset.csv",
)
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the loaded dataset.
df.head(5)

Unnamed: 0,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.792,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,Thievery Corporation,The Shining Path,31.0,0.0127,0.622,218293.0,0.89,0.95,D,0.124,-7.043,Minor,0.03,115.002,0.531,Electronic
2,Dillon Francis,Hurricane,28.0,0.00306,0.62,215613.0,0.755,0.0118,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,Dubloadz,Nitro,34.0,0.0254,0.774,166875.0,0.7,0.00253,C#,0.157,-4.498,Major,0.239,128.014,0.27,Electronic
4,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.909,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic


In [5]:
# (rows, columns) of the loaded dataset.
df.shape

(50000, 16)

In [34]:
# Build the feature frame: remove the target and the two columns that are
# deliberately left out of the model.
# 'Unnamed: 0' is the row index that was written into the CSV. It is numeric, so
# the dtype-based feature selection further down would scale it and feed it to
# the models as a feature. It is dropped here; errors='ignore' keeps this cell
# working if the CSV is ever re-exported without that column.
X = (
    df.drop(columns=['music_genre', 'duration_ms', 'popularity'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)
X

Unnamed: 0,artist_name,track_name,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,Röyksopp,Röyksopp's Night Out,0.00468,0.652,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759
1,Thievery Corporation,The Shining Path,0.01270,0.622,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.002,0.531
2,Dillon Francis,Hurricane,0.00306,0.620,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333
3,Dubloadz,Nitro,0.02540,0.774,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270
4,What So Not,Divide & Conquer,0.00465,0.638,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,BEXEY,GO GETTA,0.03340,0.913,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.028,0.330
49996,Roy Woods,Drama (feat. Drake),0.15700,0.709,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.043,0.113
49997,Berner,Lovin' Me (feat. Smiggz),0.00597,0.693,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395
49998,The-Dream,Shawty Is Da Shit,0.08310,0.782,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.886,0.354


In [7]:
# List the distinct values of each categorical column that will be one-hot encoded.
for column_name in ('key', 'mode'):
    print("Categories in '{}' variable:     ".format(column_name), end=" ")
    print(df[column_name].unique())

Categories in 'key' variable:      ['A#' 'D' 'G#' 'C#' 'F#' 'B' 'G' 'F' 'A' 'C' 'E' 'D#']
Categories in 'mode' variable:      ['Minor' 'Major']


In [None]:
# Target vector: the genre label of every track.
y = df.loc[:, 'music_genre']
y

## Column transformation

In [35]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column groups: every non-object column of X is treated as numeric, and the
# two categorical columns are named explicitly.
cat_features = ['key', 'mode']
num_features = X.select_dtypes(exclude='object').columns

# One transformer per column group (two in total).
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Columns of X that belong to neither group are not listed here, and no
# `remainder` is given, so the transformer's default handling applies to them.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [36]:
# Fit the preprocessor on the feature frame and replace X with the transformed matrix.
# NOTE(review): the fit happens on the full dataset, before the train/test split
# further down, so the fitted statistics include the held-out rows. Consider
# splitting first and fitting on the training split only.
# NOTE(review): X is overwritten, so re-running this cell on its own will
# presumably fail (X is no longer the DataFrame with named columns). Re-run the
# cell that builds X first.
X = preprocessor.fit_transform(X)

In [37]:
# (rows, columns) of X after preprocessing.
X.shape

(50000, 23)

## Splitting the Dataset

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((40000, 23), (10000, 23))

## Function for Model Evaluation

In [13]:
def evaluate_model(true_values, predicted_values):
    """Score a set of predictions against the ground-truth labels.

    Parameters
    ----------
    true_values : array-like
        Ground-truth class labels.
    predicted_values : array-like
        Predicted class labels, aligned element-wise with ``true_values``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``. ``accuracy`` is a single score;
        ``precision``, ``recall`` and ``f1`` are computed with ``average=None``
        and therefore hold one value per class rather than a single average.
    """
    # Overall fraction of correctly predicted labels.
    accuracy = accuracy_score(true_values, predicted_values)

    # average=None keeps the per-class breakdown instead of collapsing it
    # into one macro/micro/weighted number.
    precision = precision_score(true_values, predicted_values, average=None)
    recall = recall_score(true_values, predicted_values, average=None)
    f1 = f1_score(true_values, predicted_values, average=None)

    # The confusion matrix and classification report used to be computed here
    # but were never returned, so that work has been removed.
    return accuracy, precision, recall, f1
    

## Select Models

In [14]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Every estimator that accepts a seed gets random_state=42.
models = {
    "Logistic Regression": LogisticRegression(
        C=1.0, max_iter=1000, random_state=42
    ),
    "Naive Bayes": GaussianNB(),
    "K-Neighbors Classifier": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree Classifier": DecisionTreeClassifier(
        max_depth=5, random_state=42
    ),
    "Random Forest Classifier": RandomForestClassifier(
        n_estimators=100, random_state=42
    ),
    # NOTE(review): kernel-based SVC training cost grows quickly with the number
    # of samples; on the 40000-row training split this fit may dominate the
    # notebook's run time. Confirm it is acceptable or consider LinearSVC.
    "Support Vector Classifier": SVC(C=1.0, kernel='linear', random_state=42),
}

# Training and Testing

In [None]:
# Fit every candidate model, then report on both the training and the test split:
#   Accuracy, Precision, Recall, F1
# Precision, recall and F1 are per-class arrays, because evaluate_model computes
# them with average=None.
model_list = []      # model names, in evaluation order
accuracy_list = []   # held-out (test) accuracy of each model, aligned with model_list
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_accuracy, model_train_prec, model_train_recall, model_train_f1Score = evaluate_model(y_train, y_train_pred)

    model_test_accuracy, model_test_prec, model_test_recall, model_test_f1Score = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Accuracy: {}".format(model_train_accuracy))
    print("- Precision: {}".format(model_train_prec))
    print("- Recall: {}".format(model_train_recall))
    print("- f1 Score: {}".format(model_train_f1Score))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(model_test_accuracy))
    print("- Precision: {}".format(model_test_prec))
    print("- Recall: {}".format(model_test_recall))
    print("- f1 Score: {}".format(model_test_f1Score))

    # Record the TEST accuracy. The previous code stored the training accuracy,
    # which rewards models that simply memorise the training data.
    accuracy_list.append(model_test_accuracy)

    print('='*35)
    print('\n')
