In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import xgboost as xgb
from xgboost import XGBClassifier, XGBRFClassifier

# Load the dataset
data = pd.read_csv("music_genre.csv")

data

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,58878.0,BEXEY,GO GETTA,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.02799999999999,4-Apr,0.330,Hip-Hop
50001,43557.0,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.04299999999999,4-Apr,0.113,Hip-Hop
50002,39767.0,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,4-Apr,0.395,Hip-Hop
50003,57944.0,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,4-Apr,0.354,Hip-Hop


In [40]:
data = data.dropna(axis=0)
data

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,58878.0,BEXEY,GO GETTA,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.02799999999999,4-Apr,0.330,Hip-Hop
50001,43557.0,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.04299999999999,4-Apr,0.113,Hip-Hop
50002,39767.0,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,4-Apr,0.395,Hip-Hop
50003,57944.0,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,4-Apr,0.354,Hip-Hop


In [41]:
def preprocess_inputs(df):
    df = df.copy()
    
    df = df.drop(['instance_id','artist_name','track_name','obtained_date'],axis=1)

    df['mode'] = df['mode'].replace({'Minor' : 0,
                                     'Major' : 1})

    embarked_dummies = pd.get_dummies(df.key)
    df = pd.concat([df, embarked_dummies], axis=1)
    df = df.drop('key',axis=1)
    
    df['tempo'] = df['tempo'].replace('?',np.nan)
    df["tempo"] = df["tempo"].astype("float")
    df['tempo'] = df['tempo'].fillna(df['tempo'].mean())
     
    df['music_genre'] = df['music_genre'].replace({'Electronic':0, 'Anime':1, 'Jazz':2, 'Alternative':3, 'Country':4, 'Rap':5,
                                                   'Blues':5, 'Rock':6, 'Classical':7, 'Hip-Hop':8})    
    
    y = df['music_genre']
    X = df.drop('music_genre',axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.7, shuffle=True, random_state=42)
    
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [42]:
# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
X_train, X_test, y_train, y_test = preprocess_inputs(data)

X_train

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,...,B,C,C#,D,D#,E,F,F#,G,G#
38099,0.624744,-0.685575,-0.659750,-1.769127,0.034470,-0.559072,2.383526,0.050821,0.748698,-0.569348,...,-0.286386,-0.354632,-0.348115,2.922667,-0.179965,-0.28463,-0.304004,-0.258104,-0.361336,-0.265025
40629,0.044676,1.960270,-0.112836,3.510208,-2.267897,1.739334,-0.916636,-3.854580,0.748698,1.646813,...,-0.286386,-0.354632,-0.348115,-0.342153,-0.179965,-0.28463,-0.304004,3.874400,-0.361336,-0.265025
49430,2.042687,-0.853560,1.047960,-0.137116,0.852430,-0.526194,-0.452215,0.725941,0.748698,0.753200,...,-0.286386,-0.354632,2.872615,-0.342153,-0.179965,-0.28463,-0.304004,-0.258104,-0.361336,-0.265025
35739,0.495840,1.643648,0.037844,0.275269,-0.276052,-0.558752,-0.630602,0.379938,0.748698,-0.530625,...,-0.286386,-0.354632,-0.348115,-0.342153,-0.179965,-0.28463,-0.304004,-0.258104,-0.361336,-0.265025
41713,-0.986555,2.013040,-1.407570,-0.117810,-1.970288,2.267845,0.015283,-2.461638,-1.335651,-0.391618,...,-0.286386,-0.354632,-0.348115,-0.342153,-0.179965,-0.28463,-0.304004,3.874400,-0.361336,-0.265025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11289,-0.084228,-0.684988,-0.252355,0.354643,0.617645,2.264772,-0.187710,0.277972,-1.335651,-0.358852,...,-0.286386,-0.354632,-0.348115,-0.342153,-0.179965,-0.28463,-0.304004,3.874400,-0.361336,-0.265025
44737,-0.148680,-0.824536,-1.574992,1.555802,0.477532,1.954426,-0.626296,0.399260,-1.335651,-0.459136,...,-0.286386,-0.354632,2.872615,-0.342153,-0.179965,-0.28463,-0.304004,-0.258104,-0.361336,-0.265025
38163,1.269263,0.447521,-0.882980,-0.892106,0.825922,-0.559062,-0.384551,-0.341132,0.748698,-0.130485,...,-0.286386,2.819824,-0.348115,-0.342153,-0.179965,-0.28463,-0.304004,-0.258104,-0.361336,-0.265025
860,-1.115459,-0.572705,-0.196547,0.059188,1.151591,0.267494,0.593503,0.990274,-1.335651,5.112047,...,-0.286386,-0.354632,-0.348115,-0.342153,-0.179965,-0.28463,-0.304004,-0.258104,-0.361336,3.773230


In [43]:
# Train a decision tree model
dtc = DecisionTreeClassifier(max_depth=10, random_state=42)
dtc.fit(X_train, y_train)
dtc_y_pred = dtc.predict(X_test)
dtc_accuracy = accuracy_score(y_test, dtc_y_pred)

print("Decision Tree Accuracy:", dtc_accuracy)



Decision Tree Accuracy: 0.5268666666666667


In [44]:
# Train a random forest model
rfc = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42)
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)
rfc_accuracy = accuracy_score(y_test, rfc_y_pred)
print("Random Forest Accuracy:", rfc_accuracy)


Random Forest Accuracy: 0.5629333333333333


In [45]:
# Train a xgboost descision tree model
xgb_model = xgb.XGBClassifier()
xgb_clf = xgb.XGBClassifier(random_state=0)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("XGBoost DT Accuracy:",accuracy)

XGBoost DT Accuracy: 0.5741333333333334


In [46]:
# Train a XGBoost RF
xgb_model = xgb.XGBRFClassifier(n_estimators=500, max_depth=10, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Random Forest Accuracy:" ,accuracy_xgb)

XGBoost Random Forest Accuracy: 0.5702666666666667


In [47]:
# Train a LightGBM decision tree model
lgb_dt = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, random_state=42)
lgb_dt.fit(X_train, y_train)
lgb_dt_y_pred = lgb_dt.predict(X_test)
lgb_dt_accuracy = accuracy_score(y_test, lgb_dt_y_pred)
print("LightGBM Decision Tree Accuracy:", lgb_dt_accuracy)

LightGBM Decision Tree Accuracy: 0.5794


In [48]:
# Train a LightGBM random forest model
lgb_rf = lgb.LGBMClassifier(boosting_type='rf', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, random_state=42, bagging_fraction=0.8, bagging_freq=10)
lgb_rf.fit(X_train, y_train)
lgb_rf_y_pred = lgb_rf.predict(X_test)
lgb_rf_accuracy = accuracy_score(y_test, lgb_rf_y_pred)
print("LightGBM Random Forest Accuracy:", lgb_rf_accuracy)

LightGBM Random Forest Accuracy: 0.5428
