In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as prp
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [44]:
import warnings
# Silence every warning for the rest of the session: with no category or
# message given, the "ignore" action applies to all of them.
warnings.filterwarnings("ignore")

In [45]:
# Results table written to by model_assess(), which adds one
# (name, "Accuracy") column per evaluated model.
target = pd.DataFrame()

In [46]:
# Load the feature table from a CSV located in the working directory.
df = pd.read_csv("features_3_sec.csv")

In [47]:
# (rows, columns) of the loaded table.
df.shape

(9990, 60)

In [48]:
# Column labels of the loaded table.
df.columns

Index(['filename', 'length', 'chroma_stft_mean', 'chroma_stft_var', 'rms_mean',
       'rms_var', 'spectral_centroid_mean', 'spectral_centroid_var',
       'spectral_bandwidth_mean', 'spectral_bandwidth_var', 'rolloff_mean',
       'rolloff_var', 'zero_crossing_rate_mean', 'zero_crossing_rate_var',
       'harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var', 'tempo',
       'mfcc1_mean', 'mfcc1_var', 'mfcc2_mean', 'mfcc2_var', 'mfcc3_mean',
       'mfcc3_var', 'mfcc4_mean', 'mfcc4_var', 'mfcc5_mean', 'mfcc5_var',
       'mfcc6_mean', 'mfcc6_var', 'mfcc7_mean', 'mfcc7_var', 'mfcc8_mean',
       'mfcc8_var', 'mfcc9_mean', 'mfcc9_var', 'mfcc10_mean', 'mfcc10_var',
       'mfcc11_mean', 'mfcc11_var', 'mfcc12_mean', 'mfcc12_var', 'mfcc13_mean',
       'mfcc13_var', 'mfcc14_mean', 'mfcc14_var', 'mfcc15_mean', 'mfcc15_var',
       'mfcc16_mean', 'mfcc16_var', 'mfcc17_mean', 'mfcc17_var', 'mfcc18_mean',
       'mfcc18_var', 'mfcc19_mean', 'mfcc19_var', 'mfcc20_mean', 'mfcc20_var',
  

In [49]:
# Number of distinct file names; dropna=False counts a missing value as its
# own entry, matching what len(unique()) would report.
df["filename"].nunique(dropna=False)

9990

In [66]:
# Preview the first five rows.
# NOTE(review): this cell carries execution count 66 and its output has no
# `filename` column, so it was run after the cell that drops that column.
# Re-run the notebook top to bottom to get a consistent view.
df.head()

Unnamed: 0,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,3714.560359,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,3869.682242,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,3997.63916,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,3568.300218,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,3469.992864,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [51]:
# Remove the per-row file name so it is not treated as a feature.
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns="filename", errors="ignore")

In [52]:
# Feature matrix: every column except the target.
X = df.drop(columns="label")

# Target vector: the string labels (e.g. "blues") mapped to integer class ids.
encoder = prp.LabelEncoder()
y = encoder.fit_transform(df["label"])

In [53]:
# One instance of each scaling strategy that is compared further down.
standard_scaler, max_abs_scaler, min_max_scaler = (
    prp.StandardScaler(),
    prp.MaxAbsScaler(),
    prp.MinMaxScaler(),
)

In [54]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [55]:
# Fit the scaler on the training split only, then apply the fitted scaler
# to the test split (evaluated left to right, so the fit happens first).
X_train_std_scaled, X_test_std_scaled = (
    standard_scaler.fit_transform(X_train),
    standard_scaler.transform(X_test),
)

In [56]:
# Fit the scaler on the training split only, then apply the fitted scaler
# to the test split (evaluated left to right, so the fit happens first).
X_train_max_abs_scaled, X_test_max_abs_scaled = (
    max_abs_scaler.fit_transform(X_train),
    max_abs_scaler.transform(X_test),
)

In [57]:
# Fit the scaler on the training split only, then apply the fitted scaler
# to the test split (evaluated left to right, so the fit happens first).
X_train_min_max_scaled, X_test_min_max_scaled = (
    min_max_scaler.fit_transform(X_train),
    min_max_scaler.transform(X_test),
)

In [58]:
def model_assess(name, model, XTrain, XTest):
    """Fit ``model`` on a training matrix, score it on a test matrix, record the result.

    Reads the notebook globals ``y_train`` and ``y_test`` for the labels and
    writes the score into the global results table ``target``.

    Parameters
    ----------
    name : str
        Label for this run; it is printed and used in the
        ``(name, "Accuracy")`` column key of ``target``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted here.
    XTrain, XTest
        Feature matrices paired with ``y_train`` and ``y_test`` respectively.

    Returns
    -------
    None
    """
    model.fit(XTrain, y_train)
    preds = model.predict(XTest)

    # Score once and reuse the value for both the printout and the table.
    accuracy = round(accuracy_score(y_test, preds), 5)
    print(f'{name} Accuracy:', accuracy, '\n')

    # Assigning a bare scalar to a DataFrame that has no rows only creates an
    # empty column, so the score would be silently lost. While ``target`` is
    # still row-less, wrap the score in a list so the assignment creates the
    # row; afterwards a scalar is broadcast over the existing row(s) as before.
    target[name, "Accuracy"] = accuracy if len(target.index) else [accuracy]

# Gradient boosting with 1000 estimators gave the best result

In [59]:
from xgboost import XGBClassifier

In [60]:
# Boosted-tree classifier: 1000 estimators combined with a small learning rate.
model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
)

In [61]:
# Baseline: raw, unscaled features.
model_assess(
    name="xgb_no_scale",
    model=model,
    XTrain=X_train,
    XTest=X_test,
)

xgb_no_scale Accuracy: 0.8642 



In [62]:
# Same model, features passed through the standard scaler.
model_assess(
    name="xgb_std_scale",
    model=model,
    XTrain=X_train_std_scaled,
    XTest=X_test_std_scaled,
)

xgb_std_scale Accuracy: 0.8642 



In [63]:
# Same model, features passed through the max-abs scaler.
model_assess(
    name="xgb_max_abs_scale",
    model=model,
    XTrain=X_train_max_abs_scaled,
    XTest=X_test_max_abs_scaled,
)

xgb_max_abs_scale Accuracy: 0.8642 



In [64]:
# Same model, features passed through the min-max scaler.
model_assess(
    name="xgb_min_max_scale",
    model=model,
    XTrain=X_train_min_max_scaled,
    XTest=X_test_min_max_scaled,
)

xgb_min_max_scale Accuracy: 0.86453 



# The problem: there is a chance that the train and test sets contain different parts of the same song, which is probably why we are getting this very high accuracy.

In [67]:
# Re-read the raw CSV, replacing the frame whose `filename` column was
# dropped earlier in the notebook.
# NOTE(review): presumably so the file name is available again for a
# song-aware split — confirm against the cells that follow.
df = pd.read_csv("features_3_sec.csv")