In [44]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as prp
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [45]:
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation/future warnings raised by pandas, scikit-learn
# and xgboost in the cells below.
warnings.filterwarnings("ignore")

In [46]:
# Results table: model_assess() adds one row per experiment name and stores the
# score in its "Accuracy" column.
target = pd.DataFrame()

In [47]:
# Load the feature table; each row carries a `filename`, numeric audio features
# and a `label` column.
df = pd.read_csv("features_3_sec.csv")

In [48]:
# (rows, columns) of the loaded table.
df.shape

(9990, 60)

In [49]:
# Column names available in the table.
df.columns

Index(['filename', 'length', 'chroma_stft_mean', 'chroma_stft_var', 'rms_mean',
       'rms_var', 'spectral_centroid_mean', 'spectral_centroid_var',
       'spectral_bandwidth_mean', 'spectral_bandwidth_var', 'rolloff_mean',
       'rolloff_var', 'zero_crossing_rate_mean', 'zero_crossing_rate_var',
       'harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var', 'tempo',
       'mfcc1_mean', 'mfcc1_var', 'mfcc2_mean', 'mfcc2_var', 'mfcc3_mean',
       'mfcc3_var', 'mfcc4_mean', 'mfcc4_var', 'mfcc5_mean', 'mfcc5_var',
       'mfcc6_mean', 'mfcc6_var', 'mfcc7_mean', 'mfcc7_var', 'mfcc8_mean',
       'mfcc8_var', 'mfcc9_mean', 'mfcc9_var', 'mfcc10_mean', 'mfcc10_var',
       'mfcc11_mean', 'mfcc11_var', 'mfcc12_mean', 'mfcc12_var', 'mfcc13_mean',
       'mfcc13_var', 'mfcc14_mean', 'mfcc14_var', 'mfcc15_mean', 'mfcc15_var',
       'mfcc16_mean', 'mfcc16_var', 'mfcc17_mean', 'mfcc17_var', 'mfcc18_mean',
       'mfcc18_var', 'mfcc19_mean', 'mfcc19_var', 'mfcc20_mean', 'mfcc20_var',
  

In [50]:
# Number of distinct file names; if it equals the row count, every row has its
# own file name.
len(df["filename"].unique())

9990

In [51]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [52]:
# `filename` is unique per row (see the count above), so it is removed before
# the remaining columns are used as model inputs.
df = df.drop(columns="filename")

In [53]:
# Inputs are every remaining column except the target (note that `length` is
# still included here); the string labels become integer class ids.
X = df.drop(columns="label")
encoder = prp.LabelEncoder()
y = encoder.fit_transform(df["label"])

In [54]:
# One instance of each scaler under comparison; each is fitted on the training
# split in the cells that follow.
standard_scaler, max_abs_scaler, min_max_scaler = (
    prp.StandardScaler(),
    prp.MaxAbsScaler(),
    prp.MinMaxScaler(),
)

In [55]:
# Row-level 70/30 split, stratified on the encoded label; the fixed seed makes
# the partition repeatable. Rows are split individually, not grouped by song.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [56]:
# Standardise: fit on the training split, then apply the same fitted scaler to
# the test split.
X_train_std_scaled, X_test_std_scaled = (
    standard_scaler.fit_transform(X_train),
    standard_scaler.transform(X_test),
)

In [57]:
# Max-abs scaling: fit on the training split, then apply the same fitted scaler
# to the test split.
X_train_max_abs_scaled, X_test_max_abs_scaled = (
    max_abs_scaler.fit_transform(X_train),
    max_abs_scaler.transform(X_test),
)

In [58]:
# Min-max scaling: fit on the training split, then apply the same fitted scaler
# to the test split.
X_train_min_max_scaled, X_test_min_max_scaled = (
    min_max_scaler.fit_transform(X_train),
    min_max_scaler.transform(X_test),
)

In [59]:
def model_assess(name, model, XTrain, XTest, yTrain=None, yTest=None):
    """Fit `model`, score it on the test features and record the accuracy.

    Parameters
    ----------
    name : str
        Label for this experiment; printed and used as the row index in the
        notebook-level `target` results table.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`. `fit` is called on the
        instance that is passed in.
    XTrain, XTest :
        Feature matrices used for fitting and for prediction.
    yTrain, yTest : optional
        Labels matching `XTrain` / `XTest`. When omitted, the notebook globals
        `y_train` / `y_test` are read at call time (the original behaviour),
        so they must belong to the same split as the features passed in.

    Side effects
    ------------
    Prints the accuracy (rounded to 5 decimals) and writes it to
    `target.loc[name, "Accuracy"]`. Returns None.
    """
    # Fall back to the notebook-level labels so existing four-argument calls
    # keep working unchanged.
    if yTrain is None:
        yTrain = y_train
    if yTest is None:
        yTest = y_test

    model.fit(XTrain, yTrain)
    preds = model.predict(XTest)

    # Compute the metric once and reuse it for both the printout and the table.
    accuracy = round(accuracy_score(yTest, preds), 5)
    print(f'{name} Accuracy:', accuracy, '\n')
    target.loc[name, "Accuracy"] = accuracy

# Gradient boosting with 1000 estimators gave the best result

In [60]:
from xgboost import XGBClassifier

In [61]:
# Single classifier instance shared by every model_assess() call below; each
# call invokes fit() on it again.
model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
)

In [141]:
# Baseline: unscaled features.
model_assess(
    name="xgb_no_scale",
    model=model,
    XTrain=X_train,
    XTest=X_test,
)

xgb_no_scale Accuracy: 0.8642 



In [142]:
# Same model on standardised features.
model_assess(
    name="xgb_std_scale",
    model=model,
    XTrain=X_train_std_scaled,
    XTest=X_test_std_scaled,
)

xgb_std_scale Accuracy: 0.8642 



In [143]:
# Same model on max-abs scaled features.
model_assess(
    name="xgb_max_abs_scale",
    model=model,
    XTrain=X_train_max_abs_scaled,
    XTest=X_test_max_abs_scaled,
)

xgb_max_abs_scale Accuracy: 0.8642 



In [144]:
# Same model on min-max scaled features.
model_assess(
    name="xgb_min_max_scale",
    model=model,
    XTrain=X_train_min_max_scaled,
    XTest=X_test_min_max_scaled,
)

xgb_min_max_scale Accuracy: 0.86453 



In [145]:
# Accuracy recorded so far, one row per experiment name.
target

Unnamed: 0,Accuracy
xgb_no_scale,0.8642
xgb_std_scale,0.8642
xgb_max_abs_scale,0.8642
xgb_min_max_scale,0.86453


# The problem: with a row-level split, the train and test sets can contain different segments of the same song. That leakage is probably why the accuracy above is so high.

In [62]:
# Reload the raw table: `filename` was dropped from the earlier frame and is
# needed again to group segments by song.
df = pd.read_csv("features_3_sec.csv")

In [63]:
# Drop `length` from the features. The reason is not recorded; in the rows
# shown by df.head() it has the same value (66149) for every segment, so it
# presumably carries no signal — TODO confirm it is constant across the file.
df = df.drop(columns="length")

In [64]:
# Collapse each segment file name to its song id by removing the trailing
# ".<segment>.wav" (e.g. "blues.00000.0.wav" -> "blues.00000"). Anchoring the
# pattern at the end of the string keeps this cell idempotent: a fixed `[:-6]`
# slice would cut into the song id if the cell were run a second time, and
# would also break for a multi-digit segment index.
df["filename"] = df["filename"].str.replace(r"\.\d+\.wav$", "", regex=True)

In [65]:
# Number of distinct song ids left after stripping the segment suffix.
len(df["filename"].unique())

1000

In [66]:
# Spot-check the song id stored under index label 9989.
df["filename"][9989]

'rock.00099'

In [67]:
# Hold out the last 200 rows of every label as the test set. The checks in the
# following cells show this yields 200 distinct songs with 10 rows each, i.e.
# the last 20 songs per label, so held-out songs are taken whole.
# The per-label slices are collected first and concatenated once: seeding
# pd.concat with an empty DataFrame is deprecated in pandas and can leave the
# numeric columns with object dtype.
holdout_parts = [
    df[df["label"] == label].iloc[-200:]
    for label in df["label"].unique()
]
test = pd.concat(holdout_parts, ignore_index=True)

test.shape


(2000, 59)

In [68]:
# Number of distinct song ids in the hold-out set.
len(test["filename"].unique())

200

In [69]:
# Rows per held-out song id; a uniform count means no song was cut part-way by
# the row-based slice.
test["filename"].value_counts()

filename
blues.00080    10
metal.00097    10
metal.00087    10
metal.00088    10
metal.00089    10
               ..
disco.00089    10
disco.00090    10
disco.00091    10
disco.00092    10
rock.00099     10
Name: count, Length: 200, dtype: int64

In [70]:
# Rows per label in the hold-out set.
test["label"].value_counts()

label
blues        200
classical    200
country      200
disco        200
hiphop       200
jazz         200
metal        200
pop          200
reggae       200
rock         200
Name: count, dtype: int64

In [71]:
import pandas as pd

# Training set = every row whose song is not in the hold-out set.
# `filename` holds the song id at this point (the segment suffix was stripped
# earlier), so filtering on it keeps all rows of a song on the same side.
# This replaces an anti-join that merged `df` with `test` on all 59 columns,
# most of them floats, which is slow and relies on exact float equality.
holdout_songs = set(test["filename"])
train_df = df[~df["filename"].isin(holdout_songs)].reset_index(drop=True)

# Guard against leakage: no song id may appear on both sides of the split.
assert holdout_songs.isdisjoint(train_df["filename"])

train_df.shape


(7990, 59)

In [72]:
# Training matrix: drop the song id and the target, keep every other column.
# A fresh encoder is fitted on the training labels.
X_train = train_df.drop(columns=["filename", "label"])
encoder = prp.LabelEncoder()
y_train = encoder.fit_transform(train_df["label"])

In [73]:
# Hold-out features and labels. `encoder` was fitted on the training labels in
# the preceding cell, so only `transform` is applied here. Re-fitting on the
# test labels would rebuild the label -> integer mapping from the test set and
# could assign different codes if the two label sets ever differ; `transform`
# raises instead when it meets a label it has not seen.
X_test = test.drop(columns=["filename", "label"])
y_test = encoder.transform(test["label"])

In [74]:
# Re-create the min-max scaler and fit it on the song-level training split;
# the hold-out set is transformed with that same fitted scaler.
min_max_scaler = prp.MinMaxScaler()
X_train_min_max_scaled, X_test_min_max_scaled = (
    min_max_scaler.fit_transform(X_train),
    min_max_scaler.transform(X_test),
)

In [75]:
# Peek at the encoded hold-out labels.
y_test

array([0, 0, 0, ..., 9, 9, 9])

In [76]:
# Song-level split, unscaled features.
model_assess(
    name="xgb_no_scale_no_leak",
    model=model,
    XTrain=X_train,
    XTest=X_test,
)

xgb_no_scale_no_leak Accuracy: 0.4725 



In [None]:
# Song-level split, min-max scaled features.
model_assess(
    name="xgb_min_max_scale_no_leak",
    model=model,
    XTrain=X_train_min_max_scaled,
    XTest=X_test_min_max_scaled,
)

In [None]:
# Results table with every experiment recorded so far.
target