In [63]:
import os
import time
import librosa
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import trange,tqdm


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import *
from sklearnex import patch_sklearn, config_context

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [64]:
path = os.getcwd() + "/STFT_npy_dataset/"
path

'd:\\Study\\Code\\Python\\Sound_Classification_Bee_Qeen_Queenless/STFT_npy_dataset/'

In [65]:
TRAINING_DIR = path+"/train"
VALIDATION_DIR = path + "/val"
TEST_DIR = path + "/test"

print(TRAINING_DIR)
print(VALIDATION_DIR)
print(TEST_DIR)

d:\Study\Code\Python\Sound_Classification_Bee_Qeen_Queenless/STFT_npy_dataset//train
d:\Study\Code\Python\Sound_Classification_Bee_Qeen_Queenless/STFT_npy_dataset//val
d:\Study\Code\Python\Sound_Classification_Bee_Qeen_Queenless/STFT_npy_dataset//test


In [66]:
def data_loader(path):
    X = []
    Y = []
    for folder in os.listdir(path):
        for file in os.listdir(os.path.join(path, folder)):
            feature = np.load(os.path.join(path, folder, file))
            label = folder
            X += feature.tolist()
            Y += [label]*feature.shape[0]
    return X, Y

In [67]:
X_train, Y_train = data_loader(TRAINING_DIR)
X_val, Y_val = data_loader(VALIDATION_DIR)
X_test, Y_test = data_loader(TEST_DIR)

In [68]:
print("There are {} training samples and {} training labels".format(len(X_train), len(Y_train)))
print("There are {} validation samples and {} validation labels".format(len(X_val), len(Y_val)))
print("There are {} testing samples and {} testing labels".format(len(X_test), len(Y_test)))

There are 14000 training samples and 14000 training labels
There are 2000 validation samples and 2000 validation labels
There are 4000 testing samples and 4000 testing labels


In [69]:
X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)
Y_train = np.array(Y_train)
Y_val = np.array(Y_val)
Y_test = np.array(Y_test)

print("Shape of X_train: {}".format(X_train.shape))
print("Shape of X_val: {}".format(X_val.shape))
print("Shape of X_test: {}".format(X_test.shape))
print("Shape of Y_train: {}".format(Y_train.shape))
print("Shape of Y_val: {}".format(Y_val.shape))
print("Shape of Y_test: {}".format(Y_test.shape))

Shape of X_train: (14000, 1506)
Shape of X_val: (2000, 1506)
Shape of X_test: (4000, 1506)
Shape of Y_train: (14000,)
Shape of Y_val: (2000,)
Shape of Y_test: (4000,)


In [70]:
X_test = np.vstack((X_test, X_val))
Y_test = np.hstack((Y_test, Y_val))
Y_train = Y_train.reshape((Y_train.shape[0], 1))
Y_test = Y_test.reshape((Y_test.shape[0], 1))


print("Shape of X_train: {}".format(X_train.shape))
print("Shape of X_test: {}".format(X_test.shape))
print("Shape of Y_train: {}".format(Y_train.shape))
print("Shape of Y_test: {}".format(Y_test.shape))

Shape of X_train: (14000, 1506)
Shape of X_test: (6000, 1506)
Shape of Y_train: (14000, 1)
Shape of Y_test: (6000, 1)


In [71]:
X_train = X_train.astype('float')
X_test = X_test.astype('float')

In [72]:
# shuffle_index = np.random.permutation(len(X_train))
# X_train, Y_train = X_train[shuffle_index], Y_train[shuffle_index]
# shuffle_index = np.random.permutation(len(X_test))
# X_test, Y_test = X_test[shuffle_index], Y_test[shuffle_index]

In [73]:
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(Y_train)
Y_test = label_encoder.transform(Y_test)
np.unique(Y_test)

array([0, 1])

In [74]:
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)
print(X_train.shape)
print(X_train[0:5,:])

(14000, 1506)
[[-0.30400822 -0.30666171 -0.18840707 ... -0.10915881  0.01779355
  -0.19861381]
 [-0.31615841 -0.34303248 -0.35732082 ... -0.34002899 -0.38492004
  -0.35259639]
 [-0.03307033 -0.09631587 -0.12944842 ... -0.21906953 -0.02559726
   0.13583233]
 [-0.31507581 -0.29338361 -0.28013159 ... -0.34906444 -0.33374036
  -0.3014045 ]
 [-0.38419051 -0.39317615 -0.35307367 ... -0.05273323 -0.16478354
  -0.32016134]]


In [75]:
SEED = 1337

In [76]:
classifiers = [['SVC',SVC(probability=True, random_state=SEED)],
                ['ExtraTreesClassifier', ExtraTreesClassifier(random_state=SEED)],
                ['LogisticRegression', LogisticRegression(random_state=SEED)],
              ['DecisionTreeClassifier',DecisionTreeClassifier(random_state=SEED)],
              ['KNeighborsClassifier', KNeighborsClassifier()],
              ['RandomForestClassifier', RandomForestClassifier(random_state=SEED)],
                ['XGBClassifier', XGBClassifier(random_state=SEED)]]


In [80]:
patch_sklearn()

def evaluate_classifiers(classifiers):
    models = []
    Accuracy_set = pd.DataFrame(index=None, columns=['Model','Accuracy(Train)','Accuracy(Test)','F1(Train)','F1(Test)', 'Precision(Train)','Precision(Test)', 'Recall(Train)','Recall(Test)', 'Log_loss(Train)','Log_loss(Test)', 'Train_Time(s)', 'Confusion_Matrix(Test)'])
    for i in tqdm(range(len(classifiers))):
        name = classifiers[i][0]
        model = classifiers[i][1]
        time_start = time.time()
        model.fit(X_train,Y_train)
        time_end = time.time()

        Y_train_predicted = model.predict(X_train)
        Y_test_predicited = model.predict(X_test)

        accuracy_train = accuracy_score(Y_train, Y_train_predicted)
        accuracy_test = accuracy_score(Y_test, Y_test_predicited)

        f1_Score_train = f1_score(Y_train, Y_train_predicted,average='micro')
        f1_Score_test = f1_score(Y_test, Y_test_predicited,average='micro')

        precision_score_train = precision_score(Y_train, Y_train_predicted,average='micro')
        precision_score_test = precision_score(Y_test, Y_test_predicited,average='micro')

        recall_score_train = recall_score(Y_train, Y_train_predicted,average='micro')
        recall_score_test = recall_score(Y_test, Y_test_predicited,average='micro')

        log_loss_train = log_loss(Y_train, model.predict_proba(X_train))
        log_loss_test = log_loss(Y_test, model.predict_proba(X_test))
        
        cf_matrix = confusion_matrix(Y_test, Y_test_predicited)

        train_time = time_end - time_start
        
        # store the models
        models.append((name,accuracy_test,model))

        Accuracy_set = Accuracy_set.append(pd.Series({'Model':name, 'Accuracy(Train)':accuracy_train,'Accuracy(Test)':accuracy_test,'F1(Train)':f1_Score_train,'F1(Test)':f1_Score_test,'Precision(Train)':precision_score_train,'Precision(Test)':precision_score_test,'Recall(Train)':recall_score_train,'Recall(Test)':recall_score_test,'Log_loss(Train)':log_loss_train,'Log_loss(Test)':log_loss_test,'Train_Time(s)':train_time, 'Confusion_Matrix(Test)':cf_matrix}),ignore_index=True)
    return Accuracy_set, models

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [81]:
Accuracy_set, models = evaluate_classifiers(classifiers)
Accuracy_set.sort_values(by='Accuracy(Test)').style.background_gradient(cmap= plt.cm.Blues)

  0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy(Train),Accuracy(Test),F1(Train),F1(Test),Precision(Train),Precision(Test),Recall(Train),Recall(Test),Log_loss(Train),Log_loss(Test),Train_Time(s),Confusion_Matrix(Test)
4,KNeighborsClassifier,0.743857,0.650333,0.743857,0.650333,0.743857,0.650333,0.743857,0.650333,0.453264,2.810646,0.104395,[[2553 447]  [1651 1349]]
2,LogisticRegression,0.729786,0.674333,0.729786,0.674333,0.729786,0.674333,0.729786,0.674333,0.547366,0.675444,2.318786,[[2030 970]  [ 984 2016]]
3,DecisionTreeClassifier,1.0,0.689,1.0,0.689,1.0,0.689,1.0,0.689,0.0,11.209576,38.513188,[[2094 906]  [ 960 2040]]
0,SVC,0.783929,0.754833,0.783929,0.754833,0.783929,0.754833,0.783929,0.754833,0.563277,0.5131,40.392385,[[2042 958]  [ 513 2487]]
6,XGBClassifier,1.0,0.781167,1.0,0.781167,1.0,0.781167,1.0,0.781167,0.055869,0.490019,44.909262,[[2324 676]  [ 637 2363]]
1,ExtraTreesClassifier,1.0,0.788833,1.0,0.788833,1.0,0.788833,1.0,0.788833,0.0,0.453267,11.985169,[[2202 798]  [ 469 2531]]
5,RandomForestClassifier,1.0,0.792667,1.0,0.792667,1.0,0.792667,1.0,0.792667,0.139981,0.462886,1.68416,[[2267 733]  [ 511 2489]]


In [94]:
os.chdir(os.path.join(os.getcwd()))
import pickle
for index in range(len(models)):
    model_name = models[index][0]
    model = models[index][-1]
    pickle.dump(model, open(model_name + '.h', 'wb'))