In [0]:
import pandas as pd
import numpy as np
import re

In [0]:
from sklearn import  preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


In [0]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU,Bidirectional
from tensorflow.keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D
from tensorflow.keras.layers import Activation,Flatten,Dropout

from tensorflow.keras.models import Sequential,model_from_json
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU,Bidirectional,Dropout,Conv2D,Flatten,Conv1D,GlobalAveragePooling1D
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [0]:
from google.colab import drive

# Colab-only step: attach Google Drive so the dataset files can be read.
# force_remount=True is passed so the mount is redone even if one already exists.
gdrive_mount_point = "/content/gdrive"
drive.mount(gdrive_mount_point, force_remount=True)

In [0]:
# Dataset folder on the mounted drive. The path is relative, so it resolves
# against the current working directory.
drive_directory = "gdrive/My Drive/freesound-audio-tagging/"

# Original training table read from train.csv in that folder.
train_csv_path = drive_directory + 'train.csv'
df_train_og = pd.read_csv(train_csv_path)

## Build the file name to label lookup from the original train.csv

In [0]:
# Lookup table: file name ('fname') -> label ('label') from the original training table.
file_to_tag = df_train_og.set_index('fname')['label'].to_dict()

In [0]:
def getTag(x):
    """Return the label stored in ``file_to_tag`` for file name ``x``.

    Raises ``KeyError`` when ``x`` has no entry in the lookup table.
    """
    tag = file_to_tag[x]
    return tag

## Import Dataset

In [0]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# feature files that come from a trusted source.
# Each file is opened in a context manager so its handle is closed as soon as the
# object has been loaded.
with open(drive_directory+"train_tab_feats.pkl","rb") as pickle_in:
    df_train = pickle.load(pickle_in)

with open(drive_directory+"test_tab_feats.pkl","rb") as pickle_in:
    df_test = pickle.load(pickle_in)

In [0]:
# Stack the train and test objects row-wise under a fresh 0..n-1 index.
total = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=True)

#### TODO: Need a usable test file

In [0]:
#total['tag'] = total['fname'].apply(getTag)

In [0]:
# Look up each training file's label by file name and store it in a 'tag' column.
df_train['tag'] = df_train['fname'].map(getTag)

In [0]:
# Feature columns only: a new frame without the file-name and label columns.
df_train_copy = df_train.drop(columns=['fname', 'tag'])

## Reduce Dimensions and Make Train Val Sets

In [0]:
# Project the features with LDA, supervised by the class tags.
# NOTE(review): LDA is fitted on every labelled row before the split below, so the
# validation rows influence the projection and validation scores may be optimistic.
LDA = LinearDiscriminantAnalysis()
X = LDA.fit_transform(df_train_copy, df_train['tag'])

x_train, x_val, y_train, y_val = train_test_split(X,  df_train['tag'], shuffle = True, test_size = 0.2, random_state = 42)

# Fit the label encoder once, on the complete tag column, and only *transform* the
# two splits. Re-fitting on y_val would rebuild the label -> id mapping from the
# validation labels alone, which disagrees with the training ids whenever a class
# is missing from the validation split.
encoder = preprocessing.LabelEncoder()
encoder.fit(df_train['tag'])
y_train = encoder.transform(y_train)
y_val = encoder.transform(y_val)

## SGD Linear Model

In [0]:
# Linear classifier trained with stochastic gradient descent on the training split.
# random_state fixes the seed used for shuffling the data, so re-running the
# notebook reproduces the same model and report (42 matches the split seed).
SGD = SGDClassifier(random_state=42)
SGD.fit(x_train,y_train)
y_pred = SGD.predict(x_val)


In [0]:
# Row labels for the report table: the encoder's class names followed by the
# three summary rows of the classification report.
new_index = [*encoder.classes_, 'accuracy', 'macro avg', 'weighted avg']

In [0]:
# Classification report as a dict, turned into a table with one row per report entry.
report = classification_report(y_val, y_pred, output_dict=True)
df_sgd_first = pd.DataFrame(report).T
df_sgd_first.index = new_index  # replace the row labels with the names built in new_index

In [0]:
# Display the SGD classification report table.
df_sgd_first

## SVM

In [0]:
# Support-vector classifier with default constructor arguments.
# fit() returns the estimator itself, so construction and training can be chained.
svm = SVC().fit(x_train, y_train)
y_pred = svm.predict(x_val)

In [0]:
# Row labels for the report table: class names first, then the summary rows.
summary_rows = ['accuracy', 'macro avg', 'weighted avg']
new_index = list(encoder.classes_) + summary_rows

In [0]:
# Default-SVM classification report as a table, one row per report entry.
report = classification_report(y_val, y_pred, output_dict=True)
df_svm_first = pd.DataFrame(report).T
df_svm_first.index = new_index  # replace the row labels with the names built in new_index

In [0]:
# Display the default-SVM classification report table.
df_svm_first

## Grid Search On SVM

In [0]:
def SVC_GridSearch(X, Y, X_test, Y_test):
    """Grid-search SVC hyper-parameters and score the best estimator on held-out data.

    Parameters
    ----------
    X, Y
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test, Y_test
        Held-out features and labels used for the final accuracy.

    Returns
    -------
    best_parameters : dict
        ``get_params()`` of the best estimator.
    accuracy : float
        ``best_score_`` of the grid search.
    test_accuracy : float
        Accuracy of the best estimator's predictions on ``X_test`` against ``Y_test``.
    """
    svc = SVC()
    parameters = {
        'C': (0.5, 1, 2),
        'kernel': ('rbf', 'linear', 'poly', 'sigmoid'),
        'shrinking': (True, False),
        # SVC accepts only 'ovo' (one-vs-one) or 'ovr' (one-vs-rest) here.
        'decision_function_shape': ('ovo', 'ovr'),
    }
    grid_search = GridSearchCV(svc, parameters, n_jobs=-1, verbose=0)
    grid_search.fit(X, Y)
    accuracy = grid_search.best_score_
    classifier = grid_search.best_estimator_
    best_parameters = classifier.get_params()
    y_pred = classifier.predict(X_test)
    # accuracy_score takes the true labels first, then the predictions.
    test_accuracy = accuracy_score(Y_test, y_pred)
    return best_parameters, accuracy, test_accuracy

In [0]:
# Search on the training split; score the best estimator on the validation split.
best_parameters, accuracy, test_accuracy = SVC_GridSearch(
    X=x_train, Y=y_train, X_test=x_val, Y_test=y_val
)

In [0]:
# New SVC configured with the parameter dict returned by the grid search.
bestSVM = SVC(**best_parameters)
bestSVM


In [0]:
# Train the tuned SVM on the training split, then predict the validation split.
# fit() returns the estimator, which allows the chained predict call.
y_pred = bestSVM.fit(x_train, y_train).predict(x_val)

In [0]:
# Row labels for the report table: class names, then the three summary rows.
new_index = list(encoder.classes_)
new_index.extend(['accuracy', 'macro avg', 'weighted avg'])

In [0]:
# Tuned-SVM classification report as a table, one row per report entry.
report = classification_report(y_val, y_pred, output_dict=True)
df_svm = pd.DataFrame(report).T
df_svm.index = new_index  # replace the row labels with the names built in new_index

In [0]:
# Display the tuned-SVM classification report table.
df_svm

## Vanilla Neural Network

In [0]:
# Network dimensions.
input_size = 40    # number of inputs (not referenced when the model is built below)
output_size = 41   # number of units in the softmax output layer

# Width shared by all three hidden layers.
hidden_layer_size = 100

# Model: three identical Dense(relu) + Dropout(0.2) stages, then a softmax output layer.
# Each Dense layer computes activation(dot(input, weights) + bias).
model = tf.keras.Sequential()
for hidden_layer_number in range(3):
    model.add(Dense(hidden_layer_size, activation='relu'))
    model.add(Dropout(0.2))
model.add(Dense(output_size, activation='softmax'))

# Adam optimizer; sparse categorical cross-entropy takes integer class ids as
# targets; accuracy is tracked as the reported metric.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training settings.
batch_size = 128   # samples per gradient update
max_epochs = 100   # upper bound on epochs; early stopping may end training sooner

# Stop once the monitored quantity (val_loss by default) has gone 2 epochs
# without improving.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# Fit on the training split, evaluating on the validation split after each epoch.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=max_epochs,
    callbacks=[early_stopping],
    validation_data=(x_val, y_val),
    verbose=1,  # print progress for every epoch
)

In [0]:
# Predicted class id per row = index of the largest value in the softmax output.
class_probabilities = model.predict(x_val)
y_pred = class_probabilities.argmax(axis=1)

# Text-form classification report for the neural network.
vanilla_nn = classification_report(y_val, y_pred)
print(vanilla_nn)