In [22]:
# First will import the dataframe created previously containing the features of the bird song and some metadata for each clip
import pandas as pd
data = pd.read_csv('data_dropped')

In [23]:
birdsNIMarine = ['Common Starling', 'Eurasian Skylark', 'Eurasian Tree Sparrow', 'Northern Lapwing']
data = data[data.English_name.isin(birdsNIMarine)]
data.reset_index(inplace=True, drop=True)

In [24]:
data.English_name.value_counts()

Eurasian Skylark         1000
Eurasian Tree Sparrow    1000
Common Starling          1000
Northern Lapwing         1000
Name: English_name, dtype: int64

Data is perfectly balanced between bird species so no need to do anything with balancing here at this time. 

### Setting Labels 
The following section creates a column called label in the DataFrame and assigns a numerical value for each bird. 

This is done as some model require the labels to be supplied as numerical values. 

In [25]:
data['label'] = data.English_name

In [26]:
data.label.replace(value=0, to_replace ="Yellowhammer", inplace=True)
data.label.replace(value=1, to_replace ="Eurasian Skylark", inplace=True)
data.label.replace(value=2, to_replace ="Stock Dove", inplace=True)
data.label.replace(value=3, to_replace ="Eurasian Tree Sparrow", inplace=True)
data.label.replace(value=4, to_replace ="Common Whitethroat", inplace=True)
data.label.replace(value=5, to_replace ="Common Linnet", inplace=True)
data.label.replace(value=6, to_replace ="European Goldfinch", inplace=True)
data.label.replace(value=7, to_replace ="Common Starling", inplace=True)
data.label.replace(value=8, to_replace ="Northern Lapwing", inplace=True)
data.label.replace(value=9, to_replace ="Corn Bunting", inplace=True)
data.label.replace(value=10, to_replace ="European Turtle Dove", inplace=True)
data.label.replace(value=11, to_replace ="Grey Partridge", inplace=True)

The features and labels are then seperated out into features the model will be trained on (contained in data_num DataFrame) and labels (contained in data_labels pandas Series).

In [27]:
data_num = data.iloc[:,1:137]
data_labels = data.label

### PCA
The each clip of bird song contains 136 features, as we do not know what the important features are, principal component analysis was conducted on the features set for dimension reduction, which should retain most of the variability in the data but will reduce computational cost.

In [28]:
from sklearn.preprocessing import StandardScaler
import numpy as np

scaler = StandardScaler()

# Fit on training set only.
scaler.fit(data_num)
data_num = scaler.transform(data_num)


In [29]:
from sklearn.decomposition import PCA

pca = PCA(0.99)    # The amount of varibility from the original data to retain can be varied
pca.fit(data_num)
pca_components = pca.n_components_

In [30]:
pca_components # shows how many principal components are retained to keep to original variability

90

In [31]:
data_num = pca.transform(data_num)

### Train, Validation, Test Split

Data is then split into train, validation and test sets in proportion 60:20:20

In [32]:
for i in range(len(data_num)):
  if i == 0:
    X_train = np.array(data_num[i])  # features saved into X variables
    y_train = np.array(data_labels[i]) # labels saved into y variables
    continue
  if i == 3:
    X_valid = np.array(data_num[i])
    y_valid = np.array(data_labels[i])
    continue
  if i == 4:
    X_test = np.array(data_num[i])
    y_test = np.array(data_labels[i])
    continue
  
  if i%5 < 3:
    X_train = np.vstack((X_train, data_num[i]))
    y_train = np.vstack((y_train, data_labels[i]))

  if i%5 == 3:
    X_valid = np.vstack((X_valid, data_num[i]))
    y_valid = np.vstack((y_valid, data_labels[i]))

  if i%5 == 4:
    X_test = np.vstack((X_test, data_num[i]))
    y_test = np.vstack((y_test, data_labels[i]))

y_train, y_valid, y_test = y_train.flatten(), y_valid.flatten(), y_test.flatten() # y variables flattened as scikit learn classifiers require a one dimensional array for labels. 

In [33]:
# Results DataFrames created these will be used for evualtion of models. 

valid_results = pd.DataFrame()
valid_results["labels"] = y_valid
test_results = pd.DataFrame()
test_results["labels"] = y_test

# Models

### Support Vector Machine

In [34]:
from sklearn import svm
from time import time
training_time = {}
t0 = time()
clf = svm.SVC(gamma = 'auto', class_weight = 'balanced', kernel = 'rbf', C = 15).fit(X_train, y_train)
ttime = time() - t0
training_time['SVM'] = ttime # training time calcualted and added to training time dictionary
valid_results["SVM_preds"] = clf.predict(X_valid)
# SVM model trained on training data
# Vaidation data then ran through the model and predictions added to results DataFrame

# Accuracy taken from 
print("Accuracy of SVM on validation set:", round(100*(sum(valid_results.labels == valid_results.SVM_preds)/len(valid_results)), 2), "%")

Accuracy of SVM on validation set: 85.25 %


### Nearest Neighbour

In [35]:
from sklearn.neighbors import NearestCentroid

t0 = time()
clf = NearestCentroid().fit(X_train, y_train)
ttime = time() - t0
training_time['Nearest Neighbours'] = ttime 
valid_results["NN_preds"] = clf.predict(X_valid)

print("Accuracy of Nearest Neighbour on validation set:", round(100*(sum(valid_results.labels == valid_results.NN_preds)/len(valid_results)), 2), "%")

Accuracy of Nearest Neighbour on validation set: 48.38 %


### Decision Trees

In [36]:
from sklearn import tree

t0 = time()
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
ttime = time() - t0
training_time['Decision Tree'] = ttime
valid_results["DT_preds"] = clf.predict(X_valid)

print("Accuracy of Decision Tree on validation set:", round(100*(sum(valid_results.labels == valid_results.DT_preds)/len(valid_results)), 2), "%")

Accuracy of Decision Tree on validation set: 56.5 %


### Random Forest Classifier

In [37]:
from sklearn.ensemble import RandomForestClassifier

t0 = time()
clf = RandomForestClassifier(n_estimators = 500).fit(X_train, y_train)
ttime = time() - t0
training_time['Random Forest'] = ttime
valid_results["RF_preds"] = clf.predict(X_valid)

print("Accuracy of Random Forest on validation set:", round(100*(sum(valid_results.labels == valid_results.RF_preds)/len(valid_results)), 2), "%")

Accuracy of Random Forest on validation set: 76.75 %


### Extra Trees Classifier

In [38]:
from sklearn.ensemble import ExtraTreesClassifier

t0 = time()
clf = ExtraTreesClassifier(criterion = 'gini', max_features = 'sqrt', n_estimators=500, max_depth=None, min_samples_split=3, random_state=0).fit(X_train, y_train)
ttime = time() - t0
training_time['Extra Trees'] = ttime
valid_results["ETrees_preds"] = clf.predict(X_valid)

print("Accuracy of Extra Trees on validation set:", round(100*(sum(valid_results.labels == valid_results.ETrees_preds)/len(valid_results)), 2), "%")


Accuracy of Extra Trees on validation set: 79.12 %


# Results

Prints overall accuarcy for each model

In [39]:
print("Accuracy of SVM on validation set:", round(100*(sum(valid_results.labels == valid_results.SVM_preds)/len(valid_results)), 2), "%")
print("Accuracy of Nearest Neighbour on validation set:", round(100*(sum(valid_results.labels == valid_results.NN_preds)/len(valid_results)), 2), "%")
print("Accuracy of Decision Tree on validation set:", round(100*(sum(valid_results.labels == valid_results.DT_preds)/len(valid_results)), 2), "%")
print("Accuracy of Random Forest on validation set:", round(100*(sum(valid_results.labels == valid_results.RF_preds)/len(valid_results)), 2), "%")
print("Accuracy of Extra Trees on validation set:", round(100*(sum(valid_results.labels == valid_results.ETrees_preds)/len(valid_results)), 2), "%")

Accuracy of SVM on validation set: 85.25 %
Accuracy of Nearest Neighbour on validation set: 48.38 %
Accuracy of Decision Tree on validation set: 56.5 %
Accuracy of Random Forest on validation set: 76.75 %
Accuracy of Extra Trees on validation set: 79.12 %


Prints training time for each model

In [40]:
for model in training_time:
  print("Training time", model, "is:", round(training_time[model], 2), "seconds.")

Training time SVM is: 1.01 seconds.
Training time Nearest Neighbours is: 0.01 seconds.
Training time Decision Tree is: 0.4 seconds.
Training time Random Forest is: 9.28 seconds.
Training time Extra Trees is: 2.93 seconds.


Adding all results from validation set to DataFrame so they can be quickly and easily compared

In [41]:
import tabulate

model_comparison = pd.DataFrame()
models = ['Support Vector Machine', 'Nearest Neighbours', 'Decision Tree', 'Random Forest', 'Extra Trees']
accuracy = [round(100*(sum(valid_results.labels == valid_results.SVM_preds)/len(valid_results)), 2),
            round(100*(sum(valid_results.labels == valid_results.NN_preds)/len(valid_results)), 2),
            round(100*(sum(valid_results.labels == valid_results.DT_preds)/len(valid_results)), 2),
            round(100*(sum(valid_results.labels == valid_results.RF_preds)/len(valid_results)), 2),
            round(100*(sum(valid_results.labels == valid_results.ETrees_preds)/len(valid_results)), 2)
            ]
train_time = []
for model in training_time:
  train_time.append(round(training_time[model], 2))

model_comparison['Model'], model_comparison['Accuracy (%)'], model_comparison['Training Time (s)'] = models, accuracy, train_time
model_comparison.to_csv('model_comparison')
model_comparison

Unnamed: 0,Model,Accuracy (%),Training Time (s)
0,Support Vector Machine,85.25,1.01
1,Nearest Neighbours,48.38,0.01
2,Decision Tree,56.5,0.4
3,Random Forest,76.75,9.28
4,Extra Trees,79.12,2.93


SVM was found to return the best results.
Here is a further breakdown of the accuracy of the SVM model for each bird species. 

In [42]:
correctSVM = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0, 10:0, 11:0}
totalSVM = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0, 10:0, 11:0}

for index, row in valid_results.iterrows():
  totalSVM[row['labels']] += 1
  if row['labels'] == row['SVM_preds']:
    correctSVM[row['labels']] += 1
  
print("-"*10, "Accuracy for SVM", "-"*10)
print("Overall Accuracy:", round(100*(sum(valid_results.labels == valid_results.SVM_preds)/len(valid_results)), 2), "%")
print("")
#print("Yellowhammer accuracy:", round(100*correctSVM[0]/totalSVM[0], 1), "%")
print("Eurasian Skylark accuracy:", round(100*correctSVM[1]/totalSVM[1], 1), "%")
#print("Stock Dove accuracy:", round(100*correctSVM[2]/totalSVM[2], 1), "%")
print("Eurasian Tree Sparrow accuracy:", round(100*correctSVM[3]/totalSVM[3], 1), "%")
#print("Common Whitethroat accuracy:", round(100*correctSVM[4]/totalSVM[4], 1), "%")
#print("Common Linnet accuracy:", round(100*correctSVM[5]/totalSVM[5], 1), "%")
#print("European Goldfinch accuracy:", round(100*correctSVM[6]/totalSVM[6], 1), "%")
print("Common Starling accuracy:", round(100*correctSVM[7]/totalSVM[7], 1), "%")
print("Northern Lapwing accuracy:", round(100*correctSVM[8]/totalSVM[8], 1), "%")
#print("Corn Bunting accuracy:", round(100*correctSVM[9]/totalSVM[9], 1), "%")
#print("European Turtle Dove accuracy:", round(100*correctSVM[10]/totalSVM[10], 1), "%")
#print("Grey Partridge accuracy:", round(100*correctSVM[11]/totalSVM[11], 1), "%")

---------- Accuracy for SVM ----------
Overall Accuracy: 85.25 %

Eurasian Skylark accuracy: 80.1 %
Eurasian Tree Sparrow accuracy: 82.4 %
Common Starling accuracy: 87.4 %
Northern Lapwing accuracy: 91.3 %


## Here is the final results on the previously unseen testset data

In [43]:
correctSVM = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0, 10:0, 11:0}
totalSVM = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0, 10:0, 11:0}

test_results["SVM_preds"] = clf.predict(X_test)

for index, row in test_results.iterrows():
  totalSVM[row['labels']] += 1
  if row['labels'] == row['SVM_preds']:
    correctSVM[row['labels']] += 1
  
print("-"*10, "Accuracy for SVM Testset", "-"*10)
print("Overall Accuracy:", round(100*(sum(test_results.labels == test_results.SVM_preds)/len(test_results)), 2), "%")
print("")
#print("Yellowhammer accuracy:", round(100*correctSVM[0]/totalSVM[0], 1), "%")
print("Eurasian Skylark accuracy:", round(100*correctSVM[1]/totalSVM[1], 1), "%")
#print("Stock Dove accuracy:", round(100*correctSVM[2]/totalSVM[2], 1), "%")
print("Eurasian Tree Sparrow accuracy:", round(100*correctSVM[3]/totalSVM[3], 1), "%")
#print("Common Whitethroat accuracy:", round(100*correctSVM[4]/totalSVM[4], 1), "%")
#print("Common Linnet accuracy:", round(100*correctSVM[5]/totalSVM[5], 1), "%")
#print("European Goldfinch accuracy:", round(100*correctSVM[6]/totalSVM[6], 1), "%")
print("Common Starling accuracy:", round(100*correctSVM[7]/totalSVM[7], 1), "%")
print("Northern Lapwing accuracy:", round(100*correctSVM[8]/totalSVM[8], 1), "%")
#print("Corn Bunting accuracy:", round(100*correctSVM[9]/totalSVM[9], 1), "%")
#print("European Turtle Dove accuracy:", round(100*correctSVM[10]/totalSVM[10], 1), "%")
#print("Grey Partridge accuracy:", round(100*correctSVM[11]/totalSVM[11], 1), "%")

---------- Accuracy for SVM Testset ----------
Overall Accuracy: 80.88 %

Eurasian Skylark accuracy: 72.3 %
Eurasian Tree Sparrow accuracy: 78.1 %
Common Starling accuracy: 84.2 %
Northern Lapwing accuracy: 88.5 %
