# Spherical K-Means  

## Bird-song code

In [14]:
import numpy as np
import pandas as pd

import os
import glob

import librosa
import librosa.display
from skimage.io import imread
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm


In [11]:
features = pd.read_csv('features_filtered.csv', encoding='latin')
features.head()

Unnamed: 0,id,gen,sp,ssp,en,cnt,loc,type,q,length,bird-seen,file
0,564895,Thryothorus,ludovicianus,,Carolina Wren,United States,"Lyons Township (near Willow Springs), Cook Co...","adult, sex uncertain, song",no score,0:16,no,//www.xeno-canto.org/564895/download
1,545775,Thryothorus,ludovicianus,,Carolina Wren,United States,"Valley Forge National Historical Park, Montgom...",call,no score,0:17,yes,//www.xeno-canto.org/545775/download
2,540857,Thryothorus,ludovicianus,,Carolina Wren,United States,"Slaterville Rd @ Honness (near Ithaca), Tompk...",song,no score,0:32,unknown,//www.xeno-canto.org/540857/download
3,540855,Thryothorus,ludovicianus,,Carolina Wren,United States,"Slaterville Rd @ Honness (near Ithaca), Tompk...",song,no score,0:52,unknown,//www.xeno-canto.org/540855/download
4,539275,Thryothorus,ludovicianus,,Carolina Wren,United States,Upper St. Clair Township (near Upper Saint Cl...,"call, song",no score,1:12,yes,//www.xeno-canto.org/539275/download


In [77]:
# We only need the id and the labels.
dataset = features[features.en.isin(selected_species)][['id', 'en']]

In [78]:
def load_images(dataset, gray=True):
    img_df = []
    for img_name in tqdm(dataset['id']):
        # defining the image path
        image_path = 'images/mel_spectrograms_8sec/' + str(img_name) + '.jpg'
        # reading the image
        img = imread(image_path)
#         # normalizing the pixel values
#         img /= 255.0
        # converting the type of pixel to float 32
        img = img.astype('float32')
        # appending the image into the list
        img_df.append(img.flatten())
    
    # converting the list to numpy array
    return np.array(img_df)

In [79]:
X, y = load_images(dataset, gray=True), dataset['en'].values
X.shape, y.shape

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2516.0), HTML(value='')))




((2516, 132480), (2516,))

# Using coclustering library and scikit-learn's PCA

In [None]:
from coclust.clustering.spherical_kmeans import SphericalKmeans
from scipy import sparse
from sklearn.decomposition import PCA

In [138]:
# PCA functions
def pca_reduction(parameter):
    '''Parameter should be percentage variance retained (e.g. 0.95) or number of components (e.g. 2)''' 
    pca_model = PCA(parameter)
    X_proj = pca_model.fit_transform(X)
    return pca_model, X_proj

def inv_transform(pca_model, reduced_X):
    '''Reconstructs the images using the reduced dataset using inverse transform to convert images back to their
    original dimension. Note that we are not reverting back to the original data, we're going back to the actual dimension
    of the original images so that we can visualize them. 
    ''' 
    X_inv_proj = pca_model.inverse_transform(reduced_X) 
    #reshaping as 2516 images of 128*1035 dimension 
    X_proj_img = np.reshape(X_inv_proj,(2516 , 128, 1035))
    return X_proj_img

def images_plot(images, n_dim, cmap):
    '''Dimensions are of shape (128*345*3) or (128*1035)'''
    if n_dim == 3:
        fig, axes = plt.subplots(5, 4, figsize=(15, 8),
                             subplot_kw={'xticks':[], 'yticks':[]},
                             gridspec_kw=dict(hspace=0.1, wspace=0.1))
        for i, ax in enumerate(axes.flat):
            ax.imshow(images[i].reshape(128, 345, 3), cmap=cmap)
    elif n_dim == 2:
        fig, axes = plt.subplots(5, 2, figsize=(20, 8),
                             subplot_kw={'xticks':[], 'yticks':[]},
                             gridspec_kw=dict(hspace=0.1, wspace=0.1))
        for i, ax in enumerate(axes.flat):
            ax.imshow(images[i].reshape(128, 1035), cmap=cmap)
    else:
        print('not enough or too many dimensions')

In [139]:
pca_model, X_proj = pca_reduction(0.98)
X_proj.shape

(2516, 1873)

In [140]:
X_transform = pca_model.inverse_transform(X_proj)

In [154]:
skm = SphericalKmeans(n_clusters=3, weighting=False) #weighting parameter is used for text documents and will apply TFIDF transform to matrix if set to True

In [155]:
skm.fit(sparse.csr_matrix(X))

 == New init == 
iteration: 0
2348.6596117826434
iteration: 1
2356.15912384467
iteration: 2
2358.5629991068417
iteration: 3
2359.1307430684633
iteration: 4
2359.355673620574
iteration: 5
2359.50280336586
iteration: 6
2359.6041268306076
iteration: 7
2359.704380136085
iteration: 8
2359.839511669731
iteration: 9
2359.920745143813
iteration: 10
2359.9728849370986
iteration: 11
2360.0129349106064
iteration: 12
2360.036333159087
iteration: 13
2360.0530135910417
iteration: 14
2360.0663372917807
iteration: 15
2360.070698419968
iteration: 16
2360.0724062592963
iteration: 17
2360.075476574593
iteration: 18
2360.0769030886486
iteration: 19
2360.080898182018


In [149]:
def cluster_distribution(model):
    return set([(label, model.labels_.count(label)) for label in model.labels_])

def cluster_birds_distribution(model, cluster):
    cluster = [elem[0] for elem in enumerate(skm.labels_) if elem[1] == cluster]
    birds_in_c = [elem[1] for elem in enumerate(y) if elem[0] in cluster]
    o_birds = sorted(list(set([(bird, birds_in_c.count(bird)) for bird in birds_in_c])), key=lambda x: x[1], reverse=True)
    for label in range(6):
        print(f'{label+1}. {o_birds[label][0]} - {o_birds[label][1]}')

In [156]:
indexes = [elem[0] for elem in enumerate(skm.labels_) if elem[1] == 0]
features.iloc[indexes].type.value_counts()[:5]

song           279
call           136
call, song      53
male, song      34
flight call     30
Name: type, dtype: int64

In [158]:
indexes = [elem[0] for elem in enumerate(skm.labels_) if elem[1] == 1]
features.iloc[indexes].type.value_counts()[:5]

song                 63
call                 54
flight call          26
call, song           17
call, flight call    16
Name: type, dtype: int64

In [159]:
for cluster in range(3):
    print(f'Cluster {cluster}: {list(cluster_distribution(skm))[cluster][1]} records')
    print(cluster_birds_distribution(skm, cluster))
    

Cluster 0: 1391 records
1. Northern Cardinal - 211
2. Carolina Wren - 195
3. Red Crossbill - 127
4. Red-winged Blackbird - 124
5. Common Yellowthroat - 89
6. Spotted Towhee - 85
None
Cluster 1: 294 records
1. Red Crossbill - 74
2. Red-winged Blackbird - 58
3. Spotted Towhee - 53
4. Northern Cardinal - 37
5. Common Yellowthroat - 37
6. Carolina Wren - 35
None
Cluster 2: 831 records
1. Spotted Towhee - 368
2. Red-winged Blackbird - 254
3. Common Yellowthroat - 228
4. Northern Cardinal - 186
5. Red Crossbill - 183
6. Carolina Wren - 172
None


In [162]:
set([(label, skm.labels_.count(label)) for label in skm.labels_])

{(0, 831), (1, 294), (2, 1391)}

<coclust.clustering.spherical_kmeans.SphericalKmeans at 0x2022594d748>