In [1]:
import numpy as np
import pandas as pd

from pydataset import data

from sklearn.cluster import KMeans, SpectralClustering

# Load the "animals" dataset shipped with pydataset; the rest of the notebook works on this frame.
# NOTE(review): later cells address rows by short labels such as 'fro' and 'lio' --
# presumably abbreviated animal names used as the index; confirm against the dataset docs.
df = data("animals")

Salmon live in groups when young, but as adults they tend to be alone.

In [2]:
# Values set by hand after some quick web research:
#   - frogs tend to live in groups
#   - lions are not endangered
#   - lobsters do not live in groups
#   - salmon/salamanders do live in groups
#   - spiders are not endangered
# Going by those notes, the columns appear to be coded 1 = no, 2 = yes.
# NOTE(review): presumably these cells are missing in the raw data -- confirm.
manual_fills = [
    ('fro', 'gro', 2),
    ('lio', 'end', 1),
    ('lob', 'gro', 1),
    ('sal', 'gro', 2),
    ('spi', 'end', 1),
]

# Assigning through .loc[row, column] writes directly into df and avoids the
# 'SettingWithCopy' warning that chained indexing triggers.
# https://www.dataquest.io/blog/settingwithcopywarning/
for animal, trait, value in manual_fills:
    df.loc[animal, trait] = value

In [3]:
# Per-column summary statistics. describe() counts only non-null entries, so a
# count equal to the number of rows in every column means no missing values remain.
df.describe()

Unnamed: 0,war,fly,ver,end,gro,hai
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,1.5,1.2,1.7,1.3,1.65,1.45
std,0.512989,0.410391,0.470162,0.470162,0.48936,0.510418
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,1.0
50%,1.5,1.0,2.0,1.0,2.0,1.0
75%,2.0,1.0,2.0,2.0,2.0,2.0
max,2.0,2.0,2.0,2.0,2.0,2.0


In [10]:
best_cluster = None
best_inertia = float("inf")
# Fit KMeans for every feasible cluster count (1 up to the number of rows) and
# report the inertia of each fit, remembering the lowest one.
# The upper bound comes from len(df) rather than a literal, because KMeans
# cannot be asked for more clusters than there are samples.
for i in range(1, len(df) + 1):
    # A fixed random_state pins the centroid initialisation so that the printed
    # inertias (and the selected cluster count) are reproducible on re-run.
    clus = KMeans(n_clusters=i, random_state=0)
    clus.fit(df)
    inert = clus.inertia_
    print("{} clusters yield {}".format(i, inert))
    # Strict '<' keeps the SMALLEST cluster count that reaches the minimum
    # inertia; later counts that merely tie (e.g. at 0.0) do not replace it.
    if inert < best_inertia:
        best_cluster = i
        best_inertia = inert
print("Best # of clusters {} yielded inertia of {}".format(best_cluster, best_inertia))

1 clusters yield 26.099999999999998
2 clusters yield 17.582417582417584
3 clusters yield 13.700000000000003
4 clusters yield 10.95
5 clusters yield 8.916666666666668
6 clusters yield 7.249999999999999
7 clusters yield 6.416666666666666
8 clusters yield 4.883333333333334
9 clusters yield 3.416666666666667
10 clusters yield 2.5833333333333335
11 clusters yield 1.9166666666666667
12 clusters yield 1.1666666666666667
13 clusters yield 0.5
14 clusters yield 0.0
15 clusters yield 0.0
16 clusters yield 0.0
17 clusters yield 0.0
18 clusters yield 0.0
19 clusters yield 0.0
20 clusters yield 0.0
Best # of clusters 14 yielded inertia of 0.0


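Is this what we wanted to see? Not really. Inertia is the sum of squared distances from each sample to its closest cluster center, so adding clusters can only lower it, and it reaches 0 as soon as every distinct row has its own cluster (14 clusters here). Picking the minimum inertia therefore just picks "as many clusters as possible"; it does not tell us how many clusters are actually meaningful.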

### I'm not sure how to read SpectralClustering to know how good it is.

In [23]:
# Spectral clustering into two groups using a nearest-neighbours affinity graph.
# A fixed random_state keeps the resulting labels reproducible across kernel restarts.
sp_clus = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', random_state=0)
# fit_predict() fits the model and returns the cluster label of every row, so a
# separate fit() call beforehand would only repeat the same work.
sp_clus.fit_predict(df)

array([1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0])

In [24]:
# Show the affinity matrix built during fitting; .toarray() densifies it for display
# (the call implies it is stored as a sparse matrix).
# NOTE(review): this prints the full sample-by-sample matrix -- fine for 20 rows,
# but consider slicing it for anything larger.
sp_clus.affinity_matrix_.toarray()

array([[ 1. ,  1. ,  0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ,  1. ,
         1. ,  0. ,  1. ,  1. ,  0. ,  0. ,  1. ,  1. ,  0. ],
       [ 1. ,  1. ,  0. ,  1. ,  0. ,  0.5,  0. ,  0. ,  0. ,  1. ,  0. ,
         0. ,  0.5,  0. ,  0.5,  0. ,  0.5,  0.5,  1. ,  0. ],
       [ 0. ,  0. ,  1. ,  1. ,  1. ,  1. ,  0. ,  0.5,  0. ,  0. ,  0. ,
         0. ,  1. ,  1. ,  1. ,  1. ,  1. ,  0. ,  1. ,  0. ],
       [ 1. ,  1. ,  1. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ,  0. ,
         0.5,  0.5,  1. ,  1. ,  0. ,  0. ,  0. ,  1. ,  0. ],
       [ 0. ,  0. ,  1. ,  0. ,  1. ,  1. ,  0. ,  0.5,  1. ,  0. ,  1. ,
         0.5,  1. ,  0. ,  0. ,  1. ,  1. ,  0. ,  0. ,  1. ],
       [ 0. ,  0.5,  1. ,  0. ,  1. ,  1. ,  0.5,  0. ,  1. ,  0. ,  0. ,
         0.5,  1. ,  0. ,  0. ,  1. ,  1. ,  0.5,  0. ,  1. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0.5,  1. ,  1. ,  1. ,  0. ,  0. ,
         0.5,  0.5,  0.5,  0. ,  0. ,  0.5,  0.5,  0. ,  1. ],
       [ 0. ,  0. ,  0.5,  0. ,  0.5,  0.