# Getting the outputs 

In [225]:
from keras.applications import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from keras import Model
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input as prep_res
import numpy as np
import pandas as pd
import os

In [391]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [90]:
def list_images(directory):
    """
    Get all the images and labels in directory/label/*.jpg
    """
    labels = os.listdir(directory)
    # Sort the labels so that training and validation get them in the same order
    labels.sort()

    files_and_labels = []
    for label in labels:
        for f in os.listdir(os.path.join(directory, label)):
            files_and_labels.append((os.path.join(directory, label, f), label))

    filenames, labels = zip(*files_and_labels)
    filenames = list(filenames)
    labels = list(labels)
    unique_labels = list(set(labels))

    label_to_int = {}
    for i, label in enumerate(unique_labels):
        label_to_int[label] = i

    labels = [label_to_int[l] for l in labels]

    return filenames, labels
filenames, labels = list_images('pics')
labels = np.array(labels)
# {1:'Tomato', 0:'Watermelon'}

In [91]:
print(labels[1], labels[-1])

1 0


In [92]:
# in labels first are 1s then zeros
# lets switch 
labels[labels == 1] = 2
labels[labels == 0] = 1
labels[labels == 2] = 0

# ResNet

In [224]:
from keras.applications import ResNet50
res = ResNet50(include_top=False, pooling = 'max')

In [201]:
res.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, None, None, 3 0           input_3[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, None, None, 6 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, None, None, 6 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

In [228]:
img_path = filenames[0]
img = image.load_img(img_path, target_size=(299, 299))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = prep_res(x)
output = v3_model.predict(x)
df_res = pd.DataFrame.from_records(output)

In [229]:
df_res.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,36.265331,27.80975,52.319809,56.731041,63.273926,76.603882,18.907892,83.929825,1.826793,65.99752,...,97.668381,18.480446,0.0,58.693497,13.709708,102.132011,52.732021,0.0,49.176529,14.295607


In [230]:
%%time
for i in range(1,len(filenames)):
    if i % 50 == 0:
        print(i, end = ' ')
    img_path = filenames[i]
    img = image.load_img(img_path, target_size=(299, 299))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = prep_res(x)
    output = res.predict(x)
    df_res = pd.concat([df_res, pd.DataFrame.from_records(output)],copy=False)    

50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 2050 2100 2150 2200 2250 2300 2350 2400 2450 2500 2550 CPU times: user 4h 18min 6s, sys: 5min 59s, total: 4h 24min 6s
Wall time: 1h 20min 53s


In [264]:
df_res.to_csv('resnet_all.csv', index=False)

## Kmeans

In [234]:
y_res = KMeans(n_clusters=2, max_iter = 500,).fit_predict(df_res)
print(accuracy_score(y_res, labels))
confusion_matrix(y_res, labels)

0.7334622823984526


array([[1105,  562],
       [ 127,  791]])

## Aglomerative

In [360]:
ag_res = AgglomerativeClustering(n_clusters=2)
ag_res.fit(df_res)
print(1-accuracy_score(ag_res.labels_, labels))
confusion_matrix(ag_res.labels_, labels)

0.7133462282398453


array([[ 133,  745],
       [1099,  608]])

## Mean shift puts everything in one class

In [267]:
bandwidth_r = estimate_bandwidth(df_res, quantile=0.5)
ms_res = MeanShift(bandwidth=bandwidth_r, bin_seeding=True)
ms_res.fit(df)

MeanShift(bandwidth=262.3235610003285, bin_seeding=True, cluster_all=True,
     min_bin_freq=1, n_jobs=1, seeds=None)

In [268]:
np.unique(ms.labels_)

array([0])

## DBSCAN tried different models but puts everything in one class

In [237]:
from sklearn.cluster import DBSCAN
epsis = [0.01, 0.2, 0.5, 1, 2, 5]
samples = [5, 10, 20, 50, 100]
models_res = []
for i in epsis:
    for j in samples:
        db = DBSCAN(eps=i, min_samples=j).fit(df_res)
        if len(np.unique(db.labels_)) > 1:
            models_res.append(db)

In [344]:
models_res

[]

## HDBSCAN 

### I found parameters which create 4 clusters, two of which are very small
### and it gave 70% accuracy

In [252]:
import hdbscan
min_clust = [3, 5, 7, 9, 10]
min_samps = [3, 4, 5]
hd_res = []
for i in min_clust:
    for j in min_samps:   
        hdb = hdbscan.HDBSCAN(min_cluster_size = i, min_samples = j).fit(df_res)
        if len(np.unique(hdb.labels_)) > 1:
            hd_res.append(hdb)

In [353]:
hdb_labels = labels.copy()

In [354]:
hdb_labels[hdb_labels == 1] = -1
hdb_labels[hdb_labels == 0] = 1

In [355]:
print(accuracy_score(hdb_labels, hd_res[4].labels_))

0.6947775628626692


In [321]:
hd_res[4]

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(cachedir=None),
    metric='euclidean', min_cluster_size=5, min_samples=4, p=None,
    prediction_data=False)

## PCA

In [373]:
pc = PCA(.90)
pc.fit(df_res)

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [374]:
df_res_pc = pc.transform(df_res)

## Kmeans with pca

In [376]:
y_res_pc = KMeans(n_clusters=2, max_iter = 500,).fit_predict(df_res_pc)
print(accuracy_score(y_res_pc, labels))
confusion_matrix(y_res_pc, labels)

0.73036750483559


array([[1105,  570],
       [ 127,  783]])

## Agglomerative with PCA gave highest 0.8 score

In [375]:
ag_res_pc = AgglomerativeClustering(n_clusters=2)
ag_res_pc.fit(df_res_pc)
print(1-accuracy_score(ag_res_pc.labels_, labels))
confusion_matrix(ag_res_pc.labels_, labels)

0.8


array([[ 237, 1073],
       [ 995,  280]])

## HDBSCAN with pca

In [332]:
import hdbscan
min_clust = [3, 5, 7, 9, 10]
min_samps = [3, 4, 5]
hd_res_pca = []
for i in min_clust:
    for j in min_samps:   
        hdb = hdbscan.HDBSCAN(min_cluster_size = i, min_samples = j).fit(df_res_pc)
        if len(np.unique(hdb.labels_)) > 1:
            hd_res_pca.append(hdb)

In [339]:
for i in hd_res_pca:
    a = i.labels_
    a = pd.Series(a)
    print(a.value_counts())

 1    2411
-1     168
 2       3
 0       3
dtype: int64
 3    1314
-1    1253
 0       7
 1       5
 4       3
 2       3
dtype: int64
 1    1576
-1     998
 0       6
 2       5
dtype: int64
 2    1570
-1     997
 1       7
 0       6
 3       5
dtype: int64
 1    1495
-1    1078
 0       7
 2       5
dtype: int64
 1    1576
-1     998
 0       6
 2       5
dtype: int64
 1    1677
-1     901
 0       7
dtype: int64
 0    1616
-1     962
 1       7
dtype: int64
-1    2302
 0     276
 1       7
dtype: int64
-1    2355
 1     205
 2      16
 0       9
dtype: int64
-1    2496
 0      74
 1      15
dtype: int64
-1    2500
 0      70
 1      15
dtype: int64
-1    2364
 0     205
 1      16
dtype: int64
-1    2496
 0      74
 1      15
dtype: int64
-1    2500
 0      70
 1      15
dtype: int64


In [343]:
#Second model seems good
res_pca_lab = labels.copy()
res_pca_lab[res_pca_lab == 0] = 3
res_pca_lab[res_pca_lab == 1] = -1
print(accuracy_score(res_pca_lab, hd_res_pca[1].labels_))

0.6889748549323017


# Inception

## Getting the model and creating the dataframe

In [381]:
v3_model = InceptionV3(weights = 'imagenet', include_top=False,
                  input_shape=(299, 299, 3), pooling = 'max')

In [382]:
v3_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
conv2d_95 (Conv2D)              (None, 149, 149, 32) 864         input_3[0][0]                    
__________________________________________________________________________________________________
batch_normalization_95 (BatchNo (None, 149, 149, 32) 96          conv2d_95[0][0]                  
__________________________________________________________________________________________________
activation_144 (Activation)     (None, 149, 149, 32) 0           batch_normalization_95[0][0]     
__________________________________________________________________________________________________
conv2d_96 

In [383]:
img_path = filenames[0]
img = image.load_img(img_path, target_size=(299, 299))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
output = v3_model.predict(x)
df = pd.DataFrame.from_records(output)

In [384]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,2.234013,0.68993,1.806582,3.89794,2.438401,3.724379,1.633544,1.819523,7.394964,0.855273,...,0.0,1.144702,0.186906,1.193506,1.021934,0.747638,2.481066,1.560249,0.959986,2.852096


In [385]:
%%time
for i in range(1,len(filenames)):
    if i % 50 == 0:
        print(i, end = ' ')
    img_path = filenames[i]
    img = image.load_img(img_path, target_size=(299, 299))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    output = v3_model.predict(x)
    df = pd.concat([df, pd.DataFrame.from_records(output)],copy=False)    

50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 2050 2100 2150 2200 2250 2300 2350 2400 2450 2500 2550 CPU times: user 2h 24min 11s, sys: 2min 42s, total: 2h 26min 54s
Wall time: 44min 29s


In [386]:
df = df.reset_index()
df.drop(axis=1, columns=['index'], inplace=True)

In [387]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,2.234013,0.68993,1.806582,3.89794,2.438401,3.724379,1.633544,1.819523,7.394964,0.855273,...,0.0,1.144702,0.186906,1.193506,1.021934,0.747638,2.481066,1.560249,0.959986,2.852096
1,1.604853,0.721352,1.530728,2.249464,2.154183,4.091249,1.921209,1.604266,1.696673,1.099681,...,0.800902,1.30837,0.174971,1.158611,0.279367,1.251449,1.897869,2.022413,1.472782,2.525035
2,1.957087,1.100931,1.840896,1.312423,1.653404,1.628108,1.855281,0.857786,1.765392,0.514823,...,0.793892,1.035413,0.005582,0.234481,1.855576,0.901277,0.305462,0.754643,0.0,1.240004
3,2.648656,1.77738,4.834164,1.086262,2.771671,3.432098,0.659691,1.542099,1.815955,2.99941,...,0.695709,0.7483,0.238948,1.377401,1.653249,3.181559,4.447573,2.414617,1.442229,3.202769
4,1.772838,0.648294,2.663591,3.251882,7.77736,0.403369,1.596817,7.726373,0.827431,3.77125,...,0.868185,4.170955,1.206849,2.676036,1.166288,2.81876,1.707134,1.940928,1.457944,1.077599


In [388]:
df.to_csv('inception_full.csv', index=False)

## Kmeans

In [390]:
%%time
y_pred = KMeans(n_clusters=2, max_iter=500).fit_predict(df)
print(accuracy_score(y_pred, labels))
confusion_matrix(y_pred, labels)

0.6858800773694391
CPU times: user 6.1 s, sys: 2.43 s, total: 8.53 s
Wall time: 6.38 s


## Affinity propogation

In [131]:
%%time
af = AffinityPropagation(damping = 0.9,max_iter=500).fit(df)
print('Affinity prop found',len(np.unique(af.labels_)),'clusters')

Affinity prop found 378 clusters
CPU times: user 27.9 s, sys: 891 ms, total: 28.8 s
Wall time: 27.1 s


## Mean Shift

### Mean shift put everything into 1 cluster though I tried different parameters

In [141]:
%%time
bandwidth = estimate_bandwidth(df, quantile=0.2)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(df)

CPU times: user 50.2 s, sys: 161 ms, total: 50.4 s
Wall time: 50.4 s


In [142]:
np.unique(ms.labels_)

array([0])

## Aglomerative clustering

In [150]:
%%time
ag = AgglomerativeClustering(n_clusters=2,)
ag.fit(df)

CPU times: user 8.37 s, sys: 37.6 ms, total: 8.41 s
Wall time: 8.41 s


In [151]:
print(accuracy_score(ag.labels_, labels))
confusion_matrix(ag.labels_, labels)

0.6073500967117988


array([[1006,  789],
       [ 226,  564]])

## DBSCAN puts everything in one cluster

In [392]:
%%time
from sklearn.cluster import DBSCAN
epsis = [0.5, 1, 2, 5, 10]
samples = [3, 10, 20, 50, 100]
models = []
for i in epsis:
    for j in samples:
        db = DBSCAN(eps=i, min_samples=j).fit(df)
        if len(np.unique(db.labels_)) > 1:
            models.append(db)

CPU times: user 6min 47s, sys: 798 ms, total: 6min 48s
Wall time: 6min 48s


In [394]:
models

[]

## HDBSCAN

In [396]:
%%time
import hdbscan
min_clust = [3, 10, 50]
min_samps = [5, 10, 15, 20, 50]
hds = []
for i in min_clust:
    for j in min_samps:   
        hdb = hdbscan.HDBSCAN(min_cluster_size = i, min_samples = j,).fit(df)
        if len(np.unique(hdb.labels_)) > 1:
            hds.append(hdb)

CPU times: user 11min 37s, sys: 1.81 s, total: 11min 39s
Wall time: 11min 37s


In [398]:
for i in hd_res_pca:
    a = i.labels_
    a = pd.Series(a)
    print(a.value_counts())

 1    2411
-1     168
 2       3
 0       3
dtype: int64
 3    1314
-1    1253
 0       7
 1       5
 4       3
 2       3
dtype: int64
 1    1576
-1     998
 0       6
 2       5
dtype: int64
 2    1570
-1     997
 1       7
 0       6
 3       5
dtype: int64
 1    1495
-1    1078
 0       7
 2       5
dtype: int64
 1    1576
-1     998
 0       6
 2       5
dtype: int64
 1    1677
-1     901
 0       7
dtype: int64
 0    1616
-1     962
 1       7
dtype: int64
-1    2302
 0     276
 1       7
dtype: int64
-1    2355
 1     205
 2      16
 0       9
dtype: int64
-1    2496
 0      74
 1      15
dtype: int64
-1    2500
 0      70
 1      15
dtype: int64
-1    2364
 0     205
 1      16
dtype: int64
-1    2496
 0      74
 1      15
dtype: int64
-1    2500
 0      70
 1      15
dtype: int64


In [404]:
hd_lab = labels.copy()
# hd_lab[hd_lab == 0] = 3
hd_lab[hd_lab == 0] = -1
print(1- accuracy_score(hd_lab, hds[4].labels_))

0.5361702127659574


# Lets apply dimensionality reduction and try again

In [405]:
pca = PCA(.90)
pca.fit(df)

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [406]:
print('There are', pca.n_components_, 'components left with 90% variance explained')

There are 498 components left with 90% variance explained


In [407]:
df_pc = pca.transform(df)

## Kmeans pca

In [408]:
y_predd = KMeans(n_clusters=2, max_iter=500).fit_predict(df_pc)
print(accuracy_score(y_pred, labels))
confusion_matrix(y_predd, labels)

0.6858800773694391


array([[ 209,  750],
       [1023,  603]])

## Mean shift pca

In [409]:
bandwidth = estimate_bandwidth(df, quantile=0.2)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(df_pc)
print(np.unique(ms.labels_))

[0]


## Aglomerative clustering with pca

In [410]:
agg = AgglomerativeClustering(n_clusters=2)
agg.fit(df_pc)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=2,
            pooling_func=<function mean at 0x7f4a24117950>)

In [411]:
print(accuracy_score(agg.labels_, labels))
confusion_matrix(agg.labels_, labels)

0.6224371373307543


array([[1173,  917],
       [  59,  436]])

## DBSCAN with pca puts everything in one cluster

In [412]:
%%time
epsis = [0.5, 1, 2, 5, 10]
samples = [3, 10, 20, 50, 100]
models_pca = []
for i in epsis:
    for j in samples:
        db = DBSCAN(eps=i, min_samples=j).fit(df_pc)
        if len(np.unique(db.labels_)) > 1:
            models_pca.append(db)

CPU times: user 36 s, sys: 196 ms, total: 36.2 s
Wall time: 35.8 s


In [415]:
models_pca

[]

## HDBSCAN with pca 

In [417]:
min_clust = [3, 5, 7, 9, 10]
min_samps = [3, 4, 5, 10]
hds_pca = []
for i in min_clust:
    for j in min_samps:   
        hdb = hdbscan.HDBSCAN(min_cluster_size = i, min_samples = j).fit(df)
        if len(np.unique(hdb.labels_)) > 1:
            hds_pca.append(hdb)

In [419]:
for i in hds_pca:
    a = i.labels_
    a = pd.Series(a)
    print(a.value_counts())

 0    1988
-1     594
 1       3
dtype: int64
 0    1585
-1     993
 1       7
dtype: int64
 0    1559
-1    1020
 1       6
dtype: int64
-1    2433
 0     147
 1       5
dtype: int64
 0    1615
-1     962
 1       8
dtype: int64
 0    1585
-1     993
 1       7
dtype: int64
 0    1559
-1    1020
 1       6
dtype: int64
-1    2433
 0     147
 1       5
dtype: int64
 0    1615
-1     962
 1       8
dtype: int64
 0    1585
-1     993
 1       7
dtype: int64
-1    2403
 1     167
 0      15
dtype: int64
-1    2528
 0      45
 1      12
dtype: int64
-1    2361
 1     188
 2      26
 0      10
dtype: int64
-1    2391
 1     177
 0      17
dtype: int64
-1    2403
 1     167
 0      15
dtype: int64
-1    2528
 0      45
 1      12
dtype: int64
-1    2361
 1     188
 2      26
 0      10
dtype: int64
-1    2391
 1     177
 0      17
dtype: int64
-1    2403
 1     167
 0      15
dtype: int64
-1    2528
 0      45
 1      12
dtype: int64


In [430]:
hd_labs = labels.copy()
# hd_labs[hd_labs == 0] = 3
hd_labs[hd_labs == 1] = -1
print(accuracy_score(hd_labs, hds_pca[2].labels_))

0.6630560928433269
