In [1]:
# We import the usual libraries
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os  # to load the images from a folder
from sklearn.decomposition import PCA

In [2]:
# We import the function that implements k-means
from sklearn.cluster import KMeans, DBSCAN

In [3]:
# We import the data: every readable image in `folderSave` is loaded as a
# grayscale array and stacked into `data`.
# NOTE(review): hard-coded absolute working directory; this only runs on the
# original author's machine. Consider a configurable, relative data path.
os.chdir("/home/maribel/ml-project-2-ml-plume-1/Automatic_Filtering")
folderSave = 'Save_3K'

data = []
filenames = []  # names of the images actually loaded, aligned with the rows of `data`
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore every downstream cluster assignment) identical between runs.
for filename in sorted(os.listdir(folderSave)):
    img = cv2.imread(os.path.join(folderSave, filename), cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread does not raise on files it cannot decode (hidden files,
        # non-images, ...), it returns None. Skip them so that `data` stays a
        # regular numeric array and `filenames` stays aligned with it.
        print("skipping unreadable file: " + filename)
        continue
    data.append(img)
    filenames.append(filename)

if not data:
    # Fail here with a clear message instead of later inside reshape/PCA.
    raise RuntimeError("no readable image found in folder '" + folderSave + "'")
data = np.array(data)

In [4]:
print(data.shape)
# Flatten every image into one feature vector: (n_images, height * width).
transformed_data = data.reshape((data.shape[0], -1))

# Keep the first 100 principal components.
# random_state is fixed because, with the default svd_solver='auto',
# scikit-learn may pick the randomized SVD solver for an input of this size;
# the global np.random.seed call only happens in a later cell, so without
# random_state this step would not be reproducible.
pca = PCA(n_components=100, random_state=1)
pca.fit(transformed_data)
trans_pca = pca.transform(transformed_data)  # (n_images, 100) component scores

# NOTE: despite its name, `img_reduced` is the reconstruction back in pixel
# space, so it has as many columns as `transformed_data`; the compact
# 100-column representation is `trans_pca`.
img_reduced = pca.inverse_transform(trans_pca)

(1000, 128, 160)


In [5]:
# Seed NumPy's global random generator so the stochastic steps that draw from
# it afterwards give the same result on every run.
np.random.seed(seed=1)

In [6]:
# We have 4 different clusters:
#  * Triangle with overflow
#  * Triangle without overflow
#  * Patatoid with overflow
#  * Bad Images
n = 4  # number of clusters to look for, one per category listed above

In [None]:
# We classify the data with k-means
# random_state pins the random centroid initialisation so that re-running this
# cell on its own yields the same clusters, instead of depending on how much of
# the global NumPy random stream has already been consumed.
kmeans = KMeans(n_clusters=n, init='random', random_state=1)
# fit_predict fits the model and returns the cluster index of every sample in
# one call; Z is therefore exactly kmeans.labels_.
Z = kmeans.fit_predict(img_reduced)

In [None]:
# We plot the resulting clusters: one figure per k-means cluster,
# with the member images laid out on a grid of 20 images per row.
n_cols = 20
for i in range(n):

    members = np.where(Z == i)[0]  # indices in `data` of the images assigned to cluster i
    num = members.shape[0]         # number of elements in this cluster

    print("cluster " + str(i))
    print(str(num) + " elements")

    if num == 0:
        continue  # nothing to draw for an empty cluster

    # Ceiling division: exactly as many rows as needed, with no extra blank
    # row when num is a multiple of n_cols.
    n_rows = int(np.ceil(num / n_cols))

    fig = plt.figure(figsize=(20, 20), dpi=300)
    for k, idx in enumerate(members):
        plt.subplot(n_rows, n_cols, k + 1)
        # data[idx] is already a 2-D (height, width) image. Reshaping it to
        # (width, height) would not transpose it but re-wrap the pixel rows
        # and display a scrambled picture, so it is shown as-is.
        plt.imshow(data[idx], cmap='gray')
        plt.axis('off')
    plt.show()
    plt.close(fig)  # release the figure's memory before drawing the next cluster

In [None]:
# Maps every cluster id to the list of image filenames assigned to it:
# { id: [filenames] }
groups = {}
for file, cluster in zip(filenames, kmeans.labels_):
    # setdefault creates the empty list the first time a cluster id is seen
    groups.setdefault(cluster, []).append(file)

In [None]:
# print(f'Cluster 0: {groups[0]} \n')
# print(f'Cluster 1: {groups[1]} \n')
# print(f'Cluster 2: {groups[2]} \n')
# print(f'Cluster 3: {groups[3]} \n')

In [None]:
# Density-based clustering (DBSCAN) of the images.
# scikit-learn estimators expect a 2-D (n_samples, n_features) array, so every
# image is flattened into one row first; fitting on the 3-D image stack `data`
# directly raises a ValueError.
flat_images = data.reshape(data.shape[0], -1)

# eps: maximum distance between two samples for them to count as neighbours.
# min_samples: neighbourhood size required for a point to be a core point.
# Every other parameter keeps its default (metric='euclidean',
# metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None).
# An alternative setting noted previously was eps=0.08, min_samples=3.
# NOTE(review): eps is measured in the units of the input features (here raw
# pixel intensities over all pixels) - confirm that eps=3 suits that scale.
clustering = DBSCAN(eps=3, min_samples=2).fit(flat_images)

# One label per image; DBSCAN marks noise samples with the label -1.
print(clustering.labels_)

In [None]:
# We plot the resulting DBSCAN clusters: one figure per cluster,
# with the member images laid out on a grid of 4 images per row.
n_cols = 4

# DBSCAN decides the number of clusters itself, so iterate over the labels it
# actually produced rather than over the k-means cluster count.
labels = np.unique(clustering.labels_)

# Label -1 is DBSCAN's noise bucket: report its size but do not draw it.
print("noise: " + str(int(np.sum(clustering.labels_ == -1))) + " elements")

for i in labels[labels != -1]:

    members = np.where(clustering.labels_ == i)[0]  # indices in `data` of the images in cluster i
    num = members.shape[0]                          # number of elements in this cluster

    print("cluster " + str(i))
    print(str(num) + " elements")

    # Ceiling division: exactly as many rows as needed for num images.
    n_rows = int(np.ceil(num / n_cols))

    fig = plt.figure(figsize=(4, 4), dpi=300)
    for k, idx in enumerate(members):
        plt.subplot(n_rows, n_cols, k + 1)
        # data[idx] is already a 2-D (height, width) image; reshaping it to a
        # fixed size that does not match its pixel count raises a ValueError,
        # so it is shown as-is.
        plt.imshow(data[idx], cmap='gray')
        plt.axis('off')
    plt.show()
    plt.close(fig)  # release the figure's memory before drawing the next cluster