<a href="https://colab.research.google.com/github/Ajasahmed3182/DM_Programming-Assignment/blob/main/Pedda_Shali_DM1_programing_Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import cv2
import os
import numpy as np
import warnings
from skimage.color import rgb2gray
from skimage import io, exposure, filters
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, SpectralClustering, BisectingKMeans
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.metrics import fowlkes_mallows_score, silhouette_score


In [3]:
warnings.filterwarnings('ignore')

# 1. Feature Extraction

In [4]:
def angle(dx, dy):
    """Calculate the angles between horizontal and vertical operators."""
    return np.mod(np.arctan2(dy, dx), np.pi)

In [5]:
path = "/content/drive/MyDrive/data_mining/processed"

In [6]:
class_names = ['Borzoi', 'Doberman', 'Komondor', 'Brittany_spaniel']

In [7]:
df = pd.DataFrame(columns = list(range(0,36))+['class'])
class_folders = os.listdir(path)
for class_ in class_folders:
  class_path = os.path.join(path,class_)
  for i,class_label in enumerate(class_names):
    if class_label.lower() == class_.split("-")[-1].lower():
      class_num = i
  for filename in os.listdir(class_path):
    img = io.imread(os.path.join(class_path,filename))
    gray_sacle = rgb2gray(img)
    angle_sobel = angle(filters.sobel_h(gray_sacle),
                    filters.sobel_v(gray_sacle))
    hist,bins = exposure.histogram(angle_sobel,nbins=36)
    df.loc[len(df)] = list(hist)+[class_num]


In [18]:
df['class'].unique()

array([1, 0, 2, 3])

In [19]:
scaler = StandardScaler()
scaler.fit(df[df.columns[:-1]])

In [20]:
data = df[df.columns[:-1]]

In [21]:
orginal_labels = np.array(df[df.columns[-1]])

In [22]:
scalled_data = scaler.transform(data)

# 2. Dimension Reduction

In [43]:
pca = PCA(n_components=2)

transformed_data = pca.fit_transform(scalled_data)

In [44]:
transformed_data

array([[-3.69815265, -6.76182877],
       [ 0.31867313, -4.80868179],
       [-2.91444473, -2.62878628],
       ...,
       [-4.2474439 ,  0.51654235],
       [-2.82551858, -2.1491763 ],
       [-3.64176208,  0.9916052 ]])

# 3. Clustering Algorithm

In [45]:
# K-means clustering with init='random'
kmeans_random = KMeans(n_clusters=4, init='random', random_state=6)
kmeans_random.fit(transformed_data)
kmeans_random_labels = kmeans_random.labels_

# K-means clustering with init='k-means++'
kmeans_kmeans_pp = KMeans(n_clusters=4, init='k-means++', random_state=6)
kmeans_kmeans_pp.fit(transformed_data)
kmeans_kmeans_pp_labels = kmeans_kmeans_pp.labels_

# Bisecting K-means clustering with init='random'
bisecting_kmeans_random = BisectingKMeans(n_clusters=4, init='random', random_state=6)
bisecting_kmeans_random.fit(transformed_data)
bisecting_kmeans_random_labels = bisecting_kmeans_random.labels_

# Spectral clustering with default parameters
spectral_clustering = SpectralClustering(n_clusters=4, random_state=6)
spectral_clustering.fit(transformed_data)
spectral_clustering_labels = spectral_clustering.labels_

In [46]:
# DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=2)
dbscan.fit(data)
dbscan_labels = dbscan.labels_

# Agglomerative clustering with different linkage methods
agglomerative_single = AgglomerativeClustering(n_clusters=4, linkage='single')
agglomerative_single.fit(data)
agglomerative_single_labels = agglomerative_single.labels_

agglomerative_complete = AgglomerativeClustering(n_clusters=4, linkage='complete')
agglomerative_complete.fit(data)
agglomerative_complete_labels = agglomerative_complete.labels_

agglomerative_average = AgglomerativeClustering(n_clusters=4, linkage='average')
agglomerative_average.fit(data)
agglomerative_average_labels = agglomerative_average.labels_

agglomerative_ward = AgglomerativeClustering(n_clusters=4, linkage='ward')
agglomerative_ward.fit(data)
agglomerative_ward_labels = agglomerative_ward.labels_

# 4. Clustering Evaluations

In [47]:

# Calculate Fowlkes-Mallows index
fowlkes_mallows_scores = {
    'K-means (Random)': fowlkes_mallows_score(orginal_labels, kmeans_random_labels),
    'K-means (k-means++)': fowlkes_mallows_score(orginal_labels, kmeans_kmeans_pp_labels),
    'Bisecting K-means': fowlkes_mallows_score(orginal_labels, bisecting_kmeans_random_labels),
    'Spectral Clustering': fowlkes_mallows_score(orginal_labels, spectral_clustering_labels),
    'DBSCAN': fowlkes_mallows_score(orginal_labels, dbscan_labels),
    'Agglomerative (Single link)': fowlkes_mallows_score(orginal_labels, agglomerative_single_labels),
    'Agglomerative (Complete link)': fowlkes_mallows_score(orginal_labels, agglomerative_complete_labels),
    'Agglomerative (Group Average)': fowlkes_mallows_score(orginal_labels, agglomerative_average_labels),
    'Agglomerative (Ward)': fowlkes_mallows_score(orginal_labels, agglomerative_ward_labels)
}


In [48]:
# Calculate Silhouette Coefficient
silhouette_scores = {
    'K-means (Random)': silhouette_score(transformed_data, kmeans_random_labels),
    'K-means (k-means++)': silhouette_score(transformed_data, kmeans_kmeans_pp_labels),
    'Bisecting K-means': silhouette_score(transformed_data, bisecting_kmeans_random_labels),
    'Spectral Clustering': silhouette_score(transformed_data, spectral_clustering_labels),
    'DBSCAN': silhouette_score(transformed_data, dbscan_labels),
    'Agglomerative (Single link)': silhouette_score(transformed_data, agglomerative_single_labels),
    'Agglomerative (Complete link)': silhouette_score(transformed_data, agglomerative_complete_labels),
    'Agglomerative (Group Average)': silhouette_score(transformed_data, agglomerative_average_labels),
    'Agglomerative (Ward)': silhouette_score(transformed_data, agglomerative_ward_labels)
}

In [50]:
# Rank methods based on Fowlkes-Mallows index
ranked_methods_fm = sorted(fowlkes_mallows_scores.items(), key=lambda x: x[1], reverse=True)
print("Ranking based on Fowlkes-Mallows index:")
for method, score in ranked_methods_fm:
    print(f"{method}: {score}")

Ranking based on Fowlkes-Mallows index:
DBSCAN: 0.49139908796289333
Agglomerative (Single link): 0.4897299424611838
Agglomerative (Group Average): 0.3790557573595213
Agglomerative (Complete link): 0.3660507627491783
Spectral Clustering: 0.35303987669585707
Agglomerative (Ward): 0.33343061645559113
Bisecting K-means: 0.30280699891005425
K-means (Random): 0.28044750150233055
K-means (k-means++): 0.28044750150233055


In [51]:
# Rank methods based on Silhouette Coefficient
ranked_methods_silhouette = sorted(silhouette_scores.items(), key=lambda x: x[1], reverse=True)
print("\nRanking based on Silhouette Coefficient:")
for method, score in ranked_methods_silhouette:
    print(f"{method}: {score}")


Ranking based on Silhouette Coefficient:
DBSCAN: 0.48911304019625973
K-means (Random): 0.4466103585883277
K-means (k-means++): 0.4466103585883277
Bisecting K-means: 0.406502928455974
Agglomerative (Complete link): 0.4039297963034564
Agglomerative (Group Average): 0.40001651665026106
Spectral Clustering: 0.393308621985176
Agglomerative (Ward): 0.3251456959984067
Agglomerative (Single link): 0.1853364568067928
