In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the comma-separated table (the default separator) and preview its first rows.
data = pd.read_csv("dataset/dataset_tissue.txt")
data.head()

Unnamed: 0.1,Unnamed: 0,GSM11805.CEL.gz,GSM11814.CEL.gz,GSM11823.CEL.gz,GSM11830.CEL.gz,GSM12067.CEL.gz,GSM12075.CEL.gz,GSM12079.CEL.gz,GSM12098.CEL.gz,GSM12105.CEL.gz,...,GSM323527.CEL.gz,GSM323565.CEL.gz,GSM323566.CEL.gz,GSM323567.CEL.gz,GSM246492.CEL.gz,GSM246493.CEL.gz,GSM246494.CEL.gz,GSM307639.CEL.gz,GSM307640.CEL.gz,GSM307641.CEL.gz
0,1007_s_at,10.191267,10.509167,10.272027,10.252952,10.157605,9.966782,9.839348,9.945652,9.913031,...,11.797743,10.040886,11.285002,9.888693,9.661127,9.803686,10.509541,9.984502,9.937738,10.306781
1,1053_at,6.040463,6.696075,6.144663,6.575153,6.606701,6.060069,6.186596,5.927861,6.337478,...,6.157979,6.224848,6.170956,6.211522,6.270153,6.058488,6.345526,6.715984,6.836179,7.025547
2,117_at,7.447409,7.775354,7.696235,8.478135,8.116336,7.644452,8.009581,7.847192,7.98385,...,7.727192,7.573437,7.323547,7.228568,7.333568,7.486711,7.468406,7.120793,7.125811,7.407624
3,121_at,12.025042,12.007817,11.633279,11.075286,10.832528,11.705062,11.706145,11.75037,10.706184,...,10.443504,10.566722,10.250737,10.191332,9.703713,9.914632,9.909784,9.409933,9.587782,9.792904
4,1255_g_at,5.269269,5.180389,5.301714,5.372235,5.334905,5.253682,5.228794,5.155278,5.236442,...,5.263736,5.048867,5.062005,5.186962,5.158631,5.312712,5.341193,4.896124,5.296695,5.206251


In [4]:
# Dimensions of the loaded table as (rows, columns).
data.shape

(22215, 190)

In [5]:
# Load the label file; its 'x' column is read further down as the true class of each sample.
clase = pd.read_csv("dataset/clase.txt")
clase.head()

Unnamed: 0.1,Unnamed: 0,x
0,1,kidney
1,2,kidney
2,3,kidney
3,4,kidney
4,5,kidney


In [6]:
# True class label of every sample, as a 1-D array.
target = np.array(clase['x'])

# Tally the samples per class, keeping the order in which classes first appear.
element_count = {}
for etiqueta in target.tolist():
    element_count[etiqueta] = element_count.get(etiqueta, 0) + 1

for etiqueta, cantidad in element_count.items():
    print(f'{etiqueta} : {cantidad}')

kidney : 39
hippocampus : 31
cerebellum : 38
colon : 34
liver : 26
endometrium : 15
placenta : 6


In [7]:
# Distinct class labels (sorted by np.unique) and how many there are.
clases_distintas = np.unique(target)
print(clases_distintas)
print(f"Cantidad de clusters: {len(clases_distintas)}")

['cerebellum' 'colon' 'endometrium' 'hippocampus' 'kidney' 'liver'
 'placenta']
Cantidad de clusters: 7


In [8]:
# Drop the first column and transpose, so every row of X corresponds to one remaining column of `data`.
X = np.array(data.iloc[:, 1:]).T

### PCA

In [9]:
# Always project from the raw matrix rebuilt from `data`, never from the current
# value of X: this cell overwrites X, so fitting on X itself would re-project
# already-projected data whenever the cell is executed a second time.
X_raw = np.array(data.iloc[:, 1:]).T

# A float n_components keeps as many components as needed to explain that
# fraction of the variance (requires the "full" SVD solver).
pca = PCA(svd_solver="full", n_components =0.99999)
pca.fit(X_raw)
X = pca.transform(X_raw)
X.shape

(189, 184)

In [10]:
def printCluster(target, prediction):
    """Print a dict mapping each true class to the distinct cluster ids it received.

    Parameters
    ----------
    target : iterable
        True class of every sample.
    prediction : iterable
        Cluster id assigned to every sample, aligned with ``target``.

    Cluster ids are listed in order of first appearance; pairs are formed with
    ``zip``, so iteration stops at the shorter of the two inputs.
    """
    ids_por_clase = {}
    for clase_real, id_cluster in zip(target, list(prediction)):
        vistos = ids_por_clase.setdefault(clase_real, [])
        if id_cluster not in vistos:
            vistos.append(id_cluster)
    print(ids_por_clase)

# GMM

### Librería

In [11]:
from sklearn.mixture import GaussianMixture
# Fix the seed (same value as the KMeans cell) so the fitted components, and
# therefore the printed cluster ids, are reproducible from run to run.
gm = GaussianMixture(n_components=7, random_state=0)
gm.fit(X)

GaussianMixture(n_components=7)

In [12]:
# Component index predicted by the fitted mixture for every row of X.
prediction = gm.predict(X)

In [13]:
# Show, per true class, which cluster ids its samples received.
printCluster(target, prediction)

{'kidney': [3, 4], 'hippocampus': [2], 'cerebellum': [2], 'colon': [1], 'liver': [5, 0], 'endometrium': [6], 'placenta': [1]}


### Implementación

In [14]:
from scipy.stats import multivariate_normal
class GaussianMixtureModel:
    """Gaussian mixture with full covariance matrices, fitted by EM.

    Parameters
    ----------
    n_components : int, default 1
        Number of Gaussian components.
    max_iter : int, default 100
        Maximum number of EM iterations.
    tol : float, default 1e-3
        EM stops once the mean log-likelihood changes by less than this.
    reg_covar : float, default 1e-6
        Value added to the diagonal of every covariance matrix.
    random_state : int or None, default None
        Seed for picking the first initial centre.

    Attributes set by ``fit``
    -------------------------
    weights_ : ndarray (n_components,)
    means_ : ndarray (n_components, n_features)
    covariances_ : ndarray (n_components, n_features, n_features)
    converged_ : bool
    n_iter_ : int
    """

    def __init__(self, n_components=1, max_iter=100, tol=1e-3,
                 reg_covar=1e-6, random_state=None):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol
        self.reg_covar = reg_covar
        self.random_state = random_state

    def _initial_responsibilities(self, X):
        """One-hot responsibilities from a farthest-first choice of seed samples."""
        n_samples = X.shape[0]
        rng = np.random.default_rng(self.random_state)
        seeds = [int(rng.integers(n_samples))]
        gap = np.linalg.norm(X - X[seeds[0]], axis=1)
        while len(seeds) < self.n_components:
            # Next seed: the sample farthest from every seed chosen so far.
            seeds.append(int(np.argmax(gap)))
            gap = np.minimum(gap, np.linalg.norm(X - X[seeds[-1]], axis=1))
        to_seed = np.stack(
            [np.linalg.norm(X - X[s], axis=1) for s in seeds], axis=1
        )
        resp = np.zeros((n_samples, self.n_components))
        resp[np.arange(n_samples), to_seed.argmin(axis=1)] = 1.0
        return resp

    def _m_step(self, X, resp):
        """Re-estimate weights, means and covariances from responsibilities."""
        # Tiny offset keeps the divisions finite if a component loses all its mass.
        nk = resp.sum(axis=0) + 10 * np.finfo(float).eps
        self.weights_ = nk / nk.sum()
        self.means_ = (resp.T @ X) / nk[:, None]
        ridge = self.reg_covar * np.eye(X.shape[1])
        covs = []
        for k in range(self.n_components):
            centred = X - self.means_[k]
            cov = (resp[:, k] * centred.T) @ centred / nk[k]
            # Symmetrise to remove floating-point asymmetry, then regularise.
            covs.append(0.5 * (cov + cov.T) + ridge)
        self.covariances_ = np.array(covs)

    def _weighted_log_prob(self, X):
        """Return log(weight_k) + log N(x | mean_k, cov_k), shape (n_samples, n_components)."""
        columns = [
            np.log(w) + np.atleast_1d(
                multivariate_normal.logpdf(X, mean=m, cov=c, allow_singular=True)
            )
            for w, m, c in zip(self.weights_, self.means_, self.covariances_)
        ]
        return np.column_stack(columns)

    def fit(self, X):
        """Estimate the mixture parameters from ``X`` (n_samples, n_features).

        Returns ``self``. Raises ``ValueError`` if ``X`` is not 2-D or if
        ``n_components`` is not between 1 and the number of samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if not 1 <= self.n_components <= X.shape[0]:
            raise ValueError("n_components must be between 1 and the number of samples")

        self._m_step(X, self._initial_responsibilities(X))
        previous = -np.inf
        self.converged_ = False
        self.n_iter_ = 0
        for _ in range(self.max_iter):
            self.n_iter_ += 1
            # E-step: normalise in log space to avoid underflow.
            weighted = self._weighted_log_prob(X)
            log_norm = np.logaddexp.reduce(weighted, axis=1)
            # M-step.
            self._m_step(X, np.exp(weighted - log_norm[:, None]))
            current = log_norm.mean()
            if abs(current - previous) < self.tol:
                self.converged_ = True
                break
            previous = current
        return self

    def predict_proba(self, X):
        """Posterior probability of each component for every row of ``X``."""
        weighted = self._weighted_log_prob(np.asarray(X, dtype=float))
        log_norm = np.logaddexp.reduce(weighted, axis=1)
        return np.exp(weighted - log_norm[:, None])

    def predict(self, X):
        """Index of the most probable component for every row of ``X``."""
        return self._weighted_log_prob(np.asarray(X, dtype=float)).argmax(axis=1)

# K-Means

### Librería

In [15]:
from sklearn.cluster import KMeans
# fit() returns the estimator itself, so construction and fitting can be chained.
kmeans = KMeans(random_state=0, n_clusters=7).fit(X)
kmeans

KMeans(n_clusters=7, random_state=0)

In [16]:
# Cluster index predicted by the fitted KMeans model for every row of X.
prediction = kmeans.predict(X)

In [17]:
# Show, per true class, which cluster ids its samples received.
printCluster(target, prediction)

{'kidney': [4], 'hippocampus': [2], 'cerebellum': [6, 2], 'colon': [3], 'liver': [0], 'endometrium': [1], 'placenta': [5]}


### Implementación

In [18]:
def Init_Centroide(dataset, k):
    """Return k distinct rows of ``dataset``, drawn at random, as starting centroids.

    Rows are sampled without replacement using NumPy's global random generator.
    """
    total_filas = dataset.shape[0]
    filas_elegidas = np.random.choice(total_filas, size=k, replace=False)
    return dataset[filas_elegidas, :]

In [19]:
def getMinDistance(dataset, centroide):
    """Return, for every row of ``dataset``, the index of its nearest centroid.

    Parameters
    ----------
    dataset : array-like, shape (n_samples, n_features)
    centroide : array-like, shape (n_centroids, n_features)
        May also be a list of 1-D arrays.

    Returns
    -------
    list
        One integer index per sample (Euclidean distance; ties go to the
        lowest centroid index).
    """
    puntos = np.asarray(dataset, dtype=float)
    centros = np.asarray(centroide, dtype=float)
    # One vectorised pass per centroid instead of one library call per sample.
    distancias = np.stack(
        [np.linalg.norm(puntos - centro, axis=1) for centro in centros],
        axis=1,
    )
    return list(distancias.argmin(axis=1))

In [20]:
def getMeans(dataset, grupos, n):
    """Compute the centroid (mean point) of each of the ``n`` clusters.

    Parameters
    ----------
    dataset : array-like, shape (n_samples, n_features)
    grupos : sequence of int
        Cluster index assigned to every sample.
    n : int
        Number of clusters.

    Returns
    -------
    list of ndarray
        ``n`` centroids. A cluster with no members would otherwise be 0/0;
        instead it is moved onto the sample lying farthest from the centroid
        of its own cluster (each empty cluster takes a different sample), so
        the result is always a rectangular list of finite vectors when the
        dataset has enough samples.
    """
    datos = np.asarray(dataset, dtype=float)
    etiquetas = np.asarray(grupos)
    medias = np.full((n, datos.shape[1]), np.nan)
    lejania = np.zeros(len(datos))  # distance of each sample to its own centroid
    vacios = []
    for i in range(n):
        mascara = etiquetas == i
        if np.any(mascara):
            miembros = datos[mascara]
            medias[i] = miembros.sum(axis=0) / len(miembros)
            lejania[mascara] = np.linalg.norm(miembros - medias[i], axis=1)
        else:
            vacios.append(i)
    # Re-seed empty clusters on the worst-fitted samples, farthest first.
    candidatos = np.argsort(-lejania, kind="stable")
    for i, idx in zip(vacios, candidatos):
        medias[i] = datos[idx]
    return list(medias)
    

In [21]:
def kmean(dataset, k = 2, n_iter = 1000):
    """Cluster ``dataset`` into ``k`` groups with Lloyd's k-means iteration.

    Parameters
    ----------
    dataset : ndarray, shape (n_samples, n_features)
    k : int, default 2
        Number of clusters.
    n_iter : int, default 1000
        Upper bound on the number of assign/update rounds.

    Returns
    -------
    (centroides, grupos)
        The final centroids and the cluster index of every sample
        (``grupos`` is ``None`` when ``n_iter`` is 0).
    """
    centroides = Init_Centroide(dataset, k)  # pick k starting centroids
    grupos = None
    for _ in range(n_iter):
        nuevos_grupos = getMinDistance(dataset, centroides)
        # Unchanged assignments would reproduce the same centroids on every
        # remaining round, so stopping here returns exactly what running all
        # n_iter rounds would return.
        if grupos is not None and np.array_equal(nuevos_grupos, grupos):
            break
        grupos = nuevos_grupos
        centroides = getMeans(dataset, grupos, k)
    return centroides, grupos

In [22]:
# The starting centroids are drawn with NumPy's global generator; seed it so
# this run (and the cluster ids printed below) can be reproduced.
np.random.seed(0)
centroides, labels = kmean(X, k = 7)

In [23]:
# Show, per true class, which hand-written k-means cluster ids its samples received.
printCluster(target, labels)

{'kidney': [3, 5], 'hippocampus': [4], 'cerebellum': [0, 5, 4], 'colon': [2], 'liver': [1, 6, 5], 'endometrium': [3], 'placenta': [2]}


# DBSCAN

### Librería

In [24]:
from sklearn.cluster import DBSCAN
# fit() returns the estimator itself, so construction and fitting can be chained.
dbscan = DBSCAN(min_samples=3, eps=90).fit(X)
dbscan

DBSCAN(eps=90, min_samples=3)

In [25]:
# Per-sample labels stored on the fitted DBSCAN estimator.
prediction = dbscan.labels_

In [26]:
# Show, per true class, which cluster ids its samples received.
printCluster(target, prediction)

{'kidney': [0], 'hippocampus': [1], 'cerebellum': [1, -1, 5], 'colon': [2], 'liver': [3, -1], 'endometrium': [4], 'placenta': [6]}


### Implementación

In [34]:
def sequentialDBSCAN(database, radius, minPts):
    """Label every row of ``database`` with a DBSCAN cluster id.

    Parameters
    ----------
    database : array-like, shape (n_samples, n_features)
        Points to cluster.
    radius : float
        Neighbourhood radius (Euclidean distance).
    minPts : int
        Minimum neighbourhood size, the point itself included, for a point
        to be a core point.

    Returns
    -------
    list of int
        One label per sample: cluster ids count up from 0 and noise points
        are labelled -2.
    """
    undefined = -1
    noise = -2
    database = np.asarray(database)
    total = len(database)
    label = [undefined] * total
    if total == 0:
        return label

    # The point set never changes, so one tree serves every radius query.
    tree = KDTree(database, metric='euclidean')

    def vecinos(idx):
        """Indices of all points within ``radius`` of point ``idx`` (itself included)."""
        return tree.query_radius(database[idx].reshape(1, -1), r=radius)[0]

    c = 0
    for i in range(total):
        if label[i] != undefined:
            continue
        semilla = vecinos(i)
        if len(semilla) < minPts:
            label[i] = noise
            continue

        # i is a core point: open cluster c and grow it from its neighbours.
        # Only indices are queued, so a point and its index cannot get out of step.
        label[i] = c
        cola = [int(z) for z in semilla if z != i]
        en_cola = set(cola)
        en_cola.add(i)
        j = 0
        while j < len(cola):
            idx = cola[j]
            j += 1
            if label[idx] == noise:
                # Former noise reachable from a core point becomes a border point.
                label[idx] = c
                continue
            if label[idx] != undefined:
                continue
            label[idx] = c
            alcanzables = vecinos(idx)
            if len(alcanzables) < minPts:
                continue
            for z in alcanzables:
                z = int(z)
                if z not in en_cola:
                    en_cola.add(z)
                    cola.append(z)
        # Every cluster gets its own id, even when none of its neighbours was
        # itself a core point.
        c += 1
    return label

In [35]:
# Run the hand-written DBSCAN with the same radius and minimum-points values given to the library estimator.
dbscanLabel = sequentialDBSCAN(X, radius = 90, minPts = 3)

In [36]:
# Show, per true class, which hand-written DBSCAN labels its samples received.
printCluster(target, dbscanLabel)

{'kidney': [0], 'hippocampus': [1], 'cerebellum': [1, -2, 5], 'colon': [2], 'liver': [3, -2], 'endometrium': [4], 'placenta': [6]}


# Agglomerative Hierarchical Clustering

In [30]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, homogeneity_score

# fit() returns the estimator itself, so construction and fitting can be chained.
cluster = AgglomerativeClustering(n_clusters=7).fit(X)
cluster

AgglomerativeClustering(n_clusters=7)

In [31]:
# `cluster` is already fitted on X; read its stored labels instead of
# fitting the whole hierarchy a second time.
prediction = cluster.labels_

In [32]:
# Show, per true class, which cluster ids its samples received.
printCluster(target, prediction)

{'kidney': [0, 6], 'hippocampus': [1], 'cerebellum': [4, 6, 1], 'colon': [5], 'liver': [3, 6], 'endometrium': [0], 'placenta': [2]}


In [33]:
# Silhouette score of the labels currently held in `prediction`, computed on X.
print(f'Silhouette Score(n = 7): {silhouette_score(X, prediction)}')

Silhouette Score(n = 7): 0.32897621601997656
