# Clustering for genedata

This notebook contains the code for the clustering of the genedata dataset.

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Auxiliary functions

In [2]:
# Prefix that log_labels prepends verbatim to every output file name; the
# trailing slash matters because the two strings are simply concatenated.
result_folder = "result/"

Auxiliary function used for calculating the points awarded for a given NMI value.

In [3]:
def get_points(nmi):
    """Convert an NMI value into a score between 0 and 10.

    The score is 1 for ``nmi >= 0.8`` and grows by one point for every
    further 0.02 step, reaching 10 for ``nmi >= 0.98``. Anything below 0.8
    (or a value that compares false against every threshold) scores 0.

    Parameters
    ----------
    nmi : float
        Normalized mutual information value to grade.

    Returns
    -------
    int
        Number of points, from 0 to 10.
    """
    # Lower bounds listed from the best grade (10 points) down to the worst (1 point).
    lower_bounds = (0.98, 0.96, 0.94, 0.92, 0.9, 0.88, 0.86, 0.84, 0.82, 0.8)

    for steps_below_top, lower_bound in enumerate(lower_bounds):
        if nmi >= lower_bound:
            return 10 - steps_below_top

    return 0

In [4]:
def get_sorted_nmi(nmi_values, dtype):
    """Return the runs as a structured array ordered from highest to lowest NMI.

    Parameters
    ----------
    nmi_values : sequence of tuples
        One tuple per run, laid out as described by ``dtype``.
    dtype : list of (name, type) pairs
        Structured dtype of the records; it must contain an ``'nmi'`` field.

    Returns
    -------
    numpy.ndarray
        Structured array sorted on the ``'nmi'`` field, in descending order.
    """
    records = np.array(nmi_values, dtype=dtype)

    # Sort ascending on the freshly built array, then read it backwards.
    records.sort(order='nmi')
    return records[::-1]

In [5]:
def log_labels(labels, file_name):
    """Write one integer label per line to ``file_name`` inside ``result_folder``.

    Parameters
    ----------
    labels : iterable of numbers
        Cluster labels; each one is converted with ``int`` before being written.
    file_name : str
        Name of the output file, appended to the module-level ``result_folder``.
    """
    output_path = f"{result_folder}{file_name}"

    # open(..., 'w') does not create missing directories, so make sure the
    # destination folder exists instead of failing with FileNotFoundError.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w') as f:
        f.writelines(f"{int(label)}\n" for label in labels)

## Preprocessing

In [6]:
# Load the genedata dataset.
data = pd.read_csv("../../data/genedata.csv")

# One is subtracted from every class id in order to have coincident results.
data['class'] = data['class'] - 1
data_classes = list(data['class'])

data

Unnamed: 0,id,class,f1,f2,f3,f4,f5,f6,f7,f8,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
0,1,4,4.8778,4.4118,9.0690,10.0966,11.1664,10.2457,8.2541,12.0844,...,11.4206,9.5440,8.5381,11.4735,10.1581,7.8083,8.7157,9.4212,9.9236,8.5223
1,2,3,8.9505,2.7079,10.0899,6.5899,9.5933,8.3250,9.9028,12.4863,...,13.0857,9.4303,11.0118,9.8609,8.9299,10.4229,10.5660,7.9997,9.9458,10.5250
2,3,4,4.7239,3.7181,9.2404,10.1070,10.0589,9.5977,8.3794,12.8728,...,8.2631,9.7016,7.7138,14.3444,9.2942,8.5537,8.7280,7.9019,11.4660,8.9878
3,4,4,7.6797,4.1321,9.3522,9.3646,10.2057,9.4130,8.8524,13.2288,...,10.7985,9.1055,8.4670,11.4066,10.0314,8.3365,9.1870,7.7253,10.1910,9.7299
4,5,0,6.1541,5.9960,9.5418,8.9614,9.6077,9.3430,9.9071,13.1981,...,12.2830,8.7680,9.1708,9.9139,9.4670,9.8127,9.2294,7.7958,9.2692,10.8166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,791,0,5.5051,5.5937,8.8036,9.8995,10.2081,7.0993,10.0181,13.7360,...,12.3530,9.2083,8.2600,10.0181,9.1594,9.1573,8.9010,7.5171,8.9912,10.5179
791,792,3,6.4445,6.3149,9.9281,5.7244,10.0598,8.3327,8.9625,12.7977,...,11.4967,9.3160,11.3185,9.7353,9.0437,9.8171,7.9135,8.8159,10.6293,10.6889
792,793,2,9.3204,8.1726,9.1752,9.9419,10.7564,9.7459,9.9507,12.5833,...,11.1193,9.0767,9.2110,10.0821,9.6255,10.9260,7.9507,7.5112,8.6089,10.1170
793,794,4,6.7255,5.5300,9.6569,9.7835,10.8310,9.3115,8.4064,12.3590,...,10.1283,9.3001,8.4043,10.9690,8.9199,9.2516,8.1096,8.3683,11.4727,9.8517


In [7]:
# Only the feature columns are kept: id and class are removed from the dataframe.
preprocessed_data = data.drop(['id', 'class'], axis='columns')

preprocessed_data

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
0,4.8778,4.4118,9.0690,10.0966,11.1664,10.2457,8.2541,12.0844,6.8658,5.7480,...,11.4206,9.5440,8.5381,11.4735,10.1581,7.8083,8.7157,9.4212,9.9236,8.5223
1,8.9505,2.7079,10.0899,6.5899,9.5933,8.3250,9.9028,12.4863,7.5498,9.5101,...,13.0857,9.4303,11.0118,9.8609,8.9299,10.4229,10.5660,7.9997,9.9458,10.5250
2,4.7239,3.7181,9.2404,10.1070,10.0589,9.5977,8.3794,12.8728,8.8574,5.8791,...,8.2631,9.7016,7.7138,14.3444,9.2942,8.5537,8.7280,7.9019,11.4660,8.9878
3,7.6797,4.1321,9.3522,9.3646,10.2057,9.4130,8.8524,13.2288,9.2525,5.7487,...,10.7985,9.1055,8.4670,11.4066,10.0314,8.3365,9.1870,7.7253,10.1910,9.7299
4,6.1541,5.9960,9.5418,8.9614,9.6077,9.3430,9.9071,13.1981,8.8106,5.5387,...,12.2830,8.7680,9.1708,9.9139,9.4670,9.8127,9.2294,7.7958,9.2692,10.8166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,5.5051,5.5937,8.8036,9.8995,10.2081,7.0993,10.0181,13.7360,8.9059,6.4352,...,12.3530,9.2083,8.2600,10.0181,9.1594,9.1573,8.9010,7.5171,8.9912,10.5179
791,6.4445,6.3149,9.9281,5.7244,10.0598,8.3327,8.9625,12.7977,8.8633,7.3106,...,11.4967,9.3160,11.3185,9.7353,9.0437,9.8171,7.9135,8.8159,10.6293,10.6889
792,9.3204,8.1726,9.1752,9.9419,10.7564,9.7459,9.9507,12.5833,8.9300,7.8694,...,11.1193,9.0767,9.2110,10.0821,9.6255,10.9260,7.9507,7.5112,8.6089,10.1170
793,6.7255,5.5300,9.6569,9.7835,10.8310,9.3115,8.4064,12.3590,8.3744,6.3127,...,10.1283,9.3001,8.4043,10.9690,8.9199,9.2516,8.1096,8.3683,11.4727,9.8517


The standard scaler is applied to the dataframe.

In [8]:
# Fit the scaler on the features first, then apply the fitted scaling.
scaler = StandardScaler().fit(preprocessed_data)
preprocessed_numpy_data = scaler.transform(preprocessed_data)

# Rebuild a dataframe from the scaled values, reusing the original column names.
preprocessed_data = pd.DataFrame(data=preprocessed_numpy_data, columns=preprocessed_data.columns)
preprocessed_data

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
0,-1.849734,-0.274643,-0.315054,0.937330,1.972003,1.119269,-1.717150,-1.247727,-1.629593,-1.332300,...,-0.288341,1.054327,-0.799484,1.355662,1.597923,-2.406314,0.225441,2.346298,0.691937,-2.466704
1,0.739597,-1.480452,1.580076,-1.169588,-1.069910,-0.387009,0.295319,-0.540425,-0.955584,1.442237,...,1.332023,0.883171,0.560906,-0.143465,-0.920515,0.603803,2.517191,0.352801,0.715274,0.150446
2,-1.947580,-0.765558,0.003122,0.943578,-0.169576,0.611086,-1.564204,0.139776,0.332916,-1.235614,...,-3.361008,1.291568,-1.252801,4.024545,-0.173514,-1.548156,0.240676,0.215647,2.313332,-1.858384
3,-0.068349,-0.472580,0.210660,0.497525,0.114292,0.466238,-0.986841,0.766299,0.722244,-1.331784,...,-0.893727,0.394239,-0.838585,1.293469,1.338123,-1.798212,0.809185,-0.032015,0.973032,-0.888599
4,-1.038291,0.846457,0.562620,0.255272,-1.042064,0.411341,0.300568,0.712270,0.286799,-1.486658,...,0.550889,-0.113810,-0.451536,-0.094194,0.180814,-0.098704,0.861701,0.066854,0.004021,0.531512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,-1.450911,0.561759,-0.807724,0.818907,0.118933,-1.348245,0.436059,1.658919,0.380707,-0.825492,...,0.619008,0.548988,-0.952423,0.002674,-0.449923,-0.853248,0.454951,-0.323992,-0.288217,0.141168
791,-0.853661,1.072135,1.279722,-1.689603,-0.167836,-0.380970,-0.852448,0.007607,0.338729,-0.179887,...,-0.214286,0.711112,0.729573,-0.260226,-0.687167,-0.093638,-0.768150,1.497431,1.433780,0.364633
792,0.974771,2.386784,-0.117911,0.844382,1.179184,0.727309,0.353788,-0.369715,0.404455,0.232226,...,-0.581546,0.350886,-0.429429,0.062170,0.505820,1.183008,-0.722074,-0.332267,-0.690097,-0.382733
793,-0.675008,0.516680,0.776284,0.749211,1.323438,0.386638,-1.531246,-0.764459,-0.143029,-0.915835,...,-1.545921,0.687177,-0.873066,0.886662,-0.941021,-0.744683,-0.525264,0.869722,2.320375,-0.729430


## k-means clustering

In [9]:
from sklearn.cluster import KMeans

### Parameters

In [10]:
# Execution parameters of the algorithm: the cluster counts to try and how
# many times each one is repeated.
range_n_clusters = [2, 3, 4, 5, 6, 10]
range_exec = range(5)

# Accumulator for the NMI records and the structured dtype used to sort them.
nmi_metric = list()
nmi_d_type = [('nmi', float), ('points', int), ('i', int), ('j', int)]

# Buffer for the labels of every run, indexed by (cluster setting, execution),
# kept so that a result can be logged to a file afterwards.
labels_result = np.zeros(shape=(len(range_n_clusters), len(range_exec), len(data)))

### Execution

In [11]:
# Start from an empty record list so that re-running this cell does not append
# duplicates of the runs collected by a previous execution.
nmi_metric.clear()

for i, n_cluster in enumerate(range_n_clusters):
    for j in range(len(range_exec)):
        # k-means depends on its random initialisation. Seeding every
        # repetition with its own index keeps the repetitions different from
        # each other while making the whole grid reproducible.
        km = KMeans(n_clusters=n_cluster, random_state=j)
        labels = km.fit_predict(preprocessed_data)

        # storing the labels for future file logging.
        labels_result[i, j] = labels

        # Normalized mutual info metric
        nmi = normalized_mutual_info_score(data_classes, labels, average_method="geometric")
        nmi_metric.append((nmi, get_points(nmi), i, j))

### Results (NMI)

In [12]:
# Rank every run by NMI, best first.
sorted_nmi = get_sorted_nmi(nmi_metric, nmi_d_type)

# Report each run as plain tuples: (nmi, points, cluster index, execution index).
for execution in sorted_nmi.tolist():
    print(f"nmi {np.round(execution[0], 5)} | points {execution[1]} # k = {range_n_clusters[execution[2]]} | exec = {range_exec[execution[3]]}")

nmi 0.79724 | points 0 # k = 5 | exec = 2
nmi 0.79724 | points 0 # k = 5 | exec = 3
nmi 0.79325 | points 0 # k = 5 | exec = 4
nmi 0.78359 | points 0 # k = 5 | exec = 0
nmi 0.78141 | points 0 # k = 10 | exec = 0
nmi 0.77841 | points 0 # k = 6 | exec = 0
nmi 0.77275 | points 0 # k = 6 | exec = 3
nmi 0.77093 | points 0 # k = 5 | exec = 1
nmi 0.76569 | points 0 # k = 10 | exec = 4
nmi 0.76432 | points 0 # k = 6 | exec = 2
nmi 0.76279 | points 0 # k = 6 | exec = 1
nmi 0.76265 | points 0 # k = 10 | exec = 2
nmi 0.76221 | points 0 # k = 6 | exec = 4
nmi 0.76203 | points 0 # k = 10 | exec = 3
nmi 0.75906 | points 0 # k = 10 | exec = 1
nmi 0.73618 | points 0 # k = 3 | exec = 4
nmi 0.73618 | points 0 # k = 3 | exec = 0
nmi 0.70263 | points 0 # k = 4 | exec = 1
nmi 0.69301 | points 0 # k = 4 | exec = 3
nmi 0.69218 | points 0 # k = 4 | exec = 2
nmi 0.69144 | points 0 # k = 4 | exec = 4
nmi 0.69104 | points 0 # k = 4 | exec = 0
nmi 0.66391 | points 0 # k = 3 | exec = 1
nmi 0.66391 | points 0 # k = 

### Save file with best result

In [13]:
output_result = True

if output_result:
    # nmi_metric is stored in execution order, so its first entry is merely
    # the first run. Rank the runs by NMI and take the top one so that the
    # file really contains the best result.
    best_labels = get_sorted_nmi(nmi_metric, nmi_d_type)[0]

    # select the indices of the best result
    i, j = int(best_labels['i']), int(best_labels['j'])

    log_labels(labels_result[i, j], "k-means-best.txt")

## Agglomerative hierarchical clustering

In [14]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

### Parameters

In [15]:
# Execution parameters of the algorithm: linkage methods and cluster counts to try.
linkage_metrics = [
    'single',
    'complete',
    'average',
    'centroid',
    'median',
    'ward',
]
range_n_clusters = range(2, 8)

# Accumulator for the NMI records and the structured dtype used to sort them.
nmi_metric = list()
nmi_d_type = [('nmi', float), ('points', int), ('i', int), ('j', int)]

# Buffer for the labels of every run, indexed by (linkage method, cluster
# setting), kept so that a result can be logged to a file afterwards.
labels_result = np.zeros(shape=(len(linkage_metrics), len(range_n_clusters), len(data)))

### Execution

In [16]:
# Start from an empty record list so that re-running this cell does not append
# duplicates of the runs collected by a previous execution.
nmi_metric.clear()

feature_matrix = preprocessed_data.to_numpy()

for i, linkage_metric in enumerate(linkage_metrics):
    # The hierarchy depends only on the linkage method, so it is built once
    # here and then cut at every requested number of clusters.
    linkage_matrix = linkage(feature_matrix, linkage_metric)

    for j, n_clusters in enumerate(range_n_clusters):
        # "maxclust" cuts the hierarchy so that at most n_clusters flat clusters are formed.
        labels = fcluster(linkage_matrix, n_clusters, criterion="maxclust")

        # storing the labels for future file logging.
        labels_result[i, j] = labels

        # Normalized mutual info metric
        nmi = normalized_mutual_info_score(data_classes, labels, average_method="geometric")
        nmi_metric.append((nmi, get_points(nmi), i, j))

### Results (NMI)

In [17]:
# Rank every run by NMI, best first.
sorted_nmi = get_sorted_nmi(nmi_metric, nmi_d_type)

# Report each run as plain tuples: (nmi, points, linkage index, cluster index).
for execution in sorted_nmi.tolist():
    print(f"nmi {np.round(execution[0], 5)} | points {execution[1]} # linkage = {linkage_metrics[execution[2]]} | k = {range_n_clusters[execution[3]]}")

nmi 0.86682 | points 4 # linkage = ward | k = 7
nmi 0.83196 | points 2 # linkage = ward | k = 6
nmi 0.82946 | points 2 # linkage = ward | k = 5
nmi 0.75069 | points 0 # linkage = ward | k = 4
nmi 0.68164 | points 0 # linkage = ward | k = 3
nmi 0.54932 | points 0 # linkage = ward | k = 2
nmi 0.33259 | points 0 # linkage = complete | k = 7
nmi 0.30056 | points 0 # linkage = complete | k = 6
nmi 0.10565 | points 0 # linkage = complete | k = 5
nmi 0.07654 | points 0 # linkage = average | k = 7
nmi 0.07328 | points 0 # linkage = average | k = 6
nmi 0.04647 | points 0 # linkage = complete | k = 4
nmi 0.0369 | points 0 # linkage = median | k = 7
nmi 0.0369 | points 0 # linkage = median | k = 6
nmi 0.03468 | points 0 # linkage = single | k = 7
nmi 0.03462 | points 0 # linkage = centroid | k = 7
nmi 0.03275 | points 0 # linkage = average | k = 5
nmi 0.03215 | points 0 # linkage = centroid | k = 6
nmi 0.03215 | points 0 # linkage = centroid | k = 5
nmi 0.02996 | points 0 # linkage = single | k =

### Save file with best result 

In [18]:
output_result = True

if output_result:
    # nmi_metric is stored in execution order, so its first entry is merely
    # the first run. Rank the runs by NMI and take the top one so that the
    # file really contains the best result.
    best_labels = get_sorted_nmi(nmi_metric, nmi_d_type)[0]

    # select the indices of the best result
    i, j = int(best_labels['i']), int(best_labels['j'])

    log_labels(labels_result[i, j], "agglomerative-best.txt")

## Spectral clustering

In [19]:
from sklearn.cluster import SpectralClustering

### Parameters

In [20]:
# Execution parameters of the algorithm: affinities, cluster counts and gamma values to try.
affinity_types = ['nearest_neighbors', 'rbf']
range_n_clusters = [4, 5, 6]
range_gamma = [0.5, 1.0, 1.5]

# Accumulator for the NMI records and the structured dtype used to sort them.
nmi_metric = list()
nmi_d_type = [('nmi', float), ('points', int), ('i', int), ('j', int), ('k', int)]

# Buffer for the labels of every run, indexed by (affinity, cluster setting,
# gamma), kept so that a result can be logged to a file afterwards.
labels_result = np.zeros(
    shape=(len(affinity_types), len(range_n_clusters), len(range_gamma), len(data))
)

### Execution

In [21]:
# Start from an empty record list so that re-running this cell does not append
# duplicates of the runs collected by a previous execution.
nmi_metric.clear()

for i, affinity in enumerate(affinity_types):
    for j, n_clusters in enumerate(range_n_clusters):
        for k, gamma in enumerate(range_gamma):
            # gamma is a kernel coefficient: it matters for 'rbf' but is
            # ignored by scikit-learn for 'nearest_neighbors', which is why
            # those runs do not vary with gamma.
            # random_state fixes the randomised parts of the algorithm (such
            # as the k-means label assignment) so the grid is reproducible.
            sc = SpectralClustering(
                n_clusters=n_clusters,
                affinity=affinity,
                gamma=gamma,
                random_state=0,
            )
            labels = sc.fit_predict(preprocessed_data)

            # storing the labels for future file logging.
            labels_result[i, j, k] = labels

            # Normalized mutual info metric
            nmi = normalized_mutual_info_score(data_classes, labels, average_method="geometric")
            nmi_metric.append((nmi, get_points(nmi), i, j, k))



### Results (NMI)

In [22]:
# Rank every run by NMI, best first.
sorted_nmi = get_sorted_nmi(nmi_metric, nmi_d_type)

# Report each run as plain tuples: (nmi, points, affinity index, cluster index, gamma index).
for execution in sorted_nmi.tolist():
    print(f"nmi {np.round(execution[0], 5)} | points {execution[1]} # affinity = {affinity_types[execution[2]]} | k = {range_n_clusters[execution[3]]} | gamma = {range_gamma[execution[4]]}")

nmi 0.95138 | points 8 # affinity = nearest_neighbors | k = 5 | gamma = 1.5
nmi 0.95138 | points 8 # affinity = nearest_neighbors | k = 5 | gamma = 1.0
nmi 0.95138 | points 8 # affinity = nearest_neighbors | k = 5 | gamma = 0.5
nmi 0.89188 | points 5 # affinity = nearest_neighbors | k = 6 | gamma = 1.0
nmi 0.89188 | points 5 # affinity = nearest_neighbors | k = 6 | gamma = 1.5
nmi 0.89188 | points 5 # affinity = nearest_neighbors | k = 6 | gamma = 0.5
nmi 0.83636 | points 2 # affinity = nearest_neighbors | k = 4 | gamma = 1.5
nmi 0.83636 | points 2 # affinity = nearest_neighbors | k = 4 | gamma = 1.0
nmi 0.83636 | points 2 # affinity = nearest_neighbors | k = 4 | gamma = 0.5
nmi 0.01783 | points 0 # affinity = rbf | k = 5 | gamma = 1.0
nmi 0.01733 | points 0 # affinity = rbf | k = 4 | gamma = 1.0
nmi 0.01447 | points 0 # affinity = rbf | k = 4 | gamma = 1.5
nmi 0.01317 | points 0 # affinity = rbf | k = 6 | gamma = 0.5
nmi 0.01284 | points 0 # affinity = rbf | k = 6 | gamma = 1.0
nmi 0.

### Save file with best result

In [23]:
output_result = True

if output_result:
    # nmi_metric is stored in execution order, so its first entry is merely
    # the first run. Rank the runs by NMI and take the top one so that the
    # file really contains the best result.
    best_labels = get_sorted_nmi(nmi_metric, nmi_d_type)[0]

    # select the indices of the best result
    i, j, k = int(best_labels['i']), int(best_labels['j']), int(best_labels['k'])

    log_labels(labels_result[i, j, k], "spectral-best.txt")