# Clustering for genedata

This notebook contains the code for the clustering of the genedata dataset.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Auxiliary functions

In [2]:
# Folder prefix for the result files. It is concatenated directly with the
# file name when a result is written, so it must end with a path separator.
result_folder = "result/"

Auxiliary function used to calculate the score (points) that corresponds to a given NMI value.

In [3]:
def get_points(nmi):
    """Translate an NMI value into its score.

    The score is 10 for ``nmi >= 0.98`` and drops by one point at each lower
    threshold, down to 1 for ``nmi >= 0.8``. Anything below 0.8 scores 0.
    """
    # Lower bounds in descending order; the position in the tuple fixes the score.
    lower_bounds = (0.98, 0.96, 0.94, 0.92, 0.9, 0.88, 0.86, 0.84, 0.82, 0.8)

    for rank, lower_bound in enumerate(lower_bounds):
        if nmi >= lower_bound:
            return 10 - rank

    return 0

In [4]:
def get_sorted_nmi(nmi_values, dtype):
    """Build a structured array from ``nmi_values`` and return it ordered by
    its 'nmi' field, from the highest value to the lowest.
    """
    records = np.array(nmi_values, dtype=dtype)
    # Sort ascending in place on the freshly created array, then reverse it.
    records.sort(order='nmi')
    return np.flip(records)

In [5]:
def log_labels(labels, file_name):
    """Write one integer label per line to ``result_folder + file_name``.

    The destination directory is created when it does not exist yet, so the
    notebook can be run on a fresh checkout without failing on ``open``.

    Parameters
    ----------
    labels : iterable of numbers
        Labels to write; each one is converted with ``int`` before writing.
    file_name : str
        Name of the output file, appended to the module-level ``result_folder``.
    """
    import os  # local import keeps the helper independent of the import cell

    output_path = f"{result_folder}{file_name}"

    # An empty directory name means "current directory": nothing to create.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w') as f:
        f.writelines(f"{int(label)}\n" for label in labels)

## Preprocessing

In [6]:
data = pd.read_csv("../../data/genedata.csv")

# One is subtracted from every class label in order to have coincident results.
data['class'] = data['class'] - 1
data_classes = [label for label in data['class']]

data

Unnamed: 0,id,class,f1,f2,f3,f4,f5,f6,f7,f8,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
0,1,4,4.8778,4.4118,9.0690,10.0966,11.1664,10.2457,8.2541,12.0844,...,11.4206,9.5440,8.5381,11.4735,10.1581,7.8083,8.7157,9.4212,9.9236,8.5223
1,2,3,8.9505,2.7079,10.0899,6.5899,9.5933,8.3250,9.9028,12.4863,...,13.0857,9.4303,11.0118,9.8609,8.9299,10.4229,10.5660,7.9997,9.9458,10.5250
2,3,4,4.7239,3.7181,9.2404,10.1070,10.0589,9.5977,8.3794,12.8728,...,8.2631,9.7016,7.7138,14.3444,9.2942,8.5537,8.7280,7.9019,11.4660,8.9878
3,4,4,7.6797,4.1321,9.3522,9.3646,10.2057,9.4130,8.8524,13.2288,...,10.7985,9.1055,8.4670,11.4066,10.0314,8.3365,9.1870,7.7253,10.1910,9.7299
4,5,0,6.1541,5.9960,9.5418,8.9614,9.6077,9.3430,9.9071,13.1981,...,12.2830,8.7680,9.1708,9.9139,9.4670,9.8127,9.2294,7.7958,9.2692,10.8166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,791,0,5.5051,5.5937,8.8036,9.8995,10.2081,7.0993,10.0181,13.7360,...,12.3530,9.2083,8.2600,10.0181,9.1594,9.1573,8.9010,7.5171,8.9912,10.5179
791,792,3,6.4445,6.3149,9.9281,5.7244,10.0598,8.3327,8.9625,12.7977,...,11.4967,9.3160,11.3185,9.7353,9.0437,9.8171,7.9135,8.8159,10.6293,10.6889
792,793,2,9.3204,8.1726,9.1752,9.9419,10.7564,9.7459,9.9507,12.5833,...,11.1193,9.0767,9.2110,10.0821,9.6255,10.9260,7.9507,7.5112,8.6089,10.1170
793,794,4,6.7255,5.5300,9.6569,9.7835,10.8310,9.3115,8.4064,12.3590,...,10.1283,9.3001,8.4043,10.9690,8.9199,9.2516,8.1096,8.3683,11.4727,9.8517


In [7]:
# Remove the `id` and `class` columns; the remaining columns are the input
# handed to the clustering algorithms.
preprocessed_data = data.drop(['id', 'class'], axis=1)

preprocessed_data

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
0,4.8778,4.4118,9.0690,10.0966,11.1664,10.2457,8.2541,12.0844,6.8658,5.7480,...,11.4206,9.5440,8.5381,11.4735,10.1581,7.8083,8.7157,9.4212,9.9236,8.5223
1,8.9505,2.7079,10.0899,6.5899,9.5933,8.3250,9.9028,12.4863,7.5498,9.5101,...,13.0857,9.4303,11.0118,9.8609,8.9299,10.4229,10.5660,7.9997,9.9458,10.5250
2,4.7239,3.7181,9.2404,10.1070,10.0589,9.5977,8.3794,12.8728,8.8574,5.8791,...,8.2631,9.7016,7.7138,14.3444,9.2942,8.5537,8.7280,7.9019,11.4660,8.9878
3,7.6797,4.1321,9.3522,9.3646,10.2057,9.4130,8.8524,13.2288,9.2525,5.7487,...,10.7985,9.1055,8.4670,11.4066,10.0314,8.3365,9.1870,7.7253,10.1910,9.7299
4,6.1541,5.9960,9.5418,8.9614,9.6077,9.3430,9.9071,13.1981,8.8106,5.5387,...,12.2830,8.7680,9.1708,9.9139,9.4670,9.8127,9.2294,7.7958,9.2692,10.8166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,5.5051,5.5937,8.8036,9.8995,10.2081,7.0993,10.0181,13.7360,8.9059,6.4352,...,12.3530,9.2083,8.2600,10.0181,9.1594,9.1573,8.9010,7.5171,8.9912,10.5179
791,6.4445,6.3149,9.9281,5.7244,10.0598,8.3327,8.9625,12.7977,8.8633,7.3106,...,11.4967,9.3160,11.3185,9.7353,9.0437,9.8171,7.9135,8.8159,10.6293,10.6889
792,9.3204,8.1726,9.1752,9.9419,10.7564,9.7459,9.9507,12.5833,8.9300,7.8694,...,11.1193,9.0767,9.2110,10.0821,9.6255,10.9260,7.9507,7.5112,8.6089,10.1170
793,6.7255,5.5300,9.6569,9.7835,10.8310,9.3115,8.4064,12.3590,8.3744,6.3127,...,10.1283,9.3001,8.4043,10.9690,8.9199,9.2516,8.1096,8.3683,11.4727,9.8517


A min-max scaler (`MinMaxScaler`) is applied to the dataframe, rescaling every feature to the [0, 1] range.

In [8]:
# Keep the column labels so the scaled array can be wrapped back into a frame.
feature_names = preprocessed_data.columns

# Fit the min-max scaler on the data and rescale that same data.
scaler = MinMaxScaler()
scaler.fit(preprocessed_data)
preprocessed_numpy_data = scaler.transform(preprocessed_data)

preprocessed_data = pd.DataFrame(data=preprocessed_numpy_data, columns=feature_names)
preprocessed_data

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
0,0.335782,0.343255,0.378382,0.754261,0.845787,0.631436,0.129691,0.319934,0.414975,0.363775,...,0.544407,0.855680,0.558119,0.726229,0.634065,0.289164,0.527104,0.787516,0.628788,0.238904
1,0.719286,0.158421,0.622623,0.403934,0.477776,0.366592,0.341669,0.420782,0.510401,0.866145,...,0.762780,0.845486,0.719820,0.572450,0.321323,0.739809,0.834024,0.553463,0.632672,0.612869
2,0.321290,0.268005,0.419388,0.755300,0.586698,0.542084,0.145801,0.517766,0.692825,0.381281,...,0.130308,0.869810,0.504236,1.000000,0.414086,0.417639,0.529144,0.537360,0.898646,0.325827
3,0.599621,0.312914,0.446135,0.681132,0.621041,0.516616,0.206616,0.607096,0.747946,0.363868,...,0.462820,0.816366,0.553471,0.719849,0.601803,0.380203,0.605281,0.508282,0.675572,0.464400
4,0.455964,0.515105,0.491495,0.640852,0.481144,0.506963,0.342222,0.599393,0.686296,0.335826,...,0.657508,0.786107,0.599477,0.577504,0.458087,0.634637,0.612315,0.519890,0.514294,0.667320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,0.394851,0.471465,0.314888,0.734570,0.621602,0.197581,0.356494,0.734367,0.699591,0.455540,...,0.666689,0.825583,0.539940,0.587441,0.379762,0.521674,0.557841,0.474001,0.465655,0.611544
791,0.483309,0.549699,0.583913,0.317469,0.586909,0.367654,0.220772,0.498921,0.693648,0.572436,...,0.554387,0.835239,0.739868,0.560473,0.350300,0.635395,0.394038,0.687852,0.752257,0.643475
792,0.754117,0.751218,0.403790,0.738806,0.749871,0.562519,0.347828,0.445122,0.702953,0.647055,...,0.504892,0.813784,0.602105,0.593544,0.498447,0.826522,0.400209,0.473030,0.398768,0.536683
793,0.509770,0.464555,0.519032,0.722981,0.767323,0.502620,0.149273,0.388839,0.625441,0.439182,...,0.374925,0.833813,0.549372,0.678119,0.318777,0.537927,0.426567,0.614154,0.899818,0.487144


## k-means clustering

In [9]:
from sklearn.cluster import KMeans

### Parameters

In [10]:
# Execution parameters of the algorithm: the candidate numbers of clusters
# and the repetitions performed for each of them.
range_n_clusters = [2, 3, 4, 5, 6, 10]
range_exec = range(5)

# Parameters of the NMI metric: collected results and their record layout.
nmi_metric = list()
nmi_d_type = [('nmi', float), ('points', int), ('i', int), ('j', int)]

# Buffer for logging the result: one row of labels per (cluster option, run).
labels_shape = (len(range_n_clusters), len(range_exec), len(data))
labels_result = np.zeros(labels_shape)

### Execution

In [11]:
# Start from an empty result list so that re-running this cell does not
# append duplicate entries to the ranking.
nmi_metric = []

for i, n_cluster in enumerate(range_n_clusters):
    for j, seed in enumerate(range_exec):
        # Every repetition is seeded with its own value from `range_exec`:
        # runs still differ from each other, but a full re-run of the notebook
        # reproduces the same labels, ranking and saved file.
        km = KMeans(n_clusters=n_cluster, random_state=seed)
        labels = km.fit_predict(preprocessed_data)

        # Store the labels for future file logging.
        labels_result[i, j] = labels

        # Normalized mutual information against the reference classes.
        nmi = normalized_mutual_info_score(data_classes, labels, average_method="geometric")
        nmi_metric.append((nmi, get_points(nmi), i, j))

### Results (NMI)

In [12]:
# Rank every k-means run from the highest to the lowest NMI.
sorted_nmi = get_sorted_nmi(nmi_metric, nmi_d_type)

for record in sorted_nmi:
    rounded_nmi = np.round(record['nmi'], 5)
    clusters_used = range_n_clusters[record['i']]
    run_id = range_exec[record['j']]
    print(f"nmi {rounded_nmi} | points {record['points']} # k = {clusters_used} | exec = {run_id}")

nmi 0.82783 | points 2 # k = 5 | exec = 4
nmi 0.82599 | points 2 # k = 5 | exec = 0
nmi 0.82215 | points 2 # k = 5 | exec = 3
nmi 0.8207 | points 2 # k = 5 | exec = 2
nmi 0.81881 | points 1 # k = 5 | exec = 1
nmi 0.79948 | points 0 # k = 6 | exec = 3
nmi 0.79495 | points 0 # k = 6 | exec = 4
nmi 0.78851 | points 0 # k = 6 | exec = 2
nmi 0.78806 | points 0 # k = 6 | exec = 1
nmi 0.77624 | points 0 # k = 6 | exec = 0
nmi 0.76027 | points 0 # k = 4 | exec = 0
nmi 0.75908 | points 0 # k = 10 | exec = 2
nmi 0.75886 | points 0 # k = 10 | exec = 3
nmi 0.75815 | points 0 # k = 4 | exec = 4
nmi 0.75681 | points 0 # k = 4 | exec = 1
nmi 0.75538 | points 0 # k = 4 | exec = 2
nmi 0.75056 | points 0 # k = 10 | exec = 0
nmi 0.74979 | points 0 # k = 4 | exec = 3
nmi 0.74772 | points 0 # k = 10 | exec = 1
nmi 0.73736 | points 0 # k = 10 | exec = 4
nmi 0.73618 | points 0 # k = 3 | exec = 4
nmi 0.73618 | points 0 # k = 3 | exec = 1
nmi 0.73618 | points 0 # k = 3 | exec = 2
nmi 0.73618 | points 0 # k = 3

### Save file with best result

In [13]:
output_result = True

if output_result:
    # The ranking is in descending order, so its first record is the best run.
    best_labels = sorted_nmi[0]

    # The 'i' and 'j' fields locate the labels stored for that run.
    i = best_labels['i']
    j = best_labels['j']

    log_labels(labels_result[i][j], "k-means-best.txt")

## Agglomerative hierarchical clustering

In [14]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

### Parameters

In [15]:
# Execution parameters of the algorithm: the linkage methods to try and the
# numbers of flat clusters requested for each of them.
linkage_metrics = ['single', 'complete', 'average', 'centroid', 'median', 'ward']
range_n_clusters = range(2, 8)

# Parameters of the NMI metric: collected results and their record layout.
nmi_metric = list()
nmi_d_type = [('nmi', float), ('points', int), ('i', int), ('j', int)]

# Buffer for logging the result: one row of labels per (linkage, cluster option).
labels_shape = (len(linkage_metrics), len(range_n_clusters), len(data))
labels_result = np.zeros(labels_shape)

### Execution

In [16]:
# Start from an empty result list so that re-running this cell does not
# append duplicate entries to the ranking.
nmi_metric = []

# The feature matrix is the same for every linkage computation.
feature_matrix = preprocessed_data.to_numpy()

for i, linkage_metric in enumerate(linkage_metrics):
    # The hierarchy depends only on the linkage method, not on the requested
    # number of clusters, so it is built once per method instead of once per
    # (method, n_clusters) pair.
    linkage_matrix = linkage(feature_matrix, linkage_metric)

    for j, n_clusters in enumerate(range_n_clusters):
        # Cut the hierarchy into at most `n_clusters` flat clusters.
        labels = fcluster(linkage_matrix, n_clusters, criterion="maxclust")

        # Store the labels for future file logging.
        labels_result[i, j] = labels

        # Normalized mutual information against the reference classes.
        nmi = normalized_mutual_info_score(data_classes, labels, average_method="geometric")
        nmi_metric.append((nmi, get_points(nmi), i, j))

### Results (NMI)

In [17]:
# Rank every agglomerative run from the highest to the lowest NMI.
sorted_nmi = get_sorted_nmi(nmi_metric, nmi_d_type)

for record in sorted_nmi:
    rounded_nmi = np.round(record['nmi'], 5)
    linkage_used = linkage_metrics[record['i']]
    clusters_used = range_n_clusters[record['j']]
    print(f"nmi {rounded_nmi} | points {record['points']} # linkage = {linkage_used} | k = {clusters_used}")

nmi 0.86673 | points 4 # linkage = ward | k = 7
nmi 0.82745 | points 2 # linkage = ward | k = 6
nmi 0.81825 | points 1 # linkage = ward | k = 5
nmi 0.74042 | points 0 # linkage = ward | k = 4
nmi 0.67428 | points 0 # linkage = ward | k = 3
nmi 0.54932 | points 0 # linkage = ward | k = 2
nmi 0.53529 | points 0 # linkage = complete | k = 7
nmi 0.51687 | points 0 # linkage = complete | k = 6
nmi 0.33709 | points 0 # linkage = complete | k = 5
nmi 0.27509 | points 0 # linkage = complete | k = 4
nmi 0.23972 | points 0 # linkage = complete | k = 3
nmi 0.07585 | points 0 # linkage = average | k = 7
nmi 0.07365 | points 0 # linkage = average | k = 6
nmi 0.04035 | points 0 # linkage = centroid | k = 7
nmi 0.03963 | points 0 # linkage = centroid | k = 6
nmi 0.03539 | points 0 # linkage = centroid | k = 5
nmi 0.03445 | points 0 # linkage = average | k = 5
nmi 0.03334 | points 0 # linkage = median | k = 7
nmi 0.03334 | points 0 # linkage = median | k = 6
nmi 0.0314 | points 0 # linkage = single | 

### Save file with best result 

In [18]:
output_result = True

if output_result:
    # The ranking is in descending order, so its first record is the best run.
    best_labels = sorted_nmi[0]

    # The 'i' and 'j' fields locate the labels stored for that run.
    i = best_labels['i']
    j = best_labels['j']

    log_labels(labels_result[i][j], "agglomerative-best.txt")

## Spectral clustering

In [19]:
from sklearn.cluster import SpectralClustering

### Parameters

In [20]:
# Execution parameters of the algorithm: affinity types, numbers of clusters
# and gamma values that are combined in the grid below.
affinity_types = ['nearest_neighbors', 'rbf']
range_n_clusters = [4, 5, 6]
range_gamma = [0.5, 1.0, 1.5]

# Parameters of the NMI metric: collected results and their record layout.
nmi_metric = list()
nmi_d_type = [('nmi', float), ('points', int), ('i', int), ('j', int), ('k', int)]

# Buffer for logging the result: one row of labels per
# (affinity, cluster option, gamma) combination.
labels_shape = (len(affinity_types), len(range_n_clusters), len(range_gamma), len(data))
labels_result = np.zeros(labels_shape)

### Execution

In [21]:
# Start from an empty result list so that re-running this cell does not
# append duplicate entries to the ranking.
nmi_metric = []

for i, affinity in enumerate(affinity_types):
    for j, n_clusters in enumerate(range_n_clusters):
        for k, gamma in enumerate(range_gamma):
            # A fixed random_state makes the ranking and the saved file
            # reproducible across notebook runs.
            # NOTE: scikit-learn ignores `gamma` for the 'nearest_neighbors'
            # affinity, so those runs only differ in `n_clusters`.
            sc = SpectralClustering(
                n_clusters=n_clusters,
                affinity=affinity,
                gamma=gamma,
                random_state=0,
            )
            labels = sc.fit_predict(preprocessed_data)

            # Store the labels for future file logging.
            labels_result[i, j, k] = labels

            # Normalized mutual information against the reference classes.
            nmi = normalized_mutual_info_score(data_classes, labels, average_method="geometric")
            nmi_metric.append((nmi, get_points(nmi), i, j, k))

  ).fit(X, sample_weight=sample_weight)
  ).fit(X, sample_weight=sample_weight)
  ).fit(X, sample_weight=sample_weight)
  ).fit(X, sample_weight=sample_weight)
  ).fit(X, sample_weight=sample_weight)


### Results (NMI)

In [22]:
# Rank every spectral-clustering run from the highest to the lowest NMI.
sorted_nmi = get_sorted_nmi(nmi_metric, nmi_d_type)

for record in sorted_nmi:
    rounded_nmi = np.round(record['nmi'], 5)
    affinity_used = affinity_types[record['i']]
    clusters_used = range_n_clusters[record['j']]
    gamma_used = range_gamma[record['k']]
    print(f"nmi {rounded_nmi} | points {record['points']} # affinity = {affinity_used} | k = {clusters_used} | gamma = {gamma_used}")

nmi 0.96127 | points 9 # affinity = nearest_neighbors | k = 5 | gamma = 0.5
nmi 0.96127 | points 9 # affinity = nearest_neighbors | k = 5 | gamma = 1.5
nmi 0.96127 | points 9 # affinity = nearest_neighbors | k = 5 | gamma = 1.0
nmi 0.87903 | points 4 # affinity = nearest_neighbors | k = 6 | gamma = 1.5
nmi 0.87903 | points 4 # affinity = nearest_neighbors | k = 6 | gamma = 1.0
nmi 0.87903 | points 4 # affinity = nearest_neighbors | k = 6 | gamma = 0.5
nmi 0.83916 | points 2 # affinity = nearest_neighbors | k = 4 | gamma = 1.5
nmi 0.83916 | points 2 # affinity = nearest_neighbors | k = 4 | gamma = 1.0
nmi 0.83916 | points 2 # affinity = nearest_neighbors | k = 4 | gamma = 0.5
nmi 0.02832 | points 0 # affinity = rbf | k = 6 | gamma = 0.5
nmi 0.02431 | points 0 # affinity = rbf | k = 5 | gamma = 1.0
nmi 0.02431 | points 0 # affinity = rbf | k = 5 | gamma = 0.5
nmi 0.02283 | points 0 # affinity = rbf | k = 6 | gamma = 1.0
nmi 0.02283 | points 0 # affinity = rbf | k = 4 | gamma = 1.0
nmi 0.

### Save file with best result

In [1]:
output_result = True

if output_result:
    # The ranking is in descending order, so its first record is the best run.
    # `sorted_nmi` comes from the results cell of this section.
    best_labels = sorted_nmi[0]

    # The 'i', 'j' and 'k' fields locate the labels stored for that run.
    i = best_labels['i']
    j = best_labels['j']
    k = best_labels['k']

    log_labels(labels_result[i][j][k], "spectral-best.txt")

NameError: name 'sorted_nmi' is not defined