# Genedata Task

**Preprocessing:**

* Scaling
    * Mean centering and dividing by standard deviation
    * Maximum absolute value scaling
* PCA

**Methods:**

* K-means
* Spectral clustering

**Results:**

Best NMI score: 0.99

Using:
* Spectral clustering
* 5 clusters
* Nearest neighbor affinity with n = 5
* PCA with 600 components kept

## Setup

In [194]:
import pandas as pd
import numpy as np

# Clustering methods
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering

# Metrics
from sklearn.metrics.cluster import normalized_mutual_info_score

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import PCA

In [61]:
# Load the genedata CSV; the first row of the file supplies the column names.
df = pd.read_csv("./data/genedata.csv", header = 0)
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,id,class,f1,f2,f3,f4,f5,f6,f7,f8,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
0,1,5,2.3383,10.5440,11.4653,10.4441,10.6311,7.7036,9.6444,7.8501,...,11.9394,11.1349,5.7066,8.2176,9.9062,9.0815,10.5098,11.8648,8.4341,6.7618
1,2,4,3.9151,9.5815,10.3992,9.8333,8.9781,7.0265,9.2761,7.3903,...,11.6547,7.7178,9.3558,7.5026,10.2106,8.9278,10.2466,11.3329,7.8209,7.4350
2,3,5,2.9322,9.3102,11.0756,9.8490,9.4291,8.0618,8.3874,7.9274,...,11.9372,11.2369,6.5860,9.1668,10.3315,9.7551,11.4392,10.7036,8.0041,8.5380
3,4,5,3.9470,8.5315,10.9836,9.3676,9.6856,7.5842,8.5774,8.3737,...,13.2541,11.8221,9.5261,7.4231,11.2862,9.8992,10.4067,11.9044,7.9843,8.5370
4,5,1,4.4618,9.1392,10.9183,10.0490,9.8290,6.5658,9.8122,9.0646,...,11.6416,11.5142,6.5539,7.8319,11.0100,10.1013,11.2682,10.8969,8.1959,8.2906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,791,1,4.1916,8.8362,11.0111,9.6675,10.0520,6.2790,9.8611,7.7249,...,7.0101,10.7721,5.6539,8.3391,11.5969,10.1878,10.9634,10.9893,9.0314,8.8375
791,792,4,4.5846,9.1122,11.6310,9.5083,10.2390,9.3738,9.8059,7.8844,...,12.4231,10.3401,4.9764,8.8977,10.0941,8.9387,10.5346,11.2762,8.4321,7.2204
792,793,3,4.6489,9.0206,10.6455,9.8452,10.4445,9.0456,10.5651,6.7399,...,7.3802,8.7899,7.2340,9.1537,10.5056,9.7452,10.1808,11.4399,8.3824,8.2929
793,794,5,3.3839,10.6693,11.4684,10.4659,9.8408,9.3388,9.4965,7.5221,...,11.5339,10.8192,7.1455,8.9878,11.2424,9.8839,10.3835,11.5674,8.3261,9.6444


In [83]:
# Ground-truth class labels; used only to score the clusterings below.
labels_true = df["class"].to_numpy()
# Feature matrix: every column from position 2 onwards.
# NOTE(review): presumably this skips the "id" and "class" columns — confirm
# against df.columns before changing the slice.
data = df.iloc[:, 2:].to_numpy()
# Use the builtin float (float64): the `np.float` alias was deprecated and
# later removed from NumPy, so it is unavailable on current releases.
data = data.astype(float)

In [96]:
# Scoring helper: one place that fixes the averaging method to "geometric"
def nmi_score(labels_true, labels_predicted):
    """Return the normalized mutual information between two labelings.

    Thin wrapper around ``normalized_mutual_info_score`` that always passes
    ``average_method="geometric"`` so every score in this notebook is
    computed the same way.

    Parameters
    ----------
    labels_true : reference labels, one per sample.
    labels_predicted : cluster labels to evaluate, one per sample.
    """
    return normalized_mutual_info_score(
        labels_true,
        labels_predicted,
        average_method="geometric",
    )

## Preprocessing

In [93]:
# Standard scaling: mean center and divide by standard deviation.
# The scaler is fitted on, and applied to, the full feature matrix; the
# result is kept separately in data_ss so the raw data stays available.
s_scaler = StandardScaler()
data_ss = s_scaler.fit_transform(data)

In [94]:
# Max abs scaling: scale each feature by its max abs value. No centering.
# The result is kept separately in data_mabs so the raw data stays available.
mabs_scaler = MaxAbsScaler()
data_mabs = mabs_scaler.fit_transform(data)

In [199]:
# Eight PCA projections of the raw data, each keeping 600 components.
# The loop value never changes n_components: it only tags the run in the
# printed progress and seeds the solver, so re-running this cell rebuilds the
# same eight projections.
# NOTE(review): with a matrix of this size PCA presumably picks the randomized
# SVD solver, which is why unseeded runs differ from each other — confirm.
data_pca = []
for seed in range(2, 40, 5):
    print(seed)
    pca = PCA(n_components=600, random_state=seed)
    data_pca.append(pca.fit_transform(data))

2
7
12
17
22
27
32
37


In [141]:
# The three non-PCA variants of the data: raw, standard-scaled, max-abs-scaled.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
data_list = [data, data_ss, data_mabs]

## K-means

### No preprocessing

In [86]:
# K-means on the raw data for K = 2..9, each run scored against the true
# classes with NMI.
# NOTE(review): newer scikit-learn releases name the "full" algorithm "lloyd" —
# confirm against the installed version.
labels_km = []
scores_km = []

for k in range(2, 10):
    print(k)  # progress marker
    model = KMeans(n_clusters=k, algorithm="full")
    model.fit(data)
    assigned = model.predict(data)
    labels_km.append(assigned)
    scores_km.append(nmi_score(labels_true, assigned))

print()
# scores_km[0] belongs to K = 2, hence the enumeration offset.
for k, nmi in enumerate(scores_km, start=2):
    print("K = {}: {}".format(k, round(nmi, 4)))

2
3
4
5
6
7
8
9

K = 2: 0.5341
K = 3: 0.7376
K = 4: 0.8147
K = 5: 0.8569
K = 6: 0.8912
K = 7: 0.8585
K = 8: 0.8239
K = 9: 0.7983


### With standard scaling

In [142]:
# K-means on the standard-scaled data for K = 2..9, each run scored against
# the true classes with NMI.
# NOTE(review): newer scikit-learn releases name the "full" algorithm "lloyd" —
# confirm against the installed version.
labels_km_pr = []
scores_km_pr = []

for k in range(2, 10):
    print(k)  # progress marker
    model = KMeans(n_clusters=k, algorithm="full")
    model.fit(data_ss)
    assigned = model.predict(data_ss)
    labels_km_pr.append(assigned)
    scores_km_pr.append(nmi_score(labels_true, assigned))

print()
# scores_km_pr[0] belongs to K = 2, hence the enumeration offset.
for k, nmi in enumerate(scores_km_pr, start=2):
    print("K = {}: {}".format(k, round(nmi, 4)))

2
3
4
5
6
7
8
9

K = 2: 0.5161
K = 3: 0.4814
K = 4: 0.712
K = 5: 0.8004
K = 6: 0.754
K = 7: 0.7833
K = 8: 0.7919
K = 9: 0.7829


### With max abs scaling

In [91]:
# K-means on the max-abs-scaled data for K = 2..9, each run scored against
# the true classes with NMI.
# NOTE(review): newer scikit-learn releases name the "full" algorithm "lloyd" —
# confirm against the installed version.
labels_km_mabs = []
scores_km_mabs = []

for k in range(2, 10):
    print(k)  # progress marker
    model = KMeans(n_clusters=k, algorithm="full")
    model.fit(data_mabs)
    assigned = model.predict(data_mabs)
    labels_km_mabs.append(assigned)
    scores_km_mabs.append(nmi_score(labels_true, assigned))

print()
# scores_km_mabs[0] belongs to K = 2, hence the enumeration offset.
for k, nmi in enumerate(scores_km_mabs, start=2):
    print("K = {}: {}".format(k, round(nmi, 4)))

2
3
4
5
6
7
8
9

K = 2: 0.5341
K = 3: 0.7363
K = 4: 0.8085
K = 5: 0.8535
K = 6: 0.8288
K = 7: 0.8414
K = 8: 0.7999
K = 9: 0.7863


## Spectral Clustering

### Nearest Neighbor

In [160]:
# Grid search: spectral clustering with nearest-neighbour affinity on the raw
# data, over 4..6 clusters and 4..9 neighbours.
# Each result row is [dataset index, n_clusters, n_neighbors, NMI].
scores_spc_nn_1 = []
labels_spc_nn_1 = []

# Only one dataset is searched in this cell, so the dataset index is always 0.
for data_set_num, dt in enumerate([data]):
    for nc in range(4, 7):  # number of clusters
        for nn in range(4, 10):  # neighbours used to build the affinity graph
            spc = SpectralClustering(
                n_clusters=nc,
                affinity="nearest_neighbors",
                n_neighbors=nn,
                n_jobs=3,
            )

            labels = spc.fit_predict(dt)
            labels_spc_nn_1.append(labels)

            score = [data_set_num, nc, nn, nmi_score(labels_true, labels)]
            print(score)
            scores_spc_nn_1.append(score)

[0, 4, 4, 0.9242274366149583]
[0, 4, 5, 0.9242274366149583]
[0, 4, 6, 0.9242274366149583]
[0, 4, 7, 0.9242274366149583]
[0, 4, 8, 0.9242274366149583]
[0, 4, 9, 0.8660362807864445]
[0, 5, 4, 0.9806599357371502]
[0, 5, 5, 0.9847702095491678]
[0, 5, 6, 0.9757257926855084]
[0, 5, 7, 0.9762785050842377]
[0, 5, 8, 0.9707082163427543]
[0, 5, 9, 0.9721787806206892]
[0, 6, 4, 0.9245560556676519]
[0, 6, 5, 0.937244416777298]
[0, 6, 6, 0.9329041202533983]
[0, 6, 7, 0.9295109398012779]
[0, 6, 8, 0.9301271802956209]
[0, 6, 9, 0.9344621922389881]


In [161]:
# Grid search: spectral clustering with nearest-neighbour affinity on the
# standard-scaled data, over 4..6 clusters and 4..9 neighbours.
# Each result row is [dataset index, n_clusters, n_neighbors, NMI].
scores_spc_nn_2 = []
labels_spc_nn_2 = []

# Only one dataset is searched in this cell, so the dataset index is always 0
# (here it stands for data_ss, not the raw data).
for data_set_num, dt in enumerate([data_ss]):
    for nc in range(4, 7):  # number of clusters
        for nn in range(4, 10):  # neighbours used to build the affinity graph
            spc = SpectralClustering(
                n_clusters=nc,
                affinity="nearest_neighbors",
                n_neighbors=nn,
                n_jobs=3,
            )

            labels = spc.fit_predict(dt)
            labels_spc_nn_2.append(labels)

            score = [data_set_num, nc, nn, nmi_score(labels_true, labels)]
            print(score)
            scores_spc_nn_2.append(score)

[0, 4, 4, 0.8965106299116461]
[0, 4, 5, 0.8981643894047692]
[0, 4, 6, 0.8981643894047693]
[0, 4, 7, 0.8391665746262451]
[0, 4, 8, 0.8277858705114417]
[0, 4, 9, 0.8364955870323535]
[0, 5, 4, 0.9088623597457636]
[0, 5, 5, 0.9485802550113133]
[0, 5, 6, 0.9521905204425134]
[0, 5, 7, 0.9665000951941086]
[0, 5, 8, 0.9527958938773573]
[0, 5, 9, 0.9434884221018295]
[0, 6, 4, 0.9086605946294803]
[0, 6, 5, 0.9088108202075945]
[0, 6, 6, 0.9099455343143567]
[0, 6, 7, 0.924778608492221]
[0, 6, 8, 0.909440215045878]
[0, 6, 9, 0.9034440233920327]


In [177]:
# Tabulate the grid-search rows and show the best-scoring configuration.
df_spc_nn_2 = pd.DataFrame(data=scores_spc_nn_2, columns=["ds", "nc", "nn", "score"])
# idxmax() returns an index *label*, so select the row with label-based .loc
# rather than positional .iloc (the two only coincide for a default index).
df_spc_nn_2.loc[df_spc_nn_2["score"].idxmax(), :]

ds       0.0000
nc       5.0000
nn       7.0000
score    0.9665
Name: 9, dtype: float64

In [173]:
# Grid search: spectral clustering with nearest-neighbour affinity on the
# max-abs-scaled data, over 4..6 clusters and 4..9 neighbours.
# Each result row is [dataset index, n_clusters, n_neighbors, NMI].
scores_spc_nn_3 = []
labels_spc_nn_3 = []

# Only one dataset is searched in this cell, so the dataset index is always 0
# (here it stands for data_mabs, not the raw data).
for data_set_num, dt in enumerate([data_mabs]):
    for nc in range(4, 7):  # number of clusters
        for nn in range(4, 10):  # neighbours used to build the affinity graph
            spc = SpectralClustering(
                n_clusters=nc,
                affinity="nearest_neighbors",
                n_neighbors=nn,
                n_jobs=3,
            )

            labels = spc.fit_predict(dt)
            labels_spc_nn_3.append(labels)

            score = [data_set_num, nc, nn, nmi_score(labels_true, labels)]
            print(score)
            scores_spc_nn_3.append(score)

[0, 4, 4, 0.9205179669041921]
[0, 4, 5, 0.861240396022072]
[0, 4, 6, 0.861449529140161]
[0, 4, 7, 0.8660362807864445]
[0, 4, 8, 0.9242274366149583]
[0, 4, 9, 0.8598605227269055]
[0, 5, 4, 0.9774353402417695]
[0, 5, 5, 0.975961611862692]
[0, 5, 6, 0.9566327824128877]
[0, 5, 7, 0.9774353402417695]
[0, 5, 8, 0.9654463689286721]
[0, 5, 9, 0.9774353402417695]
[0, 6, 4, 0.9209520935768423]
[0, 6, 5, 0.9323883209786678]
[0, 6, 6, 0.9256868597089081]
[0, 6, 7, 0.9295109398012779]
[0, 6, 8, 0.9295109398012779]
[0, 6, 9, 0.9344621922389882]


In [234]:
# Tabulate the grid-search rows and show the best-scoring configuration.
df_spc_nn_3 = pd.DataFrame(data=scores_spc_nn_3, columns=["ds", "nc", "nn", "score"])
# idxmax() returns an index *label*, so select the row with label-based .loc
# rather than positional .iloc (the two only coincide for a default index).
df_spc_nn_3.loc[df_spc_nn_3["score"].idxmax(), :]

ds       0.000000
nc       5.000000
nn       4.000000
score    0.977435
Name: 6, dtype: float64

Spectral clustering with nearest-neighbor affinity and 4 or 5 neighbors seems to work best.

### PCA

In [250]:
# Run one fixed configuration (5 clusters, nearest-neighbour affinity with
# 5 neighbours) on every PCA projection and record labels and NMI per run.
scores_spc_nn_pca = []
labels_spc_nn_pca = []

# A single estimator is re-fitted on each projection in turn.
spc = SpectralClustering(
    n_clusters=5,
    affinity="nearest_neighbors",
    n_neighbors=5,
    n_jobs=3,
)

for run, projected in enumerate(data_pca):
    labels = spc.fit_predict(projected)
    score = nmi_score(labels_true, labels)

    labels_spc_nn_pca.append(labels)
    scores_spc_nn_pca.append(score)

    print(run, score)
    # Flag runs whose NMI exceeds 0.99.
    if score > 0.99:
        print("x")

0 0.9850810002988509
1 0.9903424385740336
x
2 0.9850810002988509
3 0.9903424385740336
x
4 0.9773028599033775
5 0.9850810002988509
6 0.9773028599033773
7 0.9850810002988512


In [251]:
# Position of the PCA run with the highest NMI (the first one on ties).
max_i = np.argmax(scores_spc_nn_pca)
# Cluster labels of that run; these are what the final cell writes to disk.
max_labels = labels_spc_nn_pca[max_i]
# Bare expression so the notebook displays the winning score.
scores_spc_nn_pca[max_i]

0.9903424385740336

This is the best NMI score I've found: PCA with 600 components, followed by spectral clustering with nearest-neighbor affinity (n = 5).

### RBF

In [181]:
# Spectral clustering with an RBF affinity on the raw (index 0) and
# standard-scaled (index 1) data, for 4..6 clusters.
# Each result row is [dataset index, n_clusters, NMI].
# NOTE(review): gamma is left at its default; the near-zero NMI in the recorded
# output suggests the kernel width may be badly matched to this data — consider
# tuning gamma before ruling out the RBF affinity.
scores_spc_rbf = []
labels_spc_rbf = []

for data_set_num, dt in enumerate([data, data_ss]):
    for nc in range(4, 7):  # number of clusters
        spc = SpectralClustering(
            n_clusters=nc,
            affinity="rbf",
            n_jobs=3,
        )

        labels = spc.fit_predict(dt)
        labels_spc_rbf.append(labels)

        score = [data_set_num, nc, nmi_score(labels_true, labels)]
        print(score)
        scores_spc_rbf.append(score)

[0, 4, 0.007488684271591476]
[0, 5, 0.0045631432771643995]
[0, 6, 0.005780843740122316]
[1, 4, 0.003801310684798419]
[1, 5, 0.004920963380551922]
[1, 6, 0.0070302288715797835]


In [183]:
# Tabulate the RBF results and show the best-scoring configuration.
df_spc_rbf = pd.DataFrame(data=scores_spc_rbf, columns=["ds", "nc", "score"])
# idxmax() returns an index *label*, so select the row with label-based .loc
# rather than positional .iloc (the two only coincide for a default index).
df_spc_rbf.loc[df_spc_rbf["score"].idxmax(), :]

ds       0.000000
nc       4.000000
score    0.007489
Name: 0, dtype: float64

## Solution

The best method we found was spectral clustering on PCA-preprocessed data (600 components kept), with 5 clusters and an affinity matrix built from the n nearest neighbors with n = 5. The resulting NMI was ~0.9903.

In [252]:
# Write the best labelling to disk, one cluster label per line.
with open("solution1.txt", "w") as solution:
    solution.writelines(str(label) + "\n" for label in max_labels)