# MSdata Task

**Preprocessing:**

* Scaling
    * Mean centering and dividing by standard deviation

**Methods:**

* K-means
* Spectral clustering
* Agglomerative clustering

**Results:**

Na

## Setup

In [1]:
import pandas as pd
import numpy as np

# Clustering methods
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralClustering

# Metrics
from sklearn.metrics.cluster import normalized_mutual_info_score

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import DistanceMetric

In [2]:
df = pd.read_csv("./data/msdata.csv", header = 0)
df

Unnamed: 0,id,class,f1,f2,f3,f4,f5,f6,f7,f8,...,f4991,f4992,f4993,f4994,f4995,f4996,f4997,f4998,f4999,f5000
0,1,1,14.7476,15.3005,17.4703,14.8223,17.0740,14.7568,10.6366,11.3165,...,17.8866,6.8360,12.7977,15.6400,16.1506,17.1062,14.7575,13.4128,14.1470,18.4883
1,2,1,16.1612,15.6718,17.6609,17.6090,17.4288,15.4954,9.5740,14.8143,...,17.7767,15.2447,13.9961,15.9585,16.4648,17.4712,15.0304,14.1189,14.8040,19.4082
2,3,1,16.3225,15.4946,17.3564,14.8182,17.7625,14.4539,11.1484,13.8300,...,17.6984,11.6416,12.8275,14.9895,16.4143,17.6745,14.7615,12.9050,14.7798,18.9794
3,4,1,15.4337,14.6519,17.4111,16.6048,17.2341,15.0404,11.4655,10.3837,...,17.7769,15.0042,13.2926,16.0378,16.1182,16.7441,14.8373,13.3637,13.9702,18.1746
4,5,1,16.0579,15.1974,17.4781,16.7797,17.6097,15.4278,8.7012,13.9750,...,17.6669,15.1619,13.6530,15.3895,16.2457,17.4527,14.9707,12.5546,14.8331,19.0512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,690,3,11.0183,8.9493,17.5035,15.0010,17.5672,14.4627,8.2306,14.3206,...,9.9268,15.2167,13.5536,15.7954,9.8979,17.8046,18.2610,7.8796,6.9577,13.7735
690,691,3,8.9789,15.5676,17.6476,17.3428,10.6311,14.4161,16.4037,14.5159,...,17.8073,14.7698,14.6110,16.0002,16.1689,17.4165,18.3884,8.8354,10.4884,19.0953
691,692,3,16.1161,15.4710,9.8349,18.5084,8.6035,7.5401,14.2796,13.7709,...,17.6998,16.0514,9.4812,10.6424,16.0951,8.7562,18.6430,13.3147,13.6202,18.1803
692,693,3,16.0459,15.2779,17.5605,17.4232,16.8446,14.3926,17.6199,14.3125,...,17.8582,15.2462,14.0752,15.8942,15.9932,17.3534,18.6915,13.2952,14.7476,18.4206


In [3]:
labels_true = df["class"].to_numpy()
data = df.iloc[:, 2:].to_numpy()
data = data.astype(np.float)

In [4]:
print(data.max())
print(data.min())

26.3404
0.0


In [5]:
# Scoring helper to ensure geometric averaging method is always used
def nmi_score(labels_true, labels_predicted):
    return normalized_mutual_info_score(labels_true, labels_predicted, average_method="geometric")

## Preprocessing

In [6]:
# Standard scaling: mean center and divide by standard deviation
s_scaler = StandardScaler()
data_ss = s_scaler.fit_transform(data)

In [7]:
# Max abs scaling: scale each future by its max abs value. No centering
mabs_scaler = MaxAbsScaler()
data_mabs = mabs_scaler.fit_transform(data)

In [8]:
# Max abs + sigmoid
data_sig = 1 / (1 + np.exp(-data_ss))
data_mabs_sig = 1 / (1 + np.exp(-data_mabs))

In [9]:
data_pca = []
for i in range(600, 601, 20):
    print(i)
    data_pca.append(PCA(n_components=i).fit_transform(data_sig))

data_pca = PCA(n_components=600).fit_transform(data)
data_mabs_pca = PCA(n_components=600).fit_transform(data)

600


In [10]:
# Cosine similarity
data_cs = cosine_similarity(data)
data_cs_mabs = cosine_similarity(data_mabs)

In [11]:
# Euclidian similarity
data_es = 1 / (1 + euclidean_distances(data, data))
data_es_ss = 1 / (1 + euclidean_distances(data_ss, data_ss))
data_es_mabs = 1 / (1 + euclidean_distances(data_mabs, data_mabs))
data_es_mabs_2 = 1 / np.exp(euclidean_distances(data_mabs, data_mabs))

In [13]:
def to_similarity(X):
    return 1 / 1 + X

In [14]:
dist = DistanceMetric.get_metric('mahalanobis', VI = np.cov(data_mabs, rowvar = False))
data_mah = to_similarity(dist.pairwise(data_mabs))

In [86]:
np.cov(data_mabs).shape

(694, 694)

### WO/ Scaling

In [13]:
labels_km = []
scores_km = []

for i in range(2, 10):
    print(i)
    km = KMeans(n_clusters=int(i), algorithm="full").fit(data)
    labels = km.predict(data)
    labels_km.append(labels)
    scores_km.append(nmi_score(labels_true, labels))

print()
for i, s in enumerate(scores_km):
    print("K = {}: {}".format(i+2, round(s, 4)))

2
3
4
5
6
7
8
9

K = 2: 0.0003
K = 3: 0.1234
K = 4: 0.1056
K = 5: 0.1184
K = 6: 0.1161
K = 7: 0.0725
K = 8: 0.1191
K = 9: 0.0926


### W/ scaling

In [14]:
# Standart scaling
labels_km_ss = []
scores_km_ss = []

for i in range(2, 10):
    print(i)
    km = KMeans(n_clusters=int(i), algorithm="full").fit(data_ss)
    labels = km.predict(data_ss)
    labels_km_ss.append(labels)
    scores_km_ss.append(nmi_score(labels_true, labels))

print()
for i, s in enumerate(scores_km_ss):
    print("K = {}: {}".format(i+2, round(s, 4)))

2
3
4
5
6
7
8
9

K = 2: 0.0003
K = 3: 0.0031
K = 4: 0.1203
K = 5: 0.0083
K = 6: 0.0961
K = 7: 0.1774
K = 8: 0.1274
K = 9: 0.1436


In [15]:
# Maximum absolute scaling
labels_km_mabs = []
scores_km_mabs = []

for i in range(2, 10):
    print(i)
    km = KMeans(n_clusters=int(i), algorithm="full").fit(data_mabs)
    labels = km.predict(data_mabs)
    labels_km_mabs.append(labels)
    scores_km_mabs.append(nmi_score(labels_true, labels))

print()
for i, s in enumerate(scores_km_mabs):
    print("K = {}: {}".format(i+2, round(s, 4)))

2
3
4
5
6
7
8
9

K = 2: 0.1303
K = 3: 0.1649
K = 4: 0.0064
K = 5: 0.0083
K = 6: 0.0833
K = 7: 0.0928
K = 8: 0.1107
K = 9: 0.1231


In [16]:
results_km = {
    "K": list(range(2, 10)),
    "No": scores_km,
    "SS": scores_km_ss,
    "MABS": scores_km_mabs
}
pd.DataFrame(results_km)

Unnamed: 0,K,No,SS,MABS
0,2,0.000285,0.000285,0.1303
1,3,0.123413,0.003112,0.164914
2,4,0.105572,0.120323,0.006376
3,5,0.118397,0.008266,0.008266
4,6,0.11613,0.096138,0.083326
5,7,0.072541,0.177368,0.092821
6,8,0.119134,0.127375,0.110714
7,9,0.092578,0.143556,0.123068


From these results we can conclude that K-means is probably not going to work.

## Spectral Clustering

1. Nearest neighbor with nn = (1, 200) doesn't work
2. Standard scaled data with nn = (3, 20) doesn't work
3. PCA with nn = (3, 20) doesn't work
4. PCA with rbf doesn't work

### Nearest Neighbor

In [19]:
scores_spc_nn = []
labels_spc_nn = []

i = 0
for data_set_num, dt in enumerate([data_ss]): # Data preprocess type
    for nc in range(4, 7): # Cluster num
        for nn in range(3, 8, 2):
            spc = SpectralClustering(
                        n_clusters = nc,
                        affinity = "nearest_neighbors",
                        n_neighbors = nn,
                        n_jobs = 3
                    )

            labels = spc.fit_predict(dt)
            labels_spc_nn.append(labels)

            score = [data_set_num, nc, nn, nmi_score(labels_true, labels)]
            scores_spc_nn.append(score)

            print(score)
            if score[3] > 0.8:
                print("!!!!!!!")
                print("!!!!!!!")

[0, 4, 3, 0.029432307689934798]
[0, 4, 5, 0.030658931204656386]
[0, 4, 7, 0.008991302015115252]
[0, 5, 3, 0.01418595991360131]
[0, 5, 5, 0.014802242890002431]
[0, 5, 7, 0.017615166718864154]
[0, 6, 3, 0.013089927840981017]
[0, 6, 5, 0.013251677198136589]
[0, 6, 7, 0.018563208442725705]


In [20]:
pd.DataFrame(scores_spc_nn, columns = ["dt", "CN", "NN", "Score"]).iloc[:, 1:4]

Unnamed: 0,CN,NN,Score
0,4,3,0.029432
1,4,5,0.030659
2,4,7,0.008991
3,5,3,0.014186
4,5,5,0.014802
5,5,7,0.017615
6,6,3,0.01309
7,6,5,0.013252
8,6,7,0.018563


### RBF

In [26]:
scores_spc_rbf_1 = []
labels_spc_rbf_1 = []

for nc in range(4, 8):
    spc = SpectralClustering(
                n_clusters = 5,
                affinity = "rbf",
                n_jobs = 3
            )

    labels = spc.fit_predict(data)
    labels_spc_rbf_1.append(labels)

    score = nmi_score(labels_true, labels)
    scores_spc_rbf_1.append(score)

    print(score)


scores_spc_rbf_2 = []
labels_spc_rbf_2 = []

for nc in range(4, 8):
    spc = SpectralClustering(
                n_clusters = 5,
                affinity = "rbf",
                n_jobs = 3
            )

    labels = spc.fit_predict(data_mabs)
    labels_spc_rbf_2.append(labels)

    score = nmi_score(labels_true, labels)
    scores_spc_rbf_2.append(score)

    print(score)


scores_spc_rbf_3 = []
labels_spc_rbf_3 = []

for nc in range(4, 8):
    spc = SpectralClustering(
                n_clusters = 5,
                affinity = "rbf",
                n_jobs = 3
            )

    labels = spc.fit_predict(data_mabs)
    labels_spc_rbf_3.append(labels)

    score = nmi_score(labels_true, labels)
    scores_spc_rbf_3.append(score)

    print(score)


scores_spc_rbf_4 = []
labels_spc_rbf_4 = []

for nc in range(4, 8):
    spc = SpectralClustering(
                n_clusters = 5,
                affinity = "rbf",
                n_jobs = 3
            )

    labels = spc.fit_predict(data_pca)
    labels_spc_rbf_4.append(labels)

    score = nmi_score(labels_true, labels)
    scores_spc_rbf_4.append(score)

    print(score)

0.0031275416551190447
0.008745499615900612
0.006413491144638863
0.008137588318863007
0.07143675807501862
0.06227789524103876
0.05194580825319134
0.05536620741763816
0.05904075397399397
0.044361607904779235
0.08000971313978021
0.06951851411896882
0.016274299573831273
0.016274299573831273
0.016274299573831273
0.016274299573831273


In [27]:
results_km = {
    "NC": list(range(4, 8)),
    "No": scores_spc_rbf_1,
    "SS": scores_spc_rbf_3,
    "MABS": scores_spc_rbf_2,
    "PCA": scores_spc_rbf_4
}
pd.DataFrame(results_km)

Unnamed: 0,NC,No,SS,MABS,PCA
0,4,0.003128,0.059041,0.071437,0.016274
1,5,0.008745,0.044362,0.062278,0.016274
2,6,0.006413,0.08001,0.051946,0.016274
3,7,0.008138,0.069519,0.055366,0.016274


### Affinity matrices

In [72]:
# Cosine similarity with no preprocessing
scores_cs = []
labels_cs = []

for nc in range(3, 8):
    spc = SpectralClustering(
                n_clusters = nc,
                affinity = "precomputed",
                n_jobs = 3
            )

    labels = spc.fit_predict(data_cs)
    labels_cs.append(labels)

    score = nmi_score(labels_true, labels)
    scores_cs.append(score)

    print(score)

# Cosine similarity with mabs
scores_cs_mabs = []
labels_cs_mabs = []

for nc in range(3, 8):
    spc = SpectralClustering(
                n_clusters = nc,
                affinity = "precomputed",
                n_jobs = 3
            )

    labels = spc.fit_predict(data_cs_mabs)
    labels_cs_mabs.append(labels)

    score = nmi_score(labels_true, labels)
    scores_cs_mabs.append(score)

    print(score)

0.8375146837574468
0.6979590470729776
0.5798233768079831
0.5535243763949368
0.5334360045930528
0.8655733594390272
0.7324047776072706
0.6151560292533703
0.5632951871016115
0.5540761960426362


In [73]:
results_cs = {
    "CN": list(range(3, 8)),
    "score": scores_cs
}
pd.DataFrame(results_cs)

Unnamed: 0,CN,score
0,3,0.837515
1,4,0.697959
2,5,0.579823
3,6,0.553524
4,7,0.533436


In [75]:
results_cs_mabs = {
    "CN": list(range(3, 8)),
    "score": scores_cs_mabs
}
pd.DataFrame(results_cs_mabs)

Unnamed: 0,CN,score
0,3,0.865573
1,4,0.732405
2,5,0.615156
3,6,0.563295
4,7,0.554076


In [78]:
# Euclidean distance similarity with no preprocessing
scores_es = []
labels_es = []

for nc in range(3, 8):
    spc = SpectralClustering(
                n_clusters = nc,
                affinity = "precomputed",
                n_jobs = 3
            )

    labels = spc.fit_predict(data_es)
    labels_es.append(labels)

    score = nmi_score(labels_true, labels)
    scores_es.append(score)

    print(nmi_score(labels_true, labels))

print()

# Euclidean distance similarity with mabs
scores_es_mabs = []
labels_es_mabs = []

for nc in range(3, 8):
    spc = SpectralClustering(
                n_clusters = nc,
                affinity = "precomputed",
                n_jobs = 3
            )

    labels = spc.fit_predict(data_es_mabs)
    labels_es_mabs.append(labels)

    score = nmi_score(labels_true, labels)
    scores_es_mabs.append(score)

    print(nmi_score(labels_true, labels))

print()

# Euclidean distance similarity with standard scaling
scores_es_ss = []
labels_es_ss = []

for nc in range(3, 8):
    spc = SpectralClustering(
                n_clusters = nc,
                affinity = "precomputed",
                n_jobs = 3
            )

    labels = spc.fit_predict(data_es_ss)
    labels_es_ss.append(labels)

    score = nmi_score(labels_true, labels)
    scores_es_ss.append(score)

    print(nmi_score(labels_true, labels))

0.5823011762355861
0.5701940923694606
0.5782035367443346
0.8847072117644724
0.8477113264457047

0.5823011762355861
0.5731101773665491
0.9026983129728001
0.8835824001007563
0.739370863023881

0.5735824116472192
0.5731101773665491
0.5833118113338115
0.8813397031939731
0.7000086946588405


In [79]:
results_es = {
    "CN": list(range(3, 8)),
    "score": scores_es
}
pd.DataFrame(results_es)

Unnamed: 0,CN,score
0,3,0.582301
1,4,0.570194
2,5,0.578204
3,6,0.884707
4,7,0.847711


In [80]:
results_es_ss = {
    "CN": list(range(3, 8)),
    "score": scores_es_ss
}
pd.DataFrame(results_es_ss)

Unnamed: 0,CN,score
0,3,0.573582
1,4,0.57311
2,5,0.583312
3,6,0.88134
4,7,0.700009


In [81]:
results_es_mabs = {
    "CN": list(range(3, 8)),
    "score": scores_es_mabs
}
pd.DataFrame(results_es_mabs)

Unnamed: 0,CN,score
0,3,0.582301
1,4,0.57311
2,5,0.902698
3,6,0.883582
4,7,0.739371


!!!The best one is the one below!!!

In [54]:
spc = SpectralClustering(
            n_clusters = 5,
            affinity = "precomputed",
            n_jobs = 3
        )

labels = spc.fit_predict(data_es_mabs)
print(nmi_score(labels_true, labels))

max_labels = labels

0.9026983129728001


## Solution

The best method we found was spectral clustering with maximum absolute scaling (-1, 1) with 5 clusters using eucledian distance made into similarity with 1 / (1 + D).

In [55]:
with open("solution2.txt", "w") as solution:
    for l in labels:
        solution.write("{}\n".format(str(l)))