In [1]:
# load train and test data 
import pandas as pd

train_data_path = 'data/Train/'
trainX = pd.read_csv(train_data_path + 'X_train.txt',sep=" ", header=None).values
trainY = pd.read_csv(train_data_path + 'y_train.txt', header=None).values.ravel()

In [2]:
import matplotlib.pyplot as plt

def plot(name, t, x):
    plt.plot(t, x)
    plt.legend([name], loc='upper left')
    plt.show() 

In [16]:
from sklearn.cluster import KMeans
import numpy as np

min_mse = np.inf
min_cluster = None
repeat_number = 10
for i in range(repeat_number):
    kmeans = KMeans(init='k-means++', n_clusters=12).fit(trainX)
    mse = kmeans.inertia_ / trainX.shape[0]
    if mse <  min_mse :
        min_mse = mse
        min_cluster = kmeans
        
print('Minimum of sum of squared distances of samples to their closest cluster center is: ', min_mse)

Minimum of sum of squared distances of samples to their closest cluster center is:  15.733299178365757


In [6]:
from sklearn import metrics
rand_index = metrics.adjusted_rand_score(trainY, min_cluster.labels_)
print('Rand index is: ', rand_index)
homogeneity =  metrics.homogeneity_score(trainY, min_cluster.labels_)
print('Homogeneity is: ', homogeneity)
completeness = metrics.completeness_score(trainY, min_cluster.labels_)
print('Completeness is: ', completeness)

Rand index is:  0.3941281458089616
Homogeneity is:  0.6657748017425962
Completeness is:  0.5499334830123643


In [14]:
from sklearn.mixture import GaussianMixture
import numpy as np

min_mse = np.inf
min_cluster = None
repeat_number = 10
for i in range(repeat_number):
    gmm = GaussianMixture(n_components=12,covariance_type='diag').fit(trainX)
    cov = gmm.covariances_
    mse = np.sum(np.diag(cov))
    if mse <  min_mse :
        min_mse = mse
        min_cluster = gmm
        
print('Minimum of sum of squared distances of samples to their closest cluster center is: ', min_mse)

Minimum of sum of squared distances of samples to their closest cluster center is:  0.17111098772918534


In [15]:
pred_y = min_cluster.predict(trainX)
from sklearn import metrics
rand_index = metrics.adjusted_rand_score(trainY, pred_y)
print('Rand index is: ', rand_index)
homogeneity =  metrics.homogeneity_score(trainY, pred_y)
print('Homogeneity is: ', homogeneity)
completeness = metrics.completeness_score(trainY, pred_y)
print('Completeness is: ', completeness)

Rand index is:  0.2577252909384135
Homogeneity is:  0.5008801501528031
Completeness is:  0.42589980001056144


In [10]:
from sklearn.decomposition import PCA
model = PCA(n_components=20).fit(trainX)
new_trainX = model.transform(trainX)

In [11]:
from sklearn.cluster import KMeans
import numpy as np

min_mse = np.inf
min_cluster = None
repeat_number = 10
for i in range(repeat_number):
    kmeans = KMeans(init='k-means++', n_clusters=12).fit(new_trainX)
    mse = kmeans.inertia_ / new_trainX.shape[0]
    if mse <  min_mse :
        min_mse = mse
        min_cluster = kmeans
        
print('Minimum of sum of squared distances of samples to their closest cluster center is: ', min_mse)

Minimum of sum of squared distances of samples to their closest cluster center is:  7.833164756628481


In [12]:
from sklearn import metrics
rand_index = metrics.adjusted_rand_score(trainY, min_cluster.labels_)
print('Rand index is: ', rand_index)
homogeneity =  metrics.homogeneity_score(trainY, min_cluster.labels_)
print('Homogeneity is: ', homogeneity)
completeness = metrics.completeness_score(trainY, min_cluster.labels_)
print('Completeness is: ', completeness)

Rand index is:  0.3927873242960771
Homogeneity is:  0.6654843515647444
Completeness is:  0.5502550626931366


In [7]:
from sklearn.mixture import GaussianMixture
import numpy as np

min_mse = np.inf
min_cluster = None
repeat_number = 10
for i in range(repeat_number):
    gmm = GaussianMixture(n_components=12,covariance_type='diag').fit(new_trainX)
    cov = gmm.covariances_
    mse = np.sum(np.diag(cov))
    if mse <  min_mse :
        min_mse = mse
        min_cluster = gmm
        
print('Minimum of sum of squared distances of samples to their closest cluster center is: ', min_mse)

Minimum of sum of squared distances of samples to their closest cluster center is:  5.370258281060223


In [9]:
pred_y = min_cluster.predict(new_trainX)
from sklearn import metrics
rand_index = metrics.adjusted_rand_score(trainY, pred_y)
print('Rand index is: ', rand_index)
homogeneity =  metrics.homogeneity_score(trainY, pred_y)
print('Homogeneity is: ', homogeneity)
completeness = metrics.completeness_score(trainY, pred_y)
print('Completeness is: ', completeness)

Rand index is:  0.40766806042877374
Homogeneity is:  0.6803494351390976
Completeness is:  0.5520642983756733


<div style="direction:rtl;line-height:300%;">
    <font face="XB Zar" size=3>
        با اعمال کاهش ابعاد MSE بیشتر شده است چرا که یک سری اطلاعات از دست رفته و باعث افزایش خطا شده است.
        ولی با این وجود rand index و homogenity و completeness نیز بیشتر شده است که نشان می‌دهد با این که فاصله تا مراکز بیشتر شده 
        ولی نسبت به برچسب های داده شده بهتر دسته بندی کرده است که این یعنی از ویژگی های مفید تر برای خوشه بندی استفاده کرده و برچسب های با کمترین اثر را دور ریخته است.
    </font>
</div>