In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

In [2]:
# Every sample lives in a directory "GSM<id>/" and its preprocessed matrix is
# named "<id>_prep_file3.csv"; the paths are generated from the ids so the two
# halves of each path cannot drift apart.
list_of_filepaths = [
    "GSM{0}/{0}_prep_file3.csv".format(sample_id)
    for sample_id in (
        "2699154", "2699155", "2699156", "2699157",
        "3140915", "3140916", "3140917", "3140918",
        "3140919", "3140920", "3195456", "3488509",
        "3852752", "3852753", "3852754", "3852755",
    )
]

# One scatter colour per cluster id (index j -> colour of cluster j).
colors = "red blue green yellow orange black brown".split()

# KMeans

In [3]:
def km(filename, num_of_clus, random_state=None):
    """Cluster the notebook-level ``data`` frame with KMeans and save the results.

    Reads the globals ``data`` (feature matrix), ``labels`` (label table) and
    ``colors`` (plot colours). ``data`` is only read, never modified.

    Side effects:
      * ``labels.index`` is set to the index of ``data`` and the cluster ids
        are stored in ``labels[str(num_of_clus)]``;
      * a scatter plot of the first two columns of
        ``filename[:19] + "tsne_file.csv"``, coloured by cluster and titled
        with the silhouette, Calinski-Harabasz and Davies-Bouldin scores, is
        saved to ``filename[:11] + "KM/" + filename[11:19] + "KM_<k>"``;
      * per-cluster ``describe()`` statistics are written to
        ``filename[:11] + "KM/KM_<k>.csv"``.

    Parameters
    ----------
    filename : str
        Path of the sample's CSV. Output paths are derived by slicing it at
        the fixed offsets 11 and 19, so it has to follow the layout used in
        ``list_of_filepaths``.
    num_of_clus : int
        Number of clusters to fit.
    random_state : int or None, optional
        Forwarded to ``KMeans``; ``None`` keeps the estimator's default seeding.
    """
    # Fit and score on the feature columns only. Writing the cluster ids back
    # into ``data`` leaked them into the quality scores and into every later
    # fit on the same frame, so a stale "labels" column is ignored here and
    # ``data`` itself is left untouched.
    features = data.drop(columns="labels") if "labels" in data.columns else data

    labels.index = features.index
    tsne_data = pd.read_csv(filename[:19] + "tsne_file.csv", index_col=0)

    model = KMeans(n_clusters=num_of_clus, random_state=random_state)
    cluster_ids = model.fit(features).labels_

    sil = silhouette_score(features, cluster_ids)
    ch = calinski_harabasz_score(features, cluster_ids)
    db = davies_bouldin_score(features, cluster_ids)

    tsne_x = tsne_data.iloc[:, 0].to_numpy()
    tsne_y = tsne_data.iloc[:, 1].to_numpy()
    fig, ax = plt.subplots(figsize=(20, 15))
    for j in range(num_of_clus):
        in_cluster = cluster_ids == j
        # Cycle through the palette so more clusters than colours cannot raise.
        ax.scatter(tsne_x[in_cluster], tsne_y[in_cluster], c=colors[j % len(colors)])
    ax.set_title('KMeans: ' + str(num_of_clus) + ' Silhouette Score:' + str(sil) +
                 ', CH Score: ' + str(ch) +
                 ', DB Score: ' + str(db))
    fig.savefig(filename[:11] + "KM/" + filename[11:19] + "KM_" + str(num_of_clus))
    # Render inline, then release the figure: this function runs once per
    # sample and per k, and open figures would otherwise pile up in memory.
    plt.show()
    plt.close(fig)

    labels[str(num_of_clus)] = cluster_ids

    # Only columns with at least one positive value are summarised. The mask
    # is computed once instead of dropping columns one by one from the frame.
    informative = (features.max(axis=0) > 0).to_numpy()
    per_cluster = []
    for j in range(num_of_clus):
        # The constant "labels" column keeps the CSV layout of earlier runs.
        members = features.loc[cluster_ids == j, informative].assign(labels=j)
        stats = members.describe()
        stats.index = [str(j) + ".cluster_" + name for name in stats.index]
        per_cluster.append(stats)
    # DataFrame.append was removed in pandas 2.0; concatenate once and write once.
    pd.concat(per_cluster).to_csv(filename[:11] + "KM/KM_" + str(num_of_clus) + ".csv")

In [None]:
# Run KMeans for k = 2..7 on every sample and save one label column per k.
for sample_path in list_of_filepaths:
    # km() picks up `data` and `labels` as notebook-level globals, so both
    # have to be rebound here before the sample is clustered.
    data = pd.read_csv(sample_path, index_col=0)
    labels = pd.DataFrame(index=data.index,
                          columns=[str(k) for k in range(2, 8)])
    for k in range(2, 8):
        km(sample_path, k)
    labels.to_csv(sample_path[:11] + "KM/" + sample_path[11:19] + "labels_KM.csv")

## Spectral clustering

In [5]:
def spec(filename, num_of_clus, random_state=None):
    """Cluster the notebook-level ``data`` frame with spectral clustering and save the results.

    Uses ``SpectralClustering`` with ``affinity='nearest_neighbors'``. Reads
    the globals ``data`` (feature matrix), ``labels`` (label table) and
    ``colors`` (plot colours). ``data`` is only read, never modified.

    Side effects:
      * the cluster ids are stored in ``labels[str(num_of_clus)]``;
      * a scatter plot of the first two columns of
        ``filename[:19] + "tsne_file.csv"``, coloured by cluster and titled
        with the silhouette, Calinski-Harabasz and Davies-Bouldin scores, is
        saved to ``filename[:11] + "Spec/" + filename[11:19] + "Spec_<k>"``;
      * per-cluster ``describe()`` statistics are written to
        ``filename[:11] + "Spec/Spec_<k>.csv"``.

    Parameters
    ----------
    filename : str
        Path of the sample's CSV. Output paths are derived by slicing it at
        the fixed offsets 11 and 19, so it has to follow the layout used in
        ``list_of_filepaths``.
    num_of_clus : int
        Number of clusters to fit.
    random_state : int or None, optional
        Forwarded to ``SpectralClustering``; ``None`` keeps its default seeding.
    """
    # Fit and score on the feature columns only. Writing the cluster ids back
    # into ``data`` leaked them into the quality scores and into every later
    # fit on the same frame, so a stale "labels" column is ignored here and
    # ``data`` itself is left untouched.
    features = data.drop(columns="labels") if "labels" in data.columns else data

    model = SpectralClustering(n_clusters=num_of_clus, affinity='nearest_neighbors',
                               random_state=random_state)
    tsne_data = pd.read_csv(filename[:19] + "tsne_file.csv", index_col=0)

    cluster_ids = model.fit(features.values).labels_

    sil = silhouette_score(features, cluster_ids)
    ch = calinski_harabasz_score(features, cluster_ids)
    db = davies_bouldin_score(features, cluster_ids)

    tsne_x = tsne_data.iloc[:, 0].to_numpy()
    tsne_y = tsne_data.iloc[:, 1].to_numpy()
    fig, ax = plt.subplots(figsize=(20, 15))
    for j in range(num_of_clus):
        in_cluster = cluster_ids == j
        # Cycle through the palette so more clusters than colours cannot raise.
        ax.scatter(tsne_x[in_cluster], tsne_y[in_cluster], c=colors[j % len(colors)])
    ax.set_title('Spectral clus: ' + str(num_of_clus) + ' Silhouette Score:' + str(sil) +
                 ', CH Score: ' + str(ch) +
                 ', DB Score: ' + str(db))
    fig.savefig(filename[:11] + "Spec/" + filename[11:19] + "Spec_" + str(num_of_clus))
    # Render inline, then release the figure: this function runs once per
    # sample and per k, and open figures would otherwise pile up in memory.
    plt.show()
    plt.close(fig)

    labels[str(num_of_clus)] = cluster_ids

    # Only columns with at least one positive value are summarised. The mask
    # is computed once instead of dropping columns one by one from the frame.
    informative = (features.max(axis=0) > 0).to_numpy()
    per_cluster = []
    for j in range(num_of_clus):
        # The constant "labels" column keeps the CSV layout of earlier runs.
        members = features.loc[cluster_ids == j, informative].assign(labels=j)
        stats = members.describe()
        stats.index = [str(j) + ".cluster_" + name for name in stats.index]
        per_cluster.append(stats)
    # DataFrame.append was removed in pandas 2.0; concatenate once and write once.
    pd.concat(per_cluster).to_csv(filename[:11] + "Spec/Spec_" + str(num_of_clus) + ".csv")

In [None]:
# Run spectral clustering for k = 2..7 on every sample and save one label column per k.
for sample_path in list_of_filepaths:
    # spec() picks up `data` and `labels` as notebook-level globals, so both
    # have to be rebound here before the sample is clustered.
    data = pd.read_csv(sample_path, index_col=0)
    labels = pd.DataFrame(index=data.index,
                          columns=[str(k) for k in range(2, 8)])
    for k in range(2, 8):
        spec(sample_path, k)
    labels.to_csv(sample_path[:11] + "Spec/" + sample_path[11:19] + "labels_spec.csv")

# Po grupama (by groups)

In [9]:
# Preprocessed matrices of the sample groups, in the order they are processed
# (1, 3, 4, 2).
groups = ["grupe/{}.grupa/prep_group.csv".format(group_no)
          for group_no in (1, 3, 4, 2)]

In [10]:
# KMeans for k = 2..5 on every group: saves a t-SNE scatter plot and
# per-cluster describe() statistics per k, plus one label table per group.
for filename in groups:
    # Each group is loaded once; nothing below modifies `data`, so the same
    # frame can be reused for every k.
    data = pd.read_csv(filename)
    data = data.set_index("Unnamed: 0")
    data.index.name = None
    tsne_data = pd.read_csv(filename[:14] + "tsne_file.csv")
    tsne_data = tsne_data.drop("Unnamed: 0", axis=1)
    tsne_x = tsne_data.iloc[:, 0].to_numpy()
    tsne_y = tsne_data.iloc[:, 1].to_numpy()

    # Fresh label table per group, indexed like the group's data. Reusing the
    # `labels` frame left over from an earlier cell carried over another
    # sample's index and columns.
    labels = pd.DataFrame(index=data.index)

    # Only columns with at least one positive value are summarised. The mask
    # is computed once instead of dropping columns one by one from the frame.
    informative = (data.max(axis=0) > 0).to_numpy()

    for num_of_clus in range(2, 6):
        model = KMeans(n_clusters=num_of_clus)
        cluster_ids = model.fit(data).labels_

        # Score on the features only; appending the labels to `data` first
        # made the cluster ids part of the scored matrix.
        sil = silhouette_score(data, cluster_ids)
        ch = calinski_harabasz_score(data, cluster_ids)
        db = davies_bouldin_score(data, cluster_ids)

        fig, ax = plt.subplots(figsize=(20, 15))
        for j in range(num_of_clus):
            in_cluster = cluster_ids == j
            ax.scatter(tsne_x[in_cluster], tsne_y[in_cluster], c=colors[j % len(colors)])
        ax.set_title('KMeans: ' + str(num_of_clus) + ' Silhouette Score:' + str(sil) +
                     ', CH Score: ' + str(ch) +
                     ', DB Score: ' + str(db))
        # Save before showing, then release the figure so that figures do not
        # accumulate in memory across groups and cluster counts.
        fig.savefig(filename[:14] + "KM/KM_" + str(num_of_clus))
        plt.show()
        plt.close(fig)

        # Assign the raw array: a Series would be aligned on its index, which
        # does not match the sample index of `labels`.
        labels["KM_" + str(num_of_clus)] = cluster_ids

        per_cluster = []
        for j in range(num_of_clus):
            # The constant "labels" column keeps the CSV layout of earlier runs.
            members = data.loc[cluster_ids == j, informative].assign(labels=j)
            stats = members.describe()
            stats.index = [str(j) + ".cluster_" + name for name in stats.index]
            per_cluster.append(stats)
        # DataFrame.append was removed in pandas 2.0; concatenate once and write once.
        pd.concat(per_cluster).to_csv(filename[:14] + "KM/KM_" + str(num_of_clus) + ".csv")

    # NOTE(review): these are KMeans labels but the file is called
    # "labels_spec.csv"; the name is kept so existing readers of that path
    # keep working. Confirm whether it should be renamed.
    labels.to_csv(filename[:14] + "KM/labels_spec.csv")

MemoryError: Unable to allocate 1.31 GiB for an array with shape (14435, 12208) and data type float64