# Klasterovanje po grupama

In [None]:
import numpy as np
import pandas as pd

# Each output directory mapped to the preprocessed sample files that make up
# that group.  Insertion order is significant: the two lists derived below
# are index-aligned and the rest of the notebook walks them in parallel.
_GROUP_SAMPLES = {
    "grupe/14.dan_1/": [
        "GSM2699154/2699154_prep_file3.csv",
        "GSM2699155/2699155_prep_file3.csv",
        "GSM3140916/3140916_prep_file3.csv",
    ],
    "grupe/17.dan/": [
        "GSM2699157/2699157_prep_file3.csv",
        "GSM3140917/3140917_prep_file3.csv",
        "GSM3140918/3140918_prep_file3.csv",
    ],
    "grupe/14.dan_2/": [
        "GSM3195456/3195456_prep_file3.csv",
        "GSM3488509/3488509_prep_file3.csv",
    ],
    "grupe/mesavina1/": [
        "GSM3852752/3852752_prep_file3.csv",
        "GSM3852753/3852753_prep_file3.csv",
        "GSM3852754/3852754_prep_file3.csv",
        "GSM3852755/3852755_prep_file3.csv",
    ],
    "grupe/mesavina2/": [
        "GSM3140915/3140915_prep_file3.csv",
        "GSM3140916/3140916_prep_file3.csv",
        "GSM3140917/3140917_prep_file3.csv",
        "GSM3140918/3140918_prep_file3.csv",
    ],
}

# group_files[i] lists the input CSVs of the i-th group ...
group_files = [list(samples) for samples in _GROUP_SAMPLES.values()]

# ... and group_filepaths[i] is the directory that group's outputs go to.
group_filepaths = list(_GROUP_SAMPLES)

## Pravljenje fajlova

In [None]:
# Build one combined table per group: read every sample CSV of the group
# (first column as the index), stack the rows, and write the result to
# "<group directory>group.csv".
for sample_files, out_dir in zip(group_files, group_filepaths):
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames
    # the same way (row-wise, union of columns) and builds the result once
    # instead of re-copying the accumulated frame for every file.
    df = pd.concat([pd.read_csv(f, index_col=0) for f in sample_files])
    df.to_csv(out_dir + "group.csv")

## Pravljenje tsne

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# For every group: project the combined table with PCA, keep the leading
# components that explain at least 95% of the variance, embed those with
# t-SNE and store the embedding (indexed like the input rows) in
# "<group directory>tsne_file.csv".
for out_dir in group_filepaths:
    df = pd.read_csv(out_dir + "group.csv", index_col=0)
    pca = PCA()
    data_pca = pd.DataFrame(pca.fit_transform(df))

    # Fall back to every component when none of the candidates below reaches
    # the threshold.  Without this default the component count was left
    # undefined for the first group (NameError) or silently carried over
    # from the previous group.
    n_components = data_pca.shape[1]
    # Candidate counts are tried on a coarse grid: 1, 51, 101, ...
    for candidate in range(1, len(df.columns), 50):
        if pca.explained_variance_ratio_[:candidate].sum() >= 0.95:
            n_components = candidate
            break

    # NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
    # `max_iter` -- confirm against the installed version before upgrading.
    embedding = TSNE(n_iter=2000).fit_transform(data_pca.iloc[:, :n_components])
    tsne_data = pd.DataFrame(embedding, index=df.index)
    tsne_data.to_csv(out_dir + "tsne_file.csv")

## Klasterovanje

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


"""ocene_ss = pd.DataFrame(index = list(map(lambda x : x[6:-1], group_filepaths)),
                     columns = ["2", "3", "4", "5", "6", "7"])
ocene_db = pd.DataFrame(index = list(map(lambda x : x[6:-1], group_filepaths)),
                     columns = ["2", "3", "4", "5", "6", "7"])
ocene_ch = pd.DataFrame(index = list(map(lambda x : x[6:-1], group_filepaths)),
                     columns = ["2", "3", "4", "5", "6", "7"])"""

def _load_score_table(path, group_names, columns):
    """Return the score table stored at *path*, with a row for every group.

    The CSV is read with its first column as the index.  Groups from
    *group_names* that are not in the table yet get an empty row appended;
    groups that are already present keep their single row, so re-running the
    notebook overwrites their scores instead of piling up duplicate rows.
    If the file does not exist, a fresh empty table with *group_names* as
    rows and *columns* as columns is returned.
    """
    try:
        table = pd.read_csv(path, index_col=0)
    except FileNotFoundError:
        # First run: nothing has been saved yet, start from an empty table.
        return pd.DataFrame(index=group_names, columns=columns)

    missing = [name for name in group_names if name not in table.index]
    if not missing:
        return table
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement for stacking the placeholder rows under the loaded table.
    return pd.concat([table, pd.DataFrame(index=missing, columns=columns)])


# Row labels of the score tables: the group directory without its leading
# "grupe/" (6 characters) and without the trailing "/".
_group_names = [path[6:-1] for path in group_filepaths]
# One column per tested number of clusters.
_cluster_columns = ["2", "3", "4", "5", "6", "7"]

# K-means score tables: silhouette (ss), Davies-Bouldin (db) and
# Calinski-Harabasz (ch).
ocene_ss = _load_score_table("Ocene/km_grupe_ss.csv", _group_names, _cluster_columns)
ocene_db = _load_score_table("Ocene/km_grupe_db.csv", _group_names, _cluster_columns)
ocene_ch = _load_score_table("Ocene/km_grupe_ch.csv", _group_names, _cluster_columns)

# When clustering with SpectralClustering, load the spectral tables instead:
#ocene_ss = _load_score_table("Ocene/spec_grupe_ss.csv", _group_names, _cluster_columns)
#ocene_db = _load_score_table("Ocene/spec_grupe_db.csv", _group_names, _cluster_columns)
#ocene_ch = _load_score_table("Ocene/spec_grupe_ch.csv", _group_names, _cluster_columns)

# Scatter colour for each cluster label; 7 entries cover the largest
# cluster count that is tried.
colors = ["red", "blue", "green", "yellow", "orange", "black", "purple"]

# Cluster every group with K-means for 2..7 clusters.  For each cluster count
# the clusters are drawn on the group's t-SNE embedding, the three quality
# scores are written into the score tables, and the figure is saved; the
# per-row cluster labels of all counts are saved once per group.
for out_dir in group_filepaths:
    # Row label in the score tables: directory without "grupe/" and the
    # trailing "/".
    group_name = out_dir[6:-1]

    df = pd.read_csv(out_dir + "group.csv", index_col=0)
    tsne_data = pd.read_csv(out_dir + "tsne_file.csv", index_col=0)
    x_col = tsne_data.columns[0]
    y_col = tsne_data.columns[1]

    labels = pd.DataFrame(index=df.index, columns=["2", "3", "4", "5", "6", "7"])
    for num_of_clus in range(2, 8):
        column = str(num_of_clus)

        model = KMeans(num_of_clus)
        model.fit(df)
        #model = SpectralClustering(num_of_clus, affinity='nearest_neighbors')
        #model.fit(df.values)

        fig = plt.figure(figsize=(20, 15))
        for j in range(num_of_clus):
            # Select the rows of cluster j once and reuse them for both axes.
            members = tsne_data.loc[model.labels_ == j]
            plt.scatter(members[x_col], members[y_col], c=colors[j])

        # Scores are computed on the clustered data itself, not on the
        # t-SNE embedding that is only used for drawing.
        ss = silhouette_score(df, model.labels_)
        db = davies_bouldin_score(df, model.labels_)
        ch = calinski_harabasz_score(df, model.labels_)
        ocene_ss.at[group_name, column] = ss
        ocene_db.at[group_name, column] = db
        ocene_ch.at[group_name, column] = ch

        plt.title(f"KM: {column} Silhouette Score:{ss}, CH Score: {ch}, DB Score: {db}")
        #plt.title(f"Spektralno: {column} Silhouette Score:{ss}, CH Score: {ch}, DB Score: {db}")

        # Save before showing, then close: otherwise every one of the large
        # figures stays alive until the whole run has finished.
        fig.savefig(out_dir + "KM/KM_" + column)
        #fig.savefig(out_dir + "Spec/Spec_" + column)
        plt.show()
        plt.close(fig)

        labels[column] = model.labels_

    labels.to_csv(out_dir + "KM/labels.csv")
    #labels.to_csv(out_dir + "Spec/labels.csv")
    
# Write the three K-means score tables back to their CSV files.
for score_table, target in (
    (ocene_ss, "Ocene/km_grupe_ss.csv"),
    (ocene_db, "Ocene/km_grupe_db.csv"),
    (ocene_ch, "Ocene/km_grupe_ch.csv"),
):
    score_table.to_csv(target)

# Targets to use instead when the scores come from spectral clustering:
#ocene_ss.to_csv("Ocene/spec_grupe_ss.csv")
#ocene_db.to_csv("Ocene/spec_grupe_db.csv")
#ocene_ch.to_csv("Ocene/spec_grupe_ch.csv")