In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, MeanShift, MiniBatchKMeans, Birch, AffinityPropagation, AgglomerativeClustering
from sklearn import metrics
import numpy as np

In [3]:
def get_performance_clustering(data, labels):
    """Score a clustering with three internal validation indices.

    Parameters
    ----------
    data : feature matrix handed unchanged to the scikit-learn scorers.
    labels : cluster label assigned to each row of ``data``.

    Returns
    -------
    tuple
        ``(silhouette, calinski_harabasz, davies_bouldin)`` in that order.
        The silhouette coefficient is computed with the Euclidean metric.
    """
    return (
        metrics.silhouette_score(data, labels, metric='euclidean'),
        metrics.calinski_harabasz_score(data, labels),
        metrics.davies_bouldin_score(data, labels),
    )

In [23]:
# Lookup tables keyed by integer position. The clustering sweep below indexes
# them with its loop counters to build directory and file names.
dataset = dict(enumerate([
    "absorption",
    "enantioselectivity",
    "localization",
    "T50",
]))

method = dict(enumerate(["FFT", "NLP", "Properties"]))

bioembedding = dict(enumerate([
    "bepler",
    "esm",
    "fasttext",
    "plus_rnn",
    "prottrans",
]))

# Distance names, numbered from 1 (not referenced by the cells visible here).
distances = dict(enumerate([
    "Euclidean",
    "Braycurtis",
    "Canberra",
    "Chebyshev",
    "Cityblock",
    "Correlation",
    "Cosine",
    "Minkowski",
    "Hamming",
], start=1))

# Accumulates one "<dataset>/<method>/<encoding> dio <strategy>" phrase per
# clustering run; it is seeded with a single empty string.
resultados = [""]

In [24]:
# ---------------------------------------------------------------------------
# Clustering sweep
#
# For every dataset / encoding method / encoding variant, each clustering
# algorithm is fitted for every candidate number of clusters, the validation
# scores are written to ``results_<algorithm>.csv`` and the labels of the best
# clustering are written to ``unsupervised_clustering_sequences_<algorithm>.csv``.
# ---------------------------------------------------------------------------

# Root directory that holds one sub-directory per dataset.
RESULTS_ROOT = "../../results_demo/"

# Candidate numbers of clusters explored for every algorithm (2 .. 29).
K_VALUES = range(2, 30)

# Number of Group_<i> sub-directories processed for the group-based encodings.
N_GROUPS = 8

# Birch sub-cluster threshold used while scoring the candidate clusterings.
BIRCH_THRESHOLD = 0.006

# Input files of the "absorption" dataset are read with the spelling
# "absortion" in their file name; every other dataset uses its own name.
INPUT_SUFFIX_OVERRIDES = {"absorption": "absortion"}

# File-name stem of the input CSV for each group-based encoding method.
GROUP_FILE_STEMS = {"FFT": "fft", "Properties": "physicochemical"}

# (output file key, strategy-name prefix, factory building an estimator for k)
CLUSTERING_ALGORITHMS = (
    ("kmeans", "k-means", lambda k: KMeans(n_clusters=k, random_state=0)),
    ("birch", "birch", lambda k: Birch(n_clusters=k, threshold=BIRCH_THRESHOLD)),
    ("aglomerative", "aglomerative", lambda k: AgglomerativeClustering(n_clusters=k)),
)


def _iter_encodings(method_name):
    """Yield ``(subdir, file_stem, label)`` for each encoding variant of a method.

    ``subdir`` is the directory below ``<dataset>/<method>/``, ``file_stem`` is
    the beginning of the input CSV name and ``label`` is the last component of
    the phrase appended to ``resultados`` (note: ``Group<i>`` without the
    underscore, whereas the directory is ``Group_<i>``).
    """
    if method_name in GROUP_FILE_STEMS:
        stem = GROUP_FILE_STEMS[method_name]
        for group in range(N_GROUPS):
            yield ("Group_" + str(group),
                   stem + "-Group_" + str(group),
                   "Group" + str(group))
    elif method_name == "NLP":
        for key in sorted(bioembedding):
            name = bioembedding[key]
            yield name, name, name
    else:
        raise ValueError("Unknown encoding method: {}".format(method_name))


def _cluster_and_save(features, id_columns, out_dir, label):
    """Explore every algorithm over ``K_VALUES`` and persist the outcome.

    Parameters
    ----------
    features : DataFrame with the numeric descriptors to cluster.
    id_columns : DataFrame with the ``id`` and ``target`` columns; a ``labels``
        column is (re)assigned in place for each algorithm before saving.
    out_dir : directory (without trailing slash) receiving the CSV files.
    label : ``<dataset>/<method>/<encoding>`` prefix of the phrase appended to
        ``resultados``.

    Notes
    -----
    The best strategy is the first one reaching the maximum silhouette score,
    i.e. the smallest ``k`` on ties. The previous code also collected the rows
    with the maximum Calinski-Harabasz score, but they were concatenated after
    the silhouette rows and only the first strategy was used, so they never
    influenced the choice.

    The saved labels are the ones produced by the exploratory fit that was
    scored. KMeans (fixed ``random_state``) and AgglomerativeClustering are
    refit-stable, so their output is the same as refitting. For Birch the
    previous code refitted with ``threshold=0.1`` although the scores had been
    computed with ``threshold=0.006``; the saved labels therefore did not
    correspond to the reported scores. They now do.
    """
    for file_key, prefix, make_estimator in CLUSTERING_ALGORITHMS:
        rows = []
        labels_by_strategy = {}
        for k in K_VALUES:
            estimator = make_estimator(k)
            estimator.fit(features)
            siluetas, calinski, davies = get_performance_clustering(features, estimator.labels_)
            strategy = "{}-{}".format(prefix, k)
            rows.append([strategy, siluetas, calinski, davies])
            labels_by_strategy[strategy] = estimator.labels_

        df_explore = pd.DataFrame(rows, columns=['strategy', 'siluetas', 'calinski', 'davies'])
        df_explore.to_csv(out_dir + "/results_" + file_key + ".csv")

        # idxmax returns the first row holding the maximum silhouette score.
        best_strategy = df_explore.loc[df_explore['siluetas'].idxmax(), 'strategy']
        resultados.append(label + " dio " + best_strategy)

        id_columns['labels'] = labels_by_strategy[best_strategy]
        id_columns.to_csv(out_dir + "/unsupervised_clustering_sequences_" + file_key + ".csv")


for a in sorted(dataset):
    dataset_name = dataset[a]
    input_suffix = INPUT_SUFFIX_OVERRIDES.get(dataset_name, dataset_name)
    for b in sorted(method):
        method_name = method[b]
        for subdir, file_stem, encoding_label in _iter_encodings(method_name):
            out_dir = RESULTS_ROOT + dataset_name + "/" + method_name + "/" + subdir
            df_raw = pd.read_csv(out_dir + "/" + file_stem + "-" + input_suffix + ".csv")

            # Identifier/response columns are kept aside and excluded from clustering.
            ignore_columns = df_raw[['id', 'target']].copy()
            df_data = df_raw.drop(columns=['id', 'target'])

            _cluster_and_save(
                df_data,
                ignore_columns,
                out_dir,
                dataset_name + "/" + method_name + "/" + encoding_label,
            )




In [25]:
# Display the collected "<dataset>/<method>/<encoding> dio <strategy>" phrases;
# the first element is the empty string the list was initialised with.
resultados

['',
 'absorption/FFT/Group0 dio k-means-2',
 'absorption/FFT/Group0 dio birch-2',
 'absorption/FFT/Group0 dio aglomerative-2',
 'absorption/FFT/Group1 dio k-means-2',
 'absorption/FFT/Group1 dio birch-2',
 'absorption/FFT/Group1 dio aglomerative-2',
 'absorption/FFT/Group2 dio k-means-2',
 'absorption/FFT/Group2 dio birch-8',
 'absorption/FFT/Group2 dio aglomerative-8',
 'absorption/FFT/Group3 dio k-means-28',
 'absorption/FFT/Group3 dio birch-29',
 'absorption/FFT/Group3 dio aglomerative-29',
 'absorption/FFT/Group4 dio k-means-3',
 'absorption/FFT/Group4 dio birch-2',
 'absorption/FFT/Group4 dio aglomerative-2',
 'absorption/FFT/Group5 dio k-means-2',
 'absorption/FFT/Group5 dio birch-2',
 'absorption/FFT/Group5 dio aglomerative-2',
 'absorption/FFT/Group6 dio k-means-2',
 'absorption/FFT/Group6 dio birch-2',
 'absorption/FFT/Group6 dio aglomerative-2',
 'absorption/FFT/Group7 dio k-means-2',
 'absorption/FFT/Group7 dio birch-2',
 'absorption/FFT/Group7 dio aglomerative-2',
 'absorp