In [137]:
# ML for clustering
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, OPTICS, SpectralClustering, MeanShift, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.mixture import GaussianMixture

# Function to prepend a string to each row
def prepend_string(row, prefix):
    return prefix + str(row)


def kmeans_AL(df):
    # Create a copy of the original DataFrame
    data = df.copy()
    kmeans = KMeans(n_clusters=3, n_init=10)
    kmeans.fit(data)
    labels = kmeans.labels_
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    # Get the cluster centers
    centers = kmeans.cluster_centers_
    num_clusters = len(np.unique(labels))
    return data, labels, np.unique(labels), num_clusters

def agglomerative_clustering_AL(df):
    # Create a copy of the original DataFrame
    data = df.copy()
    hierarchical = AgglomerativeClustering(n_clusters=3)
    labels = hierarchical.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    num_clusters = len(np.unique(labels))
    return data, labels, np.unique(labels), num_clusters

def DBSCAN_AL(df):
    # Create a copy of the original DataFrame
    data = df.copy()
    dbscan = DBSCAN(eps=0.3, min_samples=5)
    labels = dbscan.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    num_clusters = len(np.unique(labels))
    return data, labels, np.unique(labels), num_clusters

def mean_shift(df):
    # Create a copy of the original DataFrame
    data = df.copy()
    meanshift = MeanShift()
    labels = meanshift.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    num_clusters = len(np.unique(labels))
    return data, labels, np.unique(labels), num_clusters
    
def gaussian_AL(df):
    # Create a copy of the original DataFrame
    data = df.copy()
    gmm = GaussianMixture(n_components=3)
    labels = gmm.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    num_clusters = len(np.unique(labels))
    return data, labels, np.unique(labels), num_clusters

def spectral_AL(df):
    # Create a copy of the original DataFrame
    data = df.copy()
    spectral = SpectralClustering(n_clusters=3)
    labels = spectral.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    num_clusters = len(np.unique(labels))
    return data, labels, np.unique(labels), num_clusters

def optics_AL(df):
    # Create a copy of the original DataFrame
    data = df.copy()
    optics = OPTICS(min_samples=5, xi=0.05)
    labels = optics.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    num_clusters = len(np.unique(labels))
    return data, labels, np.unique(labels), num_clusters




def create_dir(directory_path):
    import os
    # Check if the directory exists, and if not, create it
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
        print(f"Directory '{directory_path}' created successfully.")
    else:
        print(f"Directory '{directory_path}' already exists.")


In [149]:
fastICA_minmax = pd.read_csv("./dimension_reduction_datasets/minmax_data_FastICA/FastICAminmax_data.csv", low_memory=False)
fastICA_maxabs = pd.read_csv("./dimension_reduction_datasets/maxabs_data_FastICA/FastICAmaxabs_data.csv", low_memory=False)
fastICA_robust = pd.read_csv("./dimension_reduction_datasets/robust_data_FastICA/FastICArobust_data.csv", low_memory=False)
fastICA_standard = pd.read_csv("./dimension_reduction_datasets/standard_scaler_FastICA/FastICAstandard_scaler.csv", low_memory=False)

In [150]:

datasets = {
    "fastICA_minmax": fastICA_minmax,
    "fastICA_maxabs": fastICA_maxabs,
    "fastICA_robust": fastICA_robust,
    "fastICA_standard": fastICA_standard,
}


In [154]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import random
import altair as alt

# Initialize ML techniques
ML_techniques = {
    'kmeans_AL': 'kmeans_AL',
    'DBSCAN_AL': 'DBSCAN_AL',
    'mean_shift': 'mean_shift',
    'gaussian_AL': 'gaussian_AL',
    'optics_AL': 'optics_AL',
    'agglomerative_clustering_AL': 'agglomerative_clustering_AL',
}

charts = []

for i, dataset in enumerate(datasets):
    for technique_name, technique in ML_techniques.items():
        create_dir("cluster_charts/" + dataset+"_"+technique_name+"/charts/")
        create_dir("cluster_datasets/" + dataset+"_"+technique_name+"/datasets/")
        clustered_data, labels, comp, compp = eval(technique)(datasets[dataset])
        print(compp)
        # Define specific colors for each label
        color_tag = ['#93C4F6', '#005EB8', '#D9DE84', '#636B05']
        if(compp > 1):
            color_scale = alt.Scale(domain=np.unique(clustereds_data["labels_tagged"]), range=color_tag)
            clustered_data.to_csv("cluster_datasets/" + dataset+"_"+technique_name+"/datasets/"+technique_name + '_clustering.csv')
            # Plot the data points and cluster centers
            clustered_data
            
            # Evaluate clustering using different metrics
            silhouette = silhouette_score(clustered_data[["PCA1", "PCA2", "labels"]], labels)
            db_index = davies_bouldin_score(clustered_data[["PCA1", "PCA2", "labels"]], labels)
            ch_score = calinski_harabasz_score(clustered_data[["PCA1", "PCA2", "labels"]], labels)
            
            chart = alt.Chart(clustered_data).mark_point().encode(
                x='PCA1:Q',
                y='PCA2:Q',
                color=alt.Color('labels_tagged:N', scale=color_scale, legend=alt.Legend(title="Clusters")),  # Use the defined color scale
                tooltip=['PCA1', 'PCA2', 'labels_tagged'],  # Add tooltip information
                # text="{'silhouette':" + str(silhouette) + ", 'davies_bouldin_score':" + str(db_index) + ", 'calinski_harabasz_score':" + str(ch_score) + "}"
            ).properties(title=f'{technique_name} on {dataset.replace("_", " ")}')
            # charts.append(chart)
            # combine_chart = alt.hconcat(*charts)
            # combine_chart
            
            chart.save("cluster_charts/" + dataset+"_"+technique_name+"/charts/"+dataset + 'HIGH.png', engine="vl-convert", ppi=200, format='png')

    
            # Print the evaluation results
            print(f"Silhouette Score: {silhouette}")
            print(f"Davies-Bouldin Index: {db_index}")
            print(f"Calinski-Harabasz Index: {ch_score}")
            # print(f"Adjusted Rand Index: {ari_score}")
            # print(f"Normalized Mutual Information: {nmi_score}")

Directory 'cluster_charts/fastICA_minmax_kmeans_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_minmax_kmeans_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.42124459756047933
Davies-Bouldin Index: 1.2416636990730072
Calinski-Harabasz Index: 1925.572276337925
Directory 'cluster_charts/fastICA_minmax_DBSCAN_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_minmax_DBSCAN_AL/datasets/' already exists.
1
Directory 'cluster_charts/fastICA_minmax_mean_shift/charts/' already exists.
Directory 'cluster_datasets/fastICA_minmax_mean_shift/datasets/' already exists.
4


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.32670300976989164
Davies-Bouldin Index: 1.6406980888329512
Calinski-Harabasz Index: 2092.559048682312
Directory 'cluster_charts/fastICA_minmax_gaussian_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_minmax_gaussian_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.4196474927211815
Davies-Bouldin Index: 1.1346189933849102
Calinski-Harabasz Index: 1725.3211798380637
Directory 'cluster_charts/fastICA_minmax_optics_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_minmax_optics_AL/datasets/' already exists.
186


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.45599703895186544
Davies-Bouldin Index: 0.7513660793701968
Calinski-Harabasz Index: 49382.82241141426
Directory 'cluster_charts/fastICA_minmax_agglomerative_clustering_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_minmax_agglomerative_clustering_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.4240333188822287
Davies-Bouldin Index: 1.2167234010021528
Calinski-Harabasz Index: 1943.0340443223977
Directory 'cluster_charts/fastICA_maxabs_kmeans_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_maxabs_kmeans_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.4327711254286734
Davies-Bouldin Index: 1.2179776209520738
Calinski-Harabasz Index: 1926.3791650116254
Directory 'cluster_charts/fastICA_maxabs_DBSCAN_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_maxabs_DBSCAN_AL/datasets/' already exists.
1
Directory 'cluster_charts/fastICA_maxabs_mean_shift/charts/' already exists.
Directory 'cluster_datasets/fastICA_maxabs_mean_shift/datasets/' already exists.
4


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.32717459796500925
Davies-Bouldin Index: 1.6398659330795518
Calinski-Harabasz Index: 2096.46159769648
Directory 'cluster_charts/fastICA_maxabs_gaussian_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_maxabs_gaussian_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.4199336779417408
Davies-Bouldin Index: 1.133677690130747
Calinski-Harabasz Index: 1725.8621069899298
Directory 'cluster_charts/fastICA_maxabs_optics_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_maxabs_optics_AL/datasets/' already exists.
185


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.45578333262770415
Davies-Bouldin Index: 0.7519576912734129
Calinski-Harabasz Index: 48891.43195810022
Directory 'cluster_charts/fastICA_maxabs_agglomerative_clustering_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_maxabs_agglomerative_clustering_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.42764835147554714
Davies-Bouldin Index: 1.3103509631730743
Calinski-Harabasz Index: 1853.8232479843005
Directory 'cluster_charts/fastICA_robust_kmeans_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_robust_kmeans_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.7696737003405523
Davies-Bouldin Index: 0.5266894375107269
Calinski-Harabasz Index: 523.1194096370862
Directory 'cluster_charts/fastICA_robust_DBSCAN_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_robust_DBSCAN_AL/datasets/' already exists.
1
Directory 'cluster_charts/fastICA_robust_mean_shift/charts/' already exists.
Directory 'cluster_datasets/fastICA_robust_mean_shift/datasets/' already exists.
4


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.767139471618458
Davies-Bouldin Index: 0.5025816455329127
Calinski-Harabasz Index: 795.3399117053226
Directory 'cluster_charts/fastICA_robust_gaussian_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_robust_gaussian_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.8870319431966655
Davies-Bouldin Index: 1.6924939335053395
Calinski-Harabasz Index: 405.03791497094
Directory 'cluster_charts/fastICA_robust_optics_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_robust_optics_AL/datasets/' already exists.
40


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.775121562670995
Davies-Bouldin Index: 0.3187923083628418
Calinski-Harabasz Index: 5421.41962029137
Directory 'cluster_charts/fastICA_robust_agglomerative_clustering_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_robust_agglomerative_clustering_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.7689361305139772
Davies-Bouldin Index: 0.4822423078619728
Calinski-Harabasz Index: 537.8339699102701
Directory 'cluster_charts/fastICA_standard_kmeans_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_standard_kmeans_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.28650636834096804
Davies-Bouldin Index: 1.5816455110014946
Calinski-Harabasz Index: 649.2947099569641
Directory 'cluster_charts/fastICA_standard_DBSCAN_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_standard_DBSCAN_AL/datasets/' already exists.
1
Directory 'cluster_charts/fastICA_standard_mean_shift/charts/' already exists.
Directory 'cluster_datasets/fastICA_standard_mean_shift/datasets/' already exists.
4


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.23138496850943238
Davies-Bouldin Index: 1.3070925968832507
Calinski-Harabasz Index: 992.5969316704015
Directory 'cluster_charts/fastICA_standard_gaussian_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_standard_gaussian_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.562437737062219
Davies-Bouldin Index: 1.2202286213574116
Calinski-Harabasz Index: 1347.940220484902
Directory 'cluster_charts/fastICA_standard_optics_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_standard_optics_AL/datasets/' already exists.
84


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.35181572384781434
Davies-Bouldin Index: 0.8700262568410838
Calinski-Harabasz Index: 17385.012378635034
Directory 'cluster_charts/fastICA_standard_agglomerative_clustering_AL/charts/' already exists.
Directory 'cluster_datasets/fastICA_standard_agglomerative_clustering_AL/datasets/' already exists.
3


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Silhouette Score: 0.3492878540272359
Davies-Bouldin Index: 1.6327836566537304
Calinski-Harabasz Index: 612.9230540295215


In [132]:
clustereds_data, labels, comp, _ = kmeans_AL(fastICA_minmax)

In [136]:
color_tag = ['#93C4F6', '#005EB8']
color_scale = alt.Scale(domain=np.unique(clustereds_data["labels_tagged"]), range=color_tag)
chart = alt.Chart(clustereds_data).mark_point().encode(
    x='PCA1:Q',
    y='PCA2:Q',
    color=alt.Color('labels_tagged:N', scale=color_scale, legend=alt.Legend(title="Clusters")),  # Use the defined color scale
    tooltip=['PCA1', 'PCA2', 'labels_tagged'],  # Add tooltip information
    # text="{'silhouette':" + str(silhouette) + ", 'davies_bouldin_score':" + str(db_index) + ", 'calinski_harabasz_score':" + str(ch_score) + "}"
).properties(title=f'Wkekke')
chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [126]:
clustereds_data

Unnamed: 0.1,Unnamed: 0,PCA1,PCA2,labels,labels_tagged
0,0,-1.580037,0.861025,0,Group 0
1,1,-1.371473,-1.173119,0,Group 0
2,2,-1.420406,-1.163133,0,Group 0
3,3,-1.366488,-1.181727,0,Group 0
4,4,-1.465478,0.827001,0,Group 0
...,...,...,...,...,...
3130,3130,2.095128,0.717940,1,Group 1
3131,3131,2.061818,-1.268212,1,Group 1
3132,3132,2.200256,0.708712,1,Group 1
3133,3133,2.029491,0.747233,1,Group 1


In [134]:
fastICA_minmax

Unnamed: 0.1,Unnamed: 0,PCA1,PCA2
0,0,-1.580037,0.861025
1,1,-1.371473,-1.173119
2,2,-1.420406,-1.163133
3,3,-1.366488,-1.181727
4,4,-1.465478,0.827001
...,...,...,...
3130,3130,2.095128,0.717940
3131,3131,2.061818,-1.268212
3132,3132,2.200256,0.708712
3133,3133,2.029491,0.747233


In [26]:
import random
import altair as alt
import pandas as pd

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

def generate_color_palette(start_color, end_color, num_colors):
    # Convert hex colors to RGB
    start_rgb = mcolors.hex2color(start_color)
    end_rgb = mcolors.hex2color(end_color)

    # Create a list of RGB colors in the gradient
    colors = []
    for i in range(num_colors):
        r = start_rgb[0] + (end_rgb[0] - start_rgb[0]) * (i / (num_colors - 1))
        g = start_rgb[1] + (end_rgb[1] - start_rgb[1]) * (i / (num_colors - 1))
        b = start_rgb[2] + (end_rgb[2] - start_rgb[2]) * (i / (num_colors - 1))
        colors.append((r, g, b))

    # Convert RGB colors back to hex
    hex_colors = [mcolors.rgb2hex(color) for color in colors]

    return hex_colors
    
# Entries over Time-Visualitaion
protein_db = pd.read_csv("../Quantitative_data.csv", low_memory=False)

d = pd.crosstab(protein_db.bibliography_year, columns=protein_db.Group).cumsum()

d = d.stack().reset_index()
d = d.rename(columns={0:'CummulativeCount'})
d = d.convert_dtypes()
d.dtypes
# Define a custom color palette
start_color = '#005EB8'  # Red
end_color = '#B87200'    # Green
color_list = ['#93C4F6', '#005EB8', '#FFD797', '#D9DE84']
# Generate a color palette with 10 colors
num_colors = len(list(protein_db['Group'].unique()))
palette = generate_color_palette(start_color, end_color, num_colors)
random.shuffle(palette)
custom_palette = alt.Scale(domain=list(protein_db['Group'].unique()),
                           range=color_list[:num_colors])
entries_over_time = alt.Chart(d).mark_bar(size=15, opacity=0.9).encode(
    x=alt.X('bibliography_year:O', title="Year since first structure (1985)"),
    y=alt.Y('CummulativeCount:Q', title = 'Cummulative unique MP structures'),
    color=alt.Color('Group', scale=custom_palette, legend=alt.Legend(title="MP Categories", labelLimit=0)),
    tooltip=[alt.Tooltip('CummulativeCount:Q'),
            alt.Tooltip('Group'),
            alt.Tooltip('bibliography_year:O')]
).properties(
    title="Cumulative number of resolved Membrane protein overtime"
).configure_legend(
    orient='bottom',  # Change the orientation of the legend
    offset=2         # Adjust the offset of the legend
)


no_of_entries = protein_db.shape[0]
name_of_features = list(protein_db.columns)
features_as_str = ', '.join(name_of_features).lower()
db_types_len = len(list(protein_db['Group'].unique()))
db_types_as_str = ', '.join(list(protein_db['Group'].unique())).lower()
print(list(protein_db['Group'].unique()))

# reso_over_time_mean = protein_db[['Group', 'Resolution', 'bibliography_year']].groupby('bibliography_year').mean().reset_index()

# reso_over_time_mean

# db_type_reso = protein_db[['Group', 'Resolution']].groupby(['Group'], as_index = False).mean()

# mean_reso = alt.Chart(reso_over_time_mean).mark_line().encode(
#     y = alt.Y('Resolution:Q',scale=alt.Scale(domain=(2, 4))),
#     x = 'bibliography_year:O'
# )


# no_of_monotopic_proteins = protein_db[protein_db['Group'] == 'MONOTOPIC MEMBRANE PROTEINS'].shape[0]
# mono_prot_ratio = round((no_of_monotopic_proteins / no_of_entries) * 100,2)

# no_of_alpha_proteins = protein_db[protein_db['Group'] == 'TRANSMEMBRANE PROTEINS: ALPHA-HELICAL'].shape[0]
# alpha_prot_ratio = round((no_of_alpha_proteins / no_of_entries) * 100,2)

# no_of_beta_proteins = protein_db[protein_db['Group'] == 'TRANSMEMBRANE PROTEINS: BETA-BARREL'].shape[0]
# beta_prot_ratio = round((no_of_beta_proteins / no_of_entries) * 100,2)

# pd.DataFrame({'ratio':[mono_prot_ratio,alpha_prot_ratio, beta_prot_ratio]})

# mono_text = '''
# Out of {0} database entries there are {1} monotopic membrane proteins, so {2}%. \n
# Their mean resolution is:
# '''.format(no_of_entries, no_of_monotopic_proteins, round(mono_prot_ratio, 2))

# mono_report = pn.Column(
# mono_text
# )
# mono_report
# protein_db.head(3)




entries_over_time
entries_over_time.save('ResolvedMPHIGH.png', engine="vl-convert", ppi=200, format='png')





['MONOTOPIC MEMBRANE PROTEINS', 'TRANSMEMBRANE PROTEINS: ALPHA-HELICAL', 'TRANSMEMBRANE PROTEINS: BETA-BARREL']


In [8]:
d

Unnamed: 0,bibliography_year,Group,CummulativeCount
0,1985,MONOTOPIC MEMBRANE PROTEINS,0
1,1985,TRANSMEMBRANE PROTEINS: ALPHA-HELICAL,1
2,1985,TRANSMEMBRANE PROTEINS: BETA-BARREL,0
3,1987,MONOTOPIC MEMBRANE PROTEINS,0
4,1987,TRANSMEMBRANE PROTEINS: ALPHA-HELICAL,2
...,...,...,...
103,2022,TRANSMEMBRANE PROTEINS: ALPHA-HELICAL,2578
104,2022,TRANSMEMBRANE PROTEINS: BETA-BARREL,372
105,2023,MONOTOPIC MEMBRANE PROTEINS,123
106,2023,TRANSMEMBRANE PROTEINS: ALPHA-HELICAL,2637
