Cyna Shirazinejad, 7/7/21

# Notebook 5: visualize clustering results

outline:

* visualize lifetime cohorts of cmeDNM2+ events
* visualize the lifetime distribution of cmeAnalysisDNM2+ events
* compare the features of events between different model clusters
* * repeat for events within clusters that are cmeDNM2+ events 
* visualize lifetime cohorts of clustered events 
* visualize examples of events within each cluster
* * repeat for examples of events within each cluster that are cmeAnalysisDNM2+
* attempt to predict the identity of events with supervised classifiers for:
* * events that are within the DNM2+ cluster vs. other clusters
* * events within their respective 5 clusters

# import all necessary Python modules

In [None]:
%load_ext autoreload
%autoreload 2
import sys
import pandas as pd
import numpy as np
from scipy import stats
from IPython.display import Image, display
import matplotlib.pyplot as plt
import random
import pickle
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import sklearn.preprocessing as preprocessing
from sklearn.ensemble import RandomForestClassifier
# Read two path strings from .npy files in the working directory
# (presumably written by an earlier notebook in this series -- confirm).
unique_user_path_notebook = str(np.load('unique_user_path_notebook.npy'))
unique_user_saved_outputs = str(np.load('unique_user_saved_outputs.npy'))
sys.path.append(unique_user_path_notebook+'/cmeAnalysisPostProcessingPythonScripts') # add custom Python scripts to the local path
import display_tracks
import return_track_attributes

# load pertinent dataframes from previous notebooks

In [None]:
# Read the tables and small arrays stored under <saved outputs>/dataframes.

# two CSV tables (zip-compressed)
df_pcs_normal_scaled_with_gmm_cluster, df_merged_features = (
    pd.read_csv(unique_user_saved_outputs + relative_path)
    for relative_path in ('/dataframes/df_pcs_normal_scaled_with_gmm_cluster.zip',
                          '/dataframes/df_merged_features.zip')
)

# four numpy-serialized values, unpacked in the order listed
(feature_units,
 number_of_track_splits,
 number_of_clusters,
 index_DNM2positive) = (
    np.load(unique_user_saved_outputs + relative_path)
    for relative_path in ('/dataframes/feature_units.npy',
                          '/dataframes/number_of_track_splits.npy',
                          "/dataframes/number_of_clusters.npy",
                          '/dataframes/cluster_dnm2_positive.npy')
)

In [None]:
# echo the value loaded from cluster_dnm2_positive.npy (used below to pick one GMM cluster)
index_DNM2positive

# load all valid tracks

In [None]:
# Load every saved chunk of valid tracks (files ..._0.npy, ..._1.npy, ...) and
# join them with a single concatenate call; concatenating inside the loop would
# re-copy the growing array on every iteration.
# Chunk 0 is always read, even if number_of_track_splits is below 1.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load files this pipeline wrote.
merged_all_valid_tracks = np.concatenate([
    np.load(unique_user_saved_outputs+'/dataframes/merged_all_valid_tracks_'+str(i)+'.npy', allow_pickle=True)
    for i in range(max(int(number_of_track_splits), 1))
])

# get indices of all valid tracks that cmeAnalysis labeled DNM2-negative or DNM2-positive

In [None]:
# Row labels of df_merged_features, split by the value of the
# 'cmeAnalysis_dynamin2_prediction' column (0.0 -> negative, 1.0 -> positive).
cmeAnalysis_dnm2_prediction = df_merged_features['cmeAnalysis_dynamin2_prediction']

is_cmeAnalysis_negative = cmeAnalysis_dnm2_prediction==0.0
is_cmeAnalysis_positive = cmeAnalysis_dnm2_prediction==1.0

track_indices_cmeAnalysis_dnm2_negative = df_merged_features.loc[is_cmeAnalysis_negative].index.values
track_indices_cmeAnalysis_dnm2_positive = df_merged_features.loc[is_cmeAnalysis_positive].index.values

# cohort plots of cmeAnalysis DNM2+/- events, binned into the lifetime cohorts defined in the next cell

In [None]:
# Lifetime cohorts as [lower, upper] pairs; the last cohort's upper bound is the
# longest lifetime present in df_merged_features.
# Series.max() is vectorized and skips NaN, whereas the builtin max() iterates the
# Series in Python and gives an order-dependent result when NaN is present.
longest_lifetime = int(df_merged_features['lifetime'].max())
cohort_groups = [[[0,40]],[[40,60]],[[60,80]],[[80,longest_lifetime]]]

# persist the cohort definition alongside the other saved dataframes
np.save(unique_user_saved_outputs+"/dataframes/cohort_groups", cohort_groups)

In [None]:
# echo the cohort bounds defined in the previous cell
cohort_groups

In [None]:
%%capture
# One saved figure per cohort, with the cmeAnalysis DNM2-negative and
# DNM2-positive track indices passed as two label groups.
labels = [track_indices_cmeAnalysis_dnm2_negative,track_indices_cmeAnalysis_dnm2_positive]

number_of_label_groups = len(labels)
# third / fourth positional arguments of fit_cohorts; they look like a figure
# size and a subplot grid -- confirm against display_tracks.fit_cohorts
cohort_figure_size = (20,10*number_of_label_groups)
cohort_subplot_grid = (number_of_label_groups,1)

for cohorts in cohort_groups:

    cohort_png = unique_user_saved_outputs+'/plots/cmeAnalysis_dynamin2_significance_'+str(cohorts)+'.png'
    display_tracks.fit_cohorts(merged_all_valid_tracks,
                               labels,
                               cohort_figure_size,
                               cohort_subplot_grid,
                               filename=cohort_png,
                               cohorts=cohorts)

In [None]:
# Show the saved cmeAnalysis cohort figure for each cohort.
for cohorts in cohort_groups:

    cohort_png = unique_user_saved_outputs+'/plots/cmeAnalysis_dynamin2_significance_'+str(cohorts)+'.png'
    cohort_image = Image(filename=cohort_png, height=500, width=500)
    display(cohort_image)

# fit of cmeAnalysis DNM2+ events to Rayleigh distribution

In [None]:
%%capture

# Fit the lifetimes of cmeAnalysis DNM2-positive tracks to a Rayleigh distribution,
# compute three goodness-of-fit numbers (KS p-value, chi-squared p-value, SSE)
# and save a histogram with the fitted pdf overlaid.
plt.style.use('default')

lifetimes = [] # extract lifetimes of all cmeAnalysis DNM2 positive events

# select the cmeAnalysis DNM2-positive tracks by index
cmeAnalysisDNM2positive_tracks = np.array(list(merged_all_valid_tracks))[track_indices_cmeAnalysis_dnm2_positive]

for i in range(len(cmeAnalysisDNM2positive_tracks)):

    lifetimes.append(return_track_attributes.return_track_lifetime(cmeAnalysisDNM2positive_tracks,i))

ray = stats.rayleigh # fit lifetimes to a Rayleigh distribution
param = ray.fit(lifetimes) # used below as (loc, scale)

ks_pvals = stats.kstest(lifetimes,stats.rayleigh(*param).cdf)[1] # measure the goodness-of-fit to Rayleigh distribution

# fitted pdf sampled at 100 points across the observed lifetime range (for the overlay plot)
x = np.linspace(np.min(lifetimes), np.max(lifetimes), 100)
pdf_fitted = ray.pdf(x, loc=param[0], scale=param[1])

# 51 percentile points (0, 2, ..., 100) define 50 bins over the observed lifetimes
percentiles = np.linspace(0,100,51)
percentile_lifetime_thresholds = np.percentile(lifetimes, percentiles)
obs_freq, bins = np.histogram(lifetimes, percentile_lifetime_thresholds);
cum_obs_freq = np.cumsum(obs_freq)

# fitted cdf evaluated at the same bin edges
cdf_fitted = stats.rayleigh.cdf(percentile_lifetime_thresholds, loc=param[-2], scale=param[-1]);

expected_frequency = []

# expected count per bin = (cdf difference across the bin) * number of lifetimes
for bin_ in range(len(percentiles)-1):

    expected_cdf_area = cdf_fitted[bin_+1] - cdf_fitted[bin_]
    expected_frequency.append(expected_cdf_area * len(lifetimes))

cum_expected_frequency = np.cumsum(np.array(expected_frequency))

# NOTE(review): the chi-squared test is run on cumulative counts rather than per-bin
# counts; chi-squared goodness-of-fit is normally applied to per-bin counts -- confirm this is intended.
chi_sq_stat, chi_sq_pval = stats.chisquare(cum_obs_freq, cum_expected_frequency);

# obs_freq and bins are rebound here: now a density histogram over the same edges
obs_freq, bins = np.histogram(lifetimes, percentile_lifetime_thresholds, density=True)
# fitted pdf evaluated at the right edge of each bin (bins[1:])
pdf = stats.rayleigh.pdf(bins[1:], *param)
sse = np.sum(np.power(obs_freq - pdf, 2.0))    

plt.figure(figsize=(8,8),dpi=500)

plt.hist(lifetimes, density=1, label='raw lifetimes', bins='auto')

plt.plot(x,pdf_fitted, label='fitted lifetimes')

plt.xlabel('lifetime (s)', fontsize=20)
plt.ylabel('frequency density', fontsize=20)
# the three goodness-of-fit numbers are reported in the figure title
plt.title('lifetimes of all cmeAnalysis determined DNM2 positive events'+
          '\np-value of Kolmogorov-Smirnov test: ' + str(ks_pvals)+
          '\np-value of Chi-squared test: ' + str(chi_sq_pval)+
          '\nSSE: ' + str(sse), fontsize=10)
plt.savefig(unique_user_saved_outputs+'/plots/cmeAnalyis_dnm2_positive_events_fit_rayleigh.png', bbox_inches='tight')

In [None]:
# show the Rayleigh-fit figure saved to the plots folder
Image(filename=unique_user_saved_outputs+'/plots/cmeAnalyis_dnm2_positive_events_fit_rayleigh.png', height=500, width=500)

# get indices of tracks in 5 GMM clusters

In [None]:
# Row labels of the clustering dataframe, grouped by GMM prediction
# (entry k holds the rows whose 'gmm_predictions' value equals k).
gmm_predictions = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
gmm_class_indices = [
    df_pcs_normal_scaled_with_gmm_cluster[gmm_predictions==cluster_num].index.values
    for cluster_num in range(number_of_clusters)
]

cme_positive_total = 0

# For each cluster, print its size and its overlap with the
# cmeAnalysis DNM2-negative / DNM2-positive index sets.
for i,labels_list in enumerate(gmm_class_indices):

    cluster_members = set(labels_list)
    cme_negative = len(cluster_members & set(track_indices_cmeAnalysis_dnm2_negative))
    cme_positive = len(cluster_members & set(track_indices_cmeAnalysis_dnm2_positive))

    class_size = len(labels_list)
    print('the number of members in class {}: {}'.format(i,class_size))
    print('the number of members of class {} in dynamin2-negative predicted by cmeAnalysis: {}'.format(i,cme_negative))
    print('the number of members of class {} in dynamin2-positive predicted by cmeAnalysis: {}'.format(i,cme_positive))

# feature comparison between model clusters

In [None]:
%%capture

# Cumulative histograms of every feature: one panel per feature, one curve per
# GMM cluster. The figure is saved to the plots folder.
plt.style.use('default')

num_columns = 5
# add_subplot needs integer grid dimensions; np.ceil returns a float, which
# current Matplotlib releases reject, so convert explicitly.
num_rows = int(np.ceil(len(feature_units)/num_columns))

plot_position = 1

f = plt.figure(dpi=500, figsize=(30,30))

df = df_pcs_normal_scaled_with_gmm_cluster


for i in range(len(feature_units)):

    ax = f.add_subplot(num_rows, num_columns, plot_position)
    plot_position+=1

    feature_name = df_merged_features.columns[i]
    all_feature_values = df_merged_features[feature_name].values

    for cluster_num in range(number_of_clusters):

        cluster_features_indices = gmm_class_indices[cluster_num]
        cluster_features = all_feature_values[cluster_features_indices]

        # an empty cluster has nothing to draw, and bins=0 is not a valid bin count
        if len(cluster_features) == 0:
            continue

        # one bin per member of the cluster
        ax.hist(cluster_features, label=cluster_num, bins=len(cluster_features), density=True, histtype='step', cumulative=True)

    ax.set_xlabel(feature_name+' ('+feature_units[i]+')',fontsize=5)
    ax.set_ylabel('cumulative frequency',fontsize=5)
    # x-range spans the feature's values over all tracks, not only the plotted cluster
    ax.set_xlim([np.min(all_feature_values), np.max(all_feature_values)])
    ax.tick_params(axis='both', which='major', labelsize=3)
    ax.tick_params(axis='both', which='minor', labelsize=3)
    ax.legend(loc='best')
    ax.grid()

f.suptitle('track features, comparing GMM components')
f.savefig(unique_user_saved_outputs+'/plots/all_features_compared_between_classes.png', bbox_inches='tight')

In [None]:
# show the saved per-cluster feature comparison figure
Image(filename=unique_user_saved_outputs+'/plots/all_features_compared_between_classes.png', height=500, width=500)

# feature comparison of DNM2+ events across experiments

In [None]:
%%capture

# Cumulative histograms of every feature for members of the DNM2-positive GMM
# cluster: one panel per feature, one curve per experiment number.
plt.style.use('default')

plot_position = 1

f = plt.figure(dpi=500, figsize=(30,30))

df = df_pcs_normal_scaled_with_gmm_cluster
colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'magenta', 'grey']

indices_dnm2_positive = gmm_class_indices[index_DNM2positive]

experiment_numbers = df_merged_features['experiment_number'].values
# experiments are addressed as 0 .. (number of distinct values - 1)
number_of_experiments = len(set(experiment_numbers))

for i in range(len(feature_units)):

    # int(): add_subplot only accepts integer grid dimensions
    ax = f.add_subplot(int(num_rows), int(num_columns), plot_position)
    plot_position+=1

    feature_name = df_merged_features.columns[i]
    all_feature_values = df_merged_features[feature_name].values

    for exp_number in range(0,number_of_experiments):

        indices_experiment = np.where(experiment_numbers==exp_number)[0]
        # intersect1d keeps an integer dtype even when the result is empty,
        # so it is always usable as an index array
        indices_dnm2_positive_experiment = np.intersect1d(indices_dnm2_positive, indices_experiment)

        # no DNM2-positive tracks in this experiment: nothing to draw
        if indices_dnm2_positive_experiment.size == 0:
            continue

        features_in_experiment = all_feature_values[indices_dnm2_positive_experiment]

        # cycle through the palette so more than len(colors) experiments do not raise IndexError
        ax.hist(features_in_experiment, label=exp_number, bins='auto', density=True, histtype='step', cumulative=True,
                color=colors[exp_number % len(colors)])

    ax.set_xlabel(feature_name+' ('+feature_units[i]+')',fontsize=5)
    ax.set_ylabel('cumulative frequency',fontsize=5)
    # x-range spans the feature's values over all tracks
    ax.set_xlim([np.min(all_feature_values), np.max(all_feature_values)])
    ax.tick_params(axis='both', which='major', labelsize=3)
    ax.tick_params(axis='both', which='minor', labelsize=3)
    ax.legend(loc='best')
    ax.grid()

f.suptitle('track features, comparing DNM2+ across experiments')
f.savefig(unique_user_saved_outputs+'/plots/all_features_comparing_dnm2_pos_across_experiments.png', bbox_inches='tight')

In [None]:
# show the saved DNM2+ across-experiments feature comparison figure
Image(filename=unique_user_saved_outputs+'/plots/all_features_comparing_dnm2_pos_across_experiments.png', height=500, width=500)

# feature comparison between model clusters for members that overlap with cmeAnalysis' DNM2 positive prediction

In [None]:
%%capture

# Cumulative histograms of every feature: one panel per feature, one curve per
# GMM cluster, restricted to tracks that are also in the cmeAnalysis
# DNM2-positive index set.
plt.style.use('default')

plot_position = 1

f = plt.figure(dpi=500, figsize=(30,30))

df = df_pcs_normal_scaled_with_gmm_cluster


for i in range(len(feature_units)):

    # int(): add_subplot only accepts integer grid dimensions
    ax = f.add_subplot(int(num_rows), int(num_columns), plot_position)
    plot_position+=1

    feature_name = df_merged_features.columns[i]
    all_feature_values = df_merged_features[feature_name].values

    for cluster_num in range(number_of_clusters):

        labels_list = gmm_class_indices[cluster_num]
        cme_positive_overlap = np.intersect1d(labels_list, track_indices_cmeAnalysis_dnm2_positive)

        # a cluster without any cmeAnalysis DNM2-positive member has nothing
        # to draw, and bins=0 is not a valid bin count
        if cme_positive_overlap.size == 0:
            continue

        cluster_features = all_feature_values[cme_positive_overlap]

        # one bin per overlapping track
        ax.hist(cluster_features, label=cluster_num, bins=len(cluster_features), density=True, histtype='step', cumulative=True)

    ax.set_xlabel(feature_name+' ('+feature_units[i]+')',fontsize=5)
    ax.set_ylabel('cumulative frequency',fontsize=5)
    # x-range spans the feature's values over all tracks
    ax.set_xlim([np.min(all_feature_values), np.max(all_feature_values)])
    ax.tick_params(axis='both', which='major', labelsize=3)
    ax.tick_params(axis='both', which='minor', labelsize=3)
    ax.legend(loc='best')
    ax.grid()

f.suptitle('track features, comparing GMM components')
f.savefig(unique_user_saved_outputs+'/plots/all_features_compared_between_classes_overlap_cmeAnalysis_dnm2_positive.png', bbox_inches='tight')

In [None]:
# show the saved per-cluster comparison restricted to cmeAnalysis DNM2-positive tracks
Image(filename=unique_user_saved_outputs+'/plots/all_features_compared_between_classes_overlap_cmeAnalysis_dnm2_positive.png', height=500, width=500)

# feature comparison between model clusters, separating DNM2+ cluster and rest

In [None]:
%%capture

# Cumulative histograms of every feature, one panel per feature: the
# DNM2-positive GMM cluster is drawn in black, every other cluster in red.
plt.style.use('default')

plot_position = 1

f = plt.figure(dpi=500, figsize=(30,30))

df = df_pcs_normal_scaled_with_gmm_cluster

clusters = list(range(number_of_clusters))
clusters.pop(index_DNM2positive)
# Exactly one of the red clusters carries the legend label so the legend shows a
# single 'DNM2 negative' entry; with only one cluster there is none to pick.
random_not_dnm2positive_cluster = random.choice(clusters) if clusters else None

for i in range(len(feature_units)):

    # int(): add_subplot only accepts integer grid dimensions
    ax = f.add_subplot(int(num_rows), int(num_columns), plot_position)
    plot_position+=1

    feature_name = df_merged_features.columns[i]
    all_feature_values = df_merged_features[feature_name].values

    for cluster_num in range(number_of_clusters):

        cluster_features_indices = gmm_class_indices[cluster_num]
        cluster_features = all_feature_values[cluster_features_indices]

        # an empty cluster has nothing to draw, and bins=0 is not a valid bin count
        if len(cluster_features) == 0:
            continue

        if cluster_num==index_DNM2positive:
            curve_style = {'color': 'black', 'label': 'DNM2 positive'}
        elif cluster_num==random_not_dnm2positive_cluster:
            curve_style = {'color': 'red', 'label': 'DNM2 negative'}
        else:
            curve_style = {'color': 'red'}

        ax.hist(cluster_features, bins=len(cluster_features), density=True, histtype='step', cumulative=True, **curve_style)

    ax.set_xlabel(feature_name+' ('+feature_units[i]+')',fontsize=5)
    ax.set_ylabel('cumulative frequency',fontsize=5)
    # x-range spans the feature's values over all tracks
    ax.set_xlim([np.min(all_feature_values), np.max(all_feature_values)])
    ax.tick_params(axis='both', which='major', labelsize=3)
    ax.tick_params(axis='both', which='minor', labelsize=3)
    ax.legend(loc='best')
    ax.grid()

f.suptitle('track features, comparing GMM components')
f.savefig(unique_user_saved_outputs+'/plots/all_features_compared_between_classes_highlighting_dnm2positive.png', bbox_inches='tight')

In [None]:
# show the saved comparison that highlights the DNM2-positive cluster
Image(filename=unique_user_saved_outputs+'/plots/all_features_compared_between_classes_highlighting_dnm2positive.png', height=500, width=500)

# cohort plots of GMM class, binned in cohorts defined above

In [None]:
%%capture

# One saved cohort figure per cohort, with the GMM clusters as the label groups.
plt.style.use('default')

labels = gmm_class_indices
number_of_label_groups = len(labels)

for cohorts in cohort_groups:

    gmm_cohort_png = (unique_user_saved_outputs
                      +'/plots/gmm_'+str(number_of_clusters)
                      +'_clusters_cohorts_'+str(cohorts)+'.png')

    display_tracks.fit_cohorts(merged_all_valid_tracks,
                               labels,
                               (20,10*number_of_label_groups),
                               (number_of_label_groups,1),
                               filename=gmm_cohort_png,
                               cohorts=cohorts)

In [None]:
# Show the saved GMM cohort figure for each cohort.
for cohorts in cohort_groups:

    gmm_cohort_png = (unique_user_saved_outputs
                      +'/plots/gmm_'+str(number_of_clusters)
                      +'_clusters_cohorts_'+str(cohorts)+'.png')
    display(Image(filename=gmm_cohort_png, height=500, width=500))

# cohort plots of GMM class, fix axes for each cohort to compare clusters

In [None]:
%%capture
# Cohort figures for the GMM clusters with one fixed bound per cohort, so that
# clusters can be compared on common axes.
plt.style.use('default')

# one value per entry of cohort_groups, passed as the third argument of
# fit_cohorts_fixed_axes (presumably the upper axis limit -- confirm against display_tracks)
upperbounds = [180, 350, 500, 700]

# Derive figure height and subplot rows from the number of clusters instead of
# hard-coding them for 5 clusters; this matches how fit_cohorts is called
# elsewhere in this notebook and yields (20,50) / (5,1) for 5 clusters.
number_of_label_groups = len(gmm_class_indices)

for i in range(len(cohort_groups)):

    display_tracks.fit_cohorts_fixed_axes(merged_all_valid_tracks,
                                          gmm_class_indices,
                                          upperbounds[i],
                                          (20,10*number_of_label_groups),
                                          (number_of_label_groups,1),
                                          filename=unique_user_saved_outputs+'/plots/gmm_fixed_axis_cohort_'+str(number_of_clusters)+'_clusters_cohorts_'+str(cohort_groups[i])+'.png',
                                          cohorts=cohort_groups[i])

In [None]:
# Show the saved fixed-axis cohort figure for each cohort.
for i, cohort_group in enumerate(cohort_groups):

    fixed_axis_png = (unique_user_saved_outputs
                      +'/plots/gmm_fixed_axis_cohort_'+str(number_of_clusters)
                      +'_clusters_cohorts_'+str(cohort_group)+'.png')
    display(Image(filename=fixed_axis_png, height=500, width=500))

# plot examples of random samples from each GMM class

In [None]:
%%capture

# Save one example-tracks figure per GMM cluster.
plt.style.use('default')

for i, class_indices in enumerate(gmm_class_indices):

    class_examples_png = (unique_user_saved_outputs
                          +'/plots/gmm_'+str(number_of_clusters)
                          +'_clusters_class_'+str(i)+'.png')

    # positional arguments are forwarded unchanged; see
    # display_tracks.plot_subplots_of_labels for their meaning
    display_tracks.plot_subplots_of_labels(merged_all_valid_tracks,
                                           2,
                                           ['m','g'],
                                           [0, 1],
                                           class_indices,
                                           int(number_of_clusters),
                                           50,
                                           filename=class_examples_png)

In [None]:
# Print each cluster number, then show its saved example-tracks figure.
for i in range(len(gmm_class_indices)):
    print('gmm cluster: ' + str(i))
    class_examples_png = (unique_user_saved_outputs
                          +'/plots/gmm_'+str(number_of_clusters)
                          +'_clusters_class_'+str(i)+'.png')
    display(Image(filename=class_examples_png, height=500, width=500))

# include background significance thresholds

In [None]:
%%capture

# Save one example-tracks figure per GMM cluster, this time with include_background=True.
plt.style.use('default')

for i, class_indices in enumerate(gmm_class_indices):

    class_examples_png = (unique_user_saved_outputs
                          +'/plots/gmm_'+str(number_of_clusters)
                          +'_clusters_class_'+str(i)+'_including_background.png')

    # positional arguments are forwarded unchanged; see
    # display_tracks.plot_subplots_of_labels for their meaning
    display_tracks.plot_subplots_of_labels(merged_all_valid_tracks,
                                           2,
                                           ['m','g'],
                                           [0, 1],
                                           class_indices,
                                           int(number_of_clusters),
                                           50,
                                           filename=class_examples_png,
                                           include_background=True)

In [None]:
# Print each cluster number, then show its saved figure that includes the background.
for i in range(len(gmm_class_indices)):
    print('gmm cluster: ' + str(i))
    class_examples_png = (unique_user_saved_outputs
                          +'/plots/gmm_'+str(number_of_clusters)
                          +'_clusters_class_'+str(i)+'_including_background.png')
    display(Image(filename=class_examples_png, height=500, width=500))

# plot examples of random samples from each GMM class that also overlap with DNM2-positive events from cmeAnalysis

In [None]:
%%capture

# Save one example-tracks figure per GMM cluster, drawn only from members that
# are also in the cmeAnalysis DNM2-positive index set.
plt.style.use('default')

for i,labels_list in enumerate(gmm_class_indices):

    cluster_members = set(labels_list)
    cme_positive_overlap = list(cluster_members.intersection(track_indices_cmeAnalysis_dnm2_positive))

    overlap_examples_png = (unique_user_saved_outputs
                            +'/plots/gmm_'+str(number_of_clusters)
                            +'_clusters_overlap_cmeDNM2positive_class_'+str(i)+'.png')

    # positional arguments are forwarded unchanged; see
    # display_tracks.plot_subplots_of_labels for their meaning
    display_tracks.plot_subplots_of_labels(merged_all_valid_tracks,
                                           2,
                                           ['m', 'g'],
                                           [0, 1],
                                           cme_positive_overlap,
                                           int(number_of_clusters),
                                           50,
                                           filename=overlap_examples_png)

In [None]:
# Print each cluster number, then show its saved cmeAnalysis-overlap figure.
for i in range(len(gmm_class_indices)):
    print('gmm cluster: ' + str(i))
    overlap_examples_png = (unique_user_saved_outputs
                            +'/plots/gmm_'+str(number_of_clusters)
                            +'_clusters_overlap_cmeDNM2positive_class_'+str(i)+'.png')
    display(Image(filename=overlap_examples_png, height=500, width=500))

# classifier between all classes using principal components

In [None]:
# Train an SVC to predict the GMM cluster from two columns of the PC dataframe,
# show the confusion matrix, then mark on a PC scatter the test points where the
# classifier and the GMM disagree about membership in the DNM2-positive cluster.
labels = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
# NOTE(review): assumes the first two columns are 'PC-0' and 'PC-1' (the columns plotted below) -- confirm.
values = df_pcs_normal_scaled_with_gmm_cluster.values[:,:2]

X_train, X_test, y_train, y_test = train_test_split(values, labels, random_state=713, test_size=0.5)
clf = SVC()
clf.fit(X_train, y_train)

print("Accuracy on test data: {:.4f}".format(clf.score(X_test, y_test)))

fig, axes = plt.subplots(dpi=500)
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm).plot(ax=axes)
plt.show()

# Positional masks over the test split. Both the true and the predicted side are
# compared against index_DNM2positive; comparing predictions against a literal 0
# is only right when the DNM2-positive cluster happens to be cluster 0.
is_true_dnm2 = np.asarray(y_test)==index_DNM2positive
is_pred_dnm2 = np.asarray(y_pred)==index_DNM2positive

indices_true_dnm2 = np.where(is_true_dnm2)[0]

indices_pred_other = np.where(~is_pred_dnm2)[0]

# true DNM2-positive cluster, predicted as another cluster (false negatives)
indices_false_other_true_dnm2 = list(np.where(is_true_dnm2 & ~is_pred_dnm2)[0])

indices_true_class_other = np.where(~is_true_dnm2)[0]

# name kept from the original cell: positions predicted as the DNM2-positive cluster
indices_pred_class_0 = np.where(is_pred_dnm2)[0]

# true other cluster, predicted as the DNM2-positive cluster (false positives)
indices_false_dnm2_true_class_other = list(np.where(~is_true_dnm2 & is_pred_dnm2)[0])

plt.style.use('default')
plt.figure(dpi=500, figsize=(5,5))
# background: every track in PC space
plt.scatter(df_pcs_normal_scaled_with_gmm_cluster['PC-0'],
            df_pcs_normal_scaled_with_gmm_cluster['PC-1'],
            alpha=0.1,
            s=0.5,
            c='blue')
plt.scatter(X_test[:,0][indices_false_other_true_dnm2],
            X_test[:,1][indices_false_other_true_dnm2],
            alpha=1,
            s=3,
            c='pink',
            label='DNM2+ false negative')
plt.scatter(X_test[:,0][indices_false_dnm2_true_class_other],
            X_test[:,1][indices_false_dnm2_true_class_other],
            alpha=1,
            s=3,
            c='red',
            label='DNM2+ false positive')
plt.xlabel('PC-0')
plt.ylabel('PC-1')
plt.ylim([-7, 11])
plt.xlim([-9, 13])
plt.xticks([-5, 0, 5, 10], labels=[-5, 0, 5, 10])
plt.yticks([-5, 0, 5, 10], labels=[-5, 0, 5, 10])
plt.legend()
plt.title('principal components of valid tracks\nmapped to feature-space')
plt.tight_layout()

# classifier between all classes using raw features

# with SVC

In [None]:
# SVC predicting the GMM cluster from the standardized first len(feature_units)
# columns of df_merged_features; prints test accuracy and plots the confusion matrix.
labels = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']

number_of_features = len(feature_units)
values = preprocessing.StandardScaler().fit_transform(df_merged_features.values[:,:number_of_features])

# 80 % train / 20 % test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(values, labels, test_size=0.2, random_state=713)

clf = SVC()
clf.fit(X_train, y_train)

test_accuracy = clf.score(X_test, y_test)
print("Accuracy on test data: {:.4f}".format(test_accuracy))

fig, axes = plt.subplots(dpi=500)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot(ax=axes)

# with Random Forest

In [None]:
# Random forest predicting the GMM cluster from the standardized first
# len(feature_units) columns of df_merged_features; prints test accuracy and
# plots the confusion matrix.
labels = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']

number_of_features = len(feature_units)
values = preprocessing.StandardScaler().fit_transform(df_merged_features.values[:,:number_of_features])

# 80 % train / 20 % test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(values, labels, test_size=0.2, random_state=713)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

test_accuracy = clf.score(X_test, y_test)
print("Accuracy on test data: {:.4f}".format(test_accuracy))

fig, axes = plt.subplots(dpi=500)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot(ax=axes)

# classifier between all classes using raw features and zero-ing out DNM2 features

# with SVC

In [None]:
# SVC predicting the GMM cluster from the raw features after every column whose
# name contains 'dnm2' has been set to 0; prints test accuracy and plots the
# confusion matrix.
labels = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
# Work on a copy: DataFrame.values can return a view of the frame's data (or a
# read-only array under copy-on-write), so zeroing columns in place could
# silently alter df_merged_features for later cells, or fail outright.
values = df_merged_features.values[:,:len(feature_units)].copy()
feature_names = df_merged_features.columns.values
for i in range(len(feature_units)):
    if 'dnm2' in feature_names[i]:
        values[:,i] = 0
values = preprocessing.StandardScaler().fit_transform(values)
X_train, X_test, y_train, y_test = train_test_split(values, labels, random_state=713, test_size=0.5)
clf = SVC()
clf.fit(X_train, y_train)

print("Accuracy on test data: {:.4f}".format(clf.score(X_test, y_test)))

fig, axes = plt.subplots(dpi=500)
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm).plot(ax=axes)

# with Random Forest

In [None]:
# Random forest predicting the GMM cluster from the raw (unscaled) features
# after every column whose name contains 'dnm2' has been set to 0; prints test
# accuracy and plots the confusion matrix.
labels = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
# Work on a copy: DataFrame.values can return a view of the frame's data (or a
# read-only array under copy-on-write), so zeroing columns in place could
# silently alter df_merged_features for later cells, or fail outright.
values = df_merged_features.values[:,:len(feature_units)].copy()
feature_names = df_merged_features.columns.values
for i in range(len(feature_units)):
    if 'dnm2' in feature_names[i]:
        values[:,i] = 0
X_train, X_test, y_train, y_test = train_test_split(values, labels, random_state=713, test_size=0.5)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

print("Accuracy on test data: {:.4f}".format(clf.score(X_test, y_test)))

fig, axes = plt.subplots(dpi=500)
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm).plot(ax=axes)

# classifier between all classes using raw features and zero-ing out AP2 features

# with SVC

In [None]:
from sklearn.model_selection import ShuffleSplit

In [None]:
# SVC predicting the GMM cluster from the raw features after every column whose
# name contains 'ap2' has been set to 0; prints test accuracy and plots the
# confusion matrix.
labels = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
# Work on a copy: DataFrame.values can return a view of the frame's data (or a
# read-only array under copy-on-write), so zeroing columns in place could
# silently alter df_merged_features for later cells, or fail outright.
values = df_merged_features.values[:,:len(feature_units)].copy()
feature_names = df_merged_features.columns.values
for i in range(len(feature_units)):
    if 'ap2' in feature_names[i]:
        values[:,i] = 0
values = preprocessing.StandardScaler().fit_transform(values)
X_train, X_test, y_train, y_test = train_test_split(values, labels, random_state=713, test_size=0.5)
clf = SVC()
clf.fit(X_train, y_train)

print("Accuracy on test data: {:.4f}".format(clf.score(X_test, y_test)))

fig, axes = plt.subplots(dpi=500)
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm).plot(ax=axes)

# with Random Forest

In [None]:
# Random forest predicting the GMM cluster from the raw (unscaled) features
# after every column whose name contains 'ap2' has been set to 0; prints test
# accuracy and plots the confusion matrix.
labels = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
# Work on a copy: DataFrame.values can return a view of the frame's data (or a
# read-only array under copy-on-write), so overwriting columns in place could
# silently alter df_merged_features for later cells, or fail outright.
values = df_merged_features.values[:,:len(feature_units)].copy()
feature_names = df_merged_features.columns.values
for i in range(len(feature_units)):
    if 'ap2' in feature_names[i]:
        # Blank the column with a constant 0, as the section heading states and
        # as every other ablation cell in this notebook does (unseeded random
        # noise made this cell differ from the rest and non-reproducible).
        values[:,i] = 0
X_train, X_test, y_train, y_test = train_test_split(values, labels, random_state=713, test_size=0.5)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

print("Accuracy on test data: {:.4f}".format(clf.score(X_test, y_test)))

fig, axes = plt.subplots(dpi=500)
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm).plot(ax=axes)

# classifier between DNM2+ and other classes using raw features and zero-ing out DNM2 features

# with SVC

In [None]:
# Scratch demo of ShuffleSplit on a toy array: print the splitter twice, then
# the train / test indices of its single split.
import numpy as np
from sklearn.model_selection import ShuffleSplit

toy_points = [[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]]
toy_targets = [1, 2, 1, 2, 1, 2]
X = np.array(toy_points)
y = np.array(toy_targets)

rs = ShuffleSplit(n_splits=1, test_size=.25, random_state=0)
print(rs)
rs.get_n_splits(X)  # return value is not used
print(rs)

for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)





# rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25,
#                   random_state=0)
# for train_index, test_index in rs.split(X):
#     print("TRAIN:", train_index, "TEST:", test_index)

In [None]:
# echo the number of entries in feature_units (the column count sliced from df_merged_features by the classifier cells)
len(feature_units)

In [None]:
# Binary SVC: DNM2-positive GMM cluster (1) vs. every other cluster (0), using
# the raw features with every column whose name contains 'dnm2' set to 0.
# Accuracy is printed for 5 random 50/50 splits.
labels_temp = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
# vectorized, position-based comparison (no dependence on the Series index labels)
labels = (labels_temp.values==index_DNM2positive).astype(int)
# Work on a copy: DataFrame.values can return a view of the frame's data (or a
# read-only array under copy-on-write), so zeroing columns in place could
# silently alter df_merged_features for later cells, or fail outright.
values = df_merged_features.values[:,:len(feature_units)].copy()
feature_names = df_merged_features.columns.values
for i in range(len(feature_units)):
    if 'dnm2' in feature_names[i]:
        values[:,i] = 0
values = preprocessing.StandardScaler().fit_transform(values)
# no random_state is set, so the splits differ from run to run
rs = ShuffleSplit(n_splits=5, test_size=.5)
for train_idx, test_idx in rs.split(values):
    X_train, X_test, y_train, y_test = values[train_idx], values[test_idx], labels[train_idx], labels[test_idx]
    clf = SVC()
    clf.fit(X_train, y_train)

    print("Accuracy on test data: {:.4f}".format(clf.score(X_test, y_test)))

# with Random Forest

In [None]:
# Binary random forest: DNM2-positive GMM cluster (1) vs. every other cluster
# (0), using the raw features with every column whose name contains 'dnm2' set
# to 0. Accuracy is printed for 5 random 50/50 splits.
labels_temp = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
# vectorized, position-based comparison (no dependence on the Series index labels)
labels = (labels_temp.values==index_DNM2positive).astype(int)
# Work on a copy: DataFrame.values can return a view of the frame's data (or a
# read-only array under copy-on-write), so zeroing columns in place could
# silently alter df_merged_features for later cells, or fail outright.
values = df_merged_features.values[:,:len(feature_units)].copy()
feature_names = df_merged_features.columns.values
for i in range(len(feature_units)):
    if 'dnm2' in feature_names[i]:
        values[:,i] = 0
values = preprocessing.StandardScaler().fit_transform(values)
# no random_state is set, so the splits differ from run to run
rs = ShuffleSplit(n_splits=5, test_size=.5)
for train_idx, test_idx in rs.split(values):
    X_train, X_test, y_train, y_test = values[train_idx], values[test_idx], labels[train_idx], labels[test_idx]
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)

    print("Accuracy on test data: {:.4f}".format(clf.score(X_test, y_test)))

# classifier between DNM2+ and other classes using raw features and zero-ing out AP2 features

# with SVC

In [None]:
# Binary SVC: DNM2-positive GMM cluster (1) vs. every other cluster (0), using
# the raw features with every column whose name contains 'ap2' set to 0.
# Accuracy is printed for 5 random 50/50 splits.
labels_temp = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
# vectorized, position-based comparison (no dependence on the Series index labels)
labels = (labels_temp.values==index_DNM2positive).astype(int)
# Work on a copy: DataFrame.values can return a view of the frame's data (or a
# read-only array under copy-on-write), so zeroing columns in place could
# silently alter df_merged_features for later cells, or fail outright.
values = df_merged_features.values[:,:len(feature_units)].copy()
feature_names = df_merged_features.columns.values
for i in range(len(feature_units)):
    if 'ap2' in feature_names[i]:
        values[:,i] = 0
values = preprocessing.StandardScaler().fit_transform(values)
# no random_state is set, so the splits differ from run to run
rs = ShuffleSplit(n_splits=5, test_size=.5)
for train_idx, test_idx in rs.split(values):
    X_train, X_test, y_train, y_test = values[train_idx], values[test_idx], labels[train_idx], labels[test_idx]
    clf = SVC()
    clf.fit(X_train, y_train)

    print("Accuracy on test data: {:.4f}".format(clf.score(X_test, y_test)))

# with Random Forest

In [None]:
# Binary random forest: DNM2-positive GMM cluster (1) vs. every other cluster
# (0), using the raw features with every column whose name contains 'ap2' set
# to 0. Accuracy is printed for 5 random 50/50 splits.
labels_temp = df_pcs_normal_scaled_with_gmm_cluster['gmm_predictions']
# vectorized, position-based comparison (no dependence on the Series index labels)
labels = (labels_temp.values==index_DNM2positive).astype(int)
# Work on a copy: DataFrame.values can return a view of the frame's data (or a
# read-only array under copy-on-write), so zeroing columns in place could
# silently alter df_merged_features for later cells, or fail outright.
values = df_merged_features.values[:,:len(feature_units)].copy()
feature_names = df_merged_features.columns.values
for i in range(len(feature_units)):
    if 'ap2' in feature_names[i]:
        values[:,i] = 0
values = preprocessing.StandardScaler().fit_transform(values)
# no random_state is set, so the splits differ from run to run
rs = ShuffleSplit(n_splits=5, test_size=.5)
for train_idx, test_idx in rs.split(values):
    X_train, X_test, y_train, y_test = values[train_idx], values[test_idx], labels[train_idx], labels[test_idx]
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)

    print("Accuracy on test data: {:.4f}".format(clf.score(X_test, y_test)))