In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import LabelBinarizer

In [9]:
def feature_silhouette(X_train, y_train, variance_threshold=0.1, shuffle_data=True):
    """Compute a one-dimensional silhouette score for every sufficiently variable column.

    Each retained column is treated as a 1-D dataset and scored with
    ``silhouette_score`` using the classes in ``y_train`` as the cluster
    assignment, so a high score means that column alone separates the classes.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Numeric table with one row per sample and one column per feature.
    y_train : pandas.Series or 1-D array-like
        Class label of every sample. A Series is aligned to ``X_train`` by
        index label; any other array-like is matched to the rows by position.
    variance_threshold : float, default 0.1
        Only columns whose population variance (ddof=0) is strictly greater
        than this value are scored.
    shuffle_data : bool, default True
        Shuffle rows and labels together (``random_state=0``) before scoring.

    Returns
    -------
    sil_scores : pandas.DataFrame
        Single column ``"silhouette_score"``, indexed by the retained columns.
    variable_features : pandas.Series of bool
        Mask over the original columns of ``X_train``; True where the column
        passed the variance filter.

    Raises
    ------
    ValueError
        If the labels cannot be matched one-to-one with the rows of
        ``X_train``. Errors raised by ``silhouette_score`` itself (for example
        fewer than two classes) propagate unchanged.

    Side effects
    ------------
    Prints a progress line every 500 columns and writes the scores to
    ``./feature_silhouette_scores(var_<variance_threshold>).csv``.
    """
    # Per-column population variance. Calling the DataFrame method with an explicit axis
    # (and ddof=0, which is what np.var uses) avoids np.var(DataFrame), whose axis=None
    # handling differs between pandas versions.
    variable_features = X_train.var(axis=0, ddof=0) > variance_threshold
    X_train = X_train.loc[:, variable_features]

    if shuffle_data:
        X_train, y_train = shuffle(X_train, y_train, random_state=0)

    # Bring the labels into row order. A Series whose index differs from the rows is
    # realigned by label rather than silently paired by position.
    if isinstance(y_train, pd.Series) and not y_train.index.equals(X_train.index):
        if not X_train.index.isin(y_train.index).all():
            raise ValueError("y_train has no label for some rows of X_train")
        y_train = y_train.reindex(X_train.index)
    labels = np.ravel(np.asarray(y_train))
    if len(labels) != len(X_train):
        raise ValueError(
            f"X_train has {len(X_train)} rows but y_train has {len(labels)} labels"
        )

    # The class labels themselves are the cluster assignment. Using the per-class mean of
    # each column as a stand-in label would merge classes that happen to share a mean.
    n_columns = X_train.shape[1]
    scores = np.full(n_columns, -1.0)
    for i in range(n_columns):
        if i % 500 == 0:
            print(f'{i:5d} columns calculated')
        # Positional access keeps this correct even when column names repeat.
        column = X_train.iloc[:, i].to_numpy().reshape(-1, 1)
        scores[i] = silhouette_score(column, labels)

    sil_scores = pd.DataFrame({"silhouette_score": scores}, index=X_train.columns)
    sil_scores.to_csv(f'./feature_silhouette_scores(var_{variance_threshold}).csv')
    return sil_scores, variable_features

In [10]:
# Read the training labels and the 'rpkm' table (presumably the expression matrix —
# confirm against the file's producer) and release the HDF5 handle afterwards.
# mode='r' keeps the file untouched and fails loudly when the path is wrong, instead
# of the default append mode silently creating an empty store.
with pd.HDFStore('train_data.h5', mode='r') as train:
    y_train = train['labels']
    X_train = train['rpkm']

# Reorder the feature rows to follow the label index so samples and labels line up.
idx1 = y_train.index
X_train = X_train.loc[idx1, :]

In [11]:
# Score every column whose variance exceeds 0.01.
# car_cols receives the boolean variance mask over all columns of X_train;
# sil_scores holds one silhouette score per column that passed the filter.
sil_scores, car_cols = feature_silhouette(
    X_train,
    y_train,
    variance_threshold=0.01,
    shuffle_data=True,
)

    0 columns calculated
  500 columns calculated
 1000 columns calculated
 1500 columns calculated


In [12]:
# Start from an empty frame; the next cell attaches the variance mask and the scores as columns.
df = pd.DataFrame()

In [13]:
# Assemble the export table, indexed by the column names of X_train (presumably gene ids).
#   'genes'  – the boolean variance mask from feature_silhouette (True = column was scored)
#   'scores' – the silhouette score, aligned on the index; columns that were filtered
#              out have no score and therefore end up as NaN
df = df.assign(
    genes=car_cols,
    scores=sil_scores['silhouette_score'],
)

In [14]:
# Persist the combined table as comma-separated text (to_csv defaults, so the row index is written too).
df.to_csv('feature_silhouette_scores.txt')