In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import scale

# set matplotlib's 'pdf.fonttype' option to 42 for all figures saved from this notebook
plt.rcParams.update({'pdf.fonttype': 42})

In [None]:
# load the feature table from CSV and order its rows by the 'stage' column
df_agg = (
    pd.read_csv('/scratch/hoerl/replication-data/df_agg.csv')
    .sort_values(by=['stage'])
)
df_agg.head()

**Alternative:** aggregate raw features (+ volume threshold)

In [None]:
# read the raw table, using the first CSV column as the index
df_edu = pd.read_csv('/scratch/hoerl/replication-data/df_edu.csv', index_col=0)
# keep only rows with edu_area > 200 (should correspond roughly to PSF size on confocal)
df_edu = df_edu.loc[df_edu['edu_area'] > 200]

# make DataFrame
def quantile75(d):
    """Return the 75th percentile (upper quartile) of the values in ``d``."""
    return np.percentile(d, 75)
def quantile25(d):
    """Return the 25th percentile (lower quartile) of the values in ``d``."""
    return np.percentile(d, 25)

# aggregations applied to every feature column within each group
agg_funs = ['median', 'count', 'std']
#agg_funs = ['median', 'mean', 'count', 'std', quantile25, quantile75]

# calculate median, stddev, count for all features per (file, stage, chase_dur) group
df_agg_edu = df_edu.groupby(['file', 'stage', 'chase_dur']).agg(agg_funs)

# collapse the (feature, aggregation) column MultiIndex into flat '<feature>_<aggregation>' names
df_agg_edu.columns = ['_'.join(c) for c in df_agg_edu.columns.to_flat_index()]

# copy one count column, call it num_sites
df_agg_edu['edu_num_sites'] = df_agg_edu['edu_area_count']

# remove all redundant count columns
# (`columns=` instead of a positional axis argument, which pandas 2.x no longer accepts)
count_columns = [c for c in df_agg_edu.columns if c.endswith('count')]
df_agg_edu = df_agg_edu.drop(columns=count_columns)

# turn the group keys back into regular columns and order rows by stage
df_agg = df_agg_edu.reset_index()
df_agg = df_agg.sort_values(by=['stage'])
df_agg.head()

In [None]:
# Column pruning: a column is removed when its name starts with one of the prefixes
# or contains one of the substrings listed below.
dropped_prefixes = ('pcna',)       # only use one label features for now
dropped_substrings = (
    'edu_pcna', 'dist_pcna',       # only use one label features for now
    'label',                       # label columns (not informative)
    'bbox',                        # bbox columns (also covered by extent)
    'convex',                      # "convex" columns (also covered by solidity)
    'filled_area',                 # "filled area" columns (is the same as area except for 1/2 samples)
    'euler',                       # "euler number" columns
    # 'diameter',                  # "equivalent diameter" columns (currently kept)
)
keep_mask = [
    not c.startswith(dropped_prefixes) and not any(s in c for s in dropped_substrings)
    for c in df_agg.columns
]
df_agg = df_agg.loc[:, keep_mask]

# keep only normalized intensities, except for EDT: a column containing 'mean' is dropped
# when the same name with 'mean' replaced by 'mean_norm' is also present
remaining_columns = df_agg.columns
keep_mask = [
    'mean' not in c or c.replace('mean', 'mean_norm') not in remaining_columns
    for c in remaining_columns
]
df_agg = df_agg.loc[:, keep_mask]

# ignore some nn columns
# nn_to_ignore = ['1', '3']
# for nn in nn_to_ignore:
#     df_agg = df_agg.iloc[:, [not nn in c for c in df_agg.columns]]

len(df_agg.columns), df_agg.columns

In [None]:
# modified from calmutils.localization.util

def get_ellipse_params(cov, n_sdev=1):
    """
    get ellipse parameters for matplotlib for an ellipse representing the
    n_sdev-standard-deviation contour of a 2d-Gaussian

    Parameters
    ----------
    cov: np-array
        2x2 covariance matrix (symmetric)
    n_sdev: float
        number of s.d.s at which to draw ellipse

    Returns
    -------
    a: float
        full length of the major axis (2 * n_sdev * sqrt(largest eigenvalue));
        use as the ``width`` of a matplotlib Ellipse
    b: float
        full length of the minor axis (2 * n_sdev * sqrt(smallest eigenvalue));
        use as the ``height`` of a matplotlib Ellipse
    alpha: float in (-180, 180]
        counterclockwise angle between the x-axis and the major axis, in degrees.
        The sign of an eigenvector is arbitrary, so alpha is only meaningful
        modulo 180 (which describes the same ellipse).
    """
    # A covariance matrix is symmetric, so use the symmetric solver: it always
    # returns real eigenvalues (ascending) and orthonormal eigenvectors.
    w, v = np.linalg.eigh(np.asarray(cov, dtype=float))

    # put the major axis first so the meaning of (a, b, alpha) is deterministic
    w, v = w[::-1], v[:, ::-1]

    # round-off can make a zero eigenvalue slightly negative; avoid NaN from sqrt
    w = np.clip(w, 0, None)

    lens = 2 * n_sdev * np.sqrt(w)
    alpha = _deg_angle(v[:, 0])
    return (lens[0], lens[1], alpha)


def _deg_angle(a):
    """
    angle between vector a and x-axis, in degrees
    """
    return 180 * np.arctan2(a[1], a[0]) / np.pi

In [None]:
from matplotlib.patches import Ellipse

# feature matrix: every column from the third one onwards; class labels: the 'stage' column
# NOTE(review): if df_agg comes from the "Alternative" aggregation cell, its first three
# columns are file, stage, chase_dur, so 'chase_dur' is part of the feature matrix here.
# Confirm that this is intended.
feature_frame = df_agg.iloc[:, 2:]
xs = feature_frame.values
ys = df_agg['stage'].values
feature_names = np.array(feature_frame.columns)

# make numeric labels
ys_num = LabelEncoder().fit_transform(ys)

# impute missing values, then standardize each feature
xs_imputed = SimpleImputer().fit_transform(xs)
# the imputer discards columns without any observed value; make sure names still line up
assert xs_imputed.shape[1] == len(feature_names), \
    'SimpleImputer dropped columns; feature_names no longer matches xs_imputed'
xs_imputed = scale(xs_imputed)

# circular color palette
cmap = sns.color_palette("husl", 7)[0:5]


# how often to repeat tSNE (each replicate uses a freshly drawn random seed r)
n_replicates = 1
for r in np.random.randint(0, 25000, n_replicates):
    # NOTE: scikit-learn >= 1.5 renames `n_iter` to `max_iter`
    ts = TSNE(n_iter=2000, perplexity=50, random_state=r).fit_transform(xs_imputed)
    #ts = PCA().fit_transform(xs_imputed)
    fig, ax = plt.subplots(figsize=(7, 7))

    # x/y passed by keyword: recent seaborn versions reject positional data vectors
    sns.scatterplot(x=ts[:, 0], y=ts[:, 1], hue=ys, s=100, palette=cmap, ax=ax)

    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    ax.set_title('t-SNE visualization')

    # draw ellipses around each class
    for i, yi in enumerate(np.unique(ys_num)):
        t = ts[ys_num == yi]
        if t.shape[0] < 2:
            # a covariance cannot be estimated from a single point
            continue
        a, b, alpha = get_ellipse_params(np.cov(t, rowvar=False), 1.5)
        # angle passed by keyword: it is keyword-only in recent matplotlib versions
        ell = Ellipse(np.mean(t, axis=0), a, b, angle=alpha,
                      fill=False, color=cmap[i], ls='dashed', lw=2)
        ax.add_artist(ell)

#     plt.savefig('/scratch/hoerl/replication-data/tsne_replicates/df_agg_tsne{}.pdf'.format(r))

In [None]:
# train classifier on features (alternative estimators are kept commented out)
# NOTE(review): xs_imputed was imputed and scaled on the full data set before
# cross-validation, so the preprocessing has seen the validation folds.

#clf = RandomForestClassifier(n_estimators=10)
#clf = DecisionTreeClassifier()
clf = LogisticRegression()

# cross-validated score with the estimator's default scorer
cv = cross_val_score(estimator=clf, X=xs_imputed, y=ys_num)
print('Accuracy:', cv.mean(), '\tReplicates:', cv)

def one_off_scorer(estimator, X, y):
    """
    Fraction of samples whose predicted numeric label differs from the true
    label by at most 1.

    Signature follows the (estimator, X, y) callable form accepted by
    scikit-learn's ``scoring`` argument.
    """
    deviation = np.abs(estimator.predict(X) - y)
    within_one = (deviation <= 1).sum()
    return within_one / len(y)

# same cross-validation, scored with the "at most one label off" criterion
cv = cross_val_score(estimator=clf, X=xs_imputed, y=ys_num, scoring=one_off_scorer)
print('Accuracy (One-off):', cv.mean(), '\tReplicates:', cv)

Get feature importance following the [scikit-learn example on permutation importance with multicollinear features](https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-multicollinear-py).

In [None]:
# NOTE(review): the split below is not used further down in this cell; the classifier is
# fitted and its permutation importance evaluated on the full data set.
X_train, X_test, y_train, y_test = train_test_split(xs_imputed, ys_num)

# fit() returns the estimator itself
clf = LogisticRegression().fit(xs_imputed, ys_num)

# permute each feature 25 times and order features by their mean importance (ascending)
result = permutation_importance(clf, xs_imputed, ys_num, n_repeats=25)
perm_sorted_idx = np.argsort(result.importances_mean)

# one horizontal box per feature, showing the spread over the repeats
fig = plt.figure(figsize=(6, 10))
plt.title('Feature Importance')
plt.boxplot(
    result.importances[perm_sorted_idx].T,
    vert=False,
    labels=feature_names[perm_sorted_idx],
)
fig.tight_layout()
plt.show()



In [None]:
# boxplot some features

# (figure title, df_agg column) for each hand-picked feature
selected_features = [
    ('Distance from border', 'edu_edt_mean_median'),
    ('Solidity', 'edu_solidity_median'),
    ('Normalized EdU intensity', 'edu_edu_mean_norm_median'),
    ('Number of replication sites', 'edu_num_sites'),
    ('Size of EdU sites', 'edu_area_median'),
]

# sort once instead of once per figure
df_by_stage = df_agg.sort_values(by=['stage'])

# one figure per feature: distribution of the feature within each stage
for title, column in selected_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df_by_stage, x='stage', y=column, ax=ax)
    ax.set_title(title)

In [None]:
# boxplot all features

import warnings

with warnings.catch_warnings():
    # silence warnings inside this block (ignore "too many plots" warning)
    warnings.simplefilter('ignore')
    df_by_stage = df_agg.sort_values(by=['stage'])
    # walk the features from highest to lowest mean permutation importance
    for feat in feature_names[perm_sorted_idx][::-1]:
        plt.figure(figsize=(6,4))
        plt.title(feat)
        sns.boxplot(data=df_by_stage, x='stage', y=feat, notch=True)