In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# LabelEncoder is imported from the public package: the private
# `sklearn.preprocessing.label` module no longer exists in current scikit-learn.
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import scale

In [None]:
# Load the aggregated feature table and prune redundant feature columns.
# NOTE(review): absolute, machine-specific path -- adjust it to wherever the
# replication data lives on your machine.
DF_AGG_CSV = 'C:/Users/david/Desktop/replication-data/df_agg.csv'
df_raw = pd.read_csv(DF_AGG_CSV)

# A column is dropped when its name starts with one of these prefixes ...
DROP_PREFIXES = (
    'pcna',          # only use one label's features for now
)
# ... or contains one of these substrings.
DROP_SUBSTRINGS = (
    'edu_pcna',      # only use one label's features for now
    'dist_pcna',
    'label',         # label columns (not informative)
    'bbox',          # bbox columns (also covered by extent)
    'convex',        # "convex" columns (also covered by solidity)
    'filled_area',   # "filled area" (same as area except for 1/2 samples)
    # 'axis',        # major/minor axis columns -- currently kept
    # 'diameter',    # equivalent diameter columns -- currently kept
)
# Nearest-neighbour ("nn") columns could be excluded the same way (e.g. the
# substrings '1' and '3'); that filter is currently disabled.

keep_column = [
    not (c.startswith(DROP_PREFIXES) or any(s in c for s in DROP_SUBSTRINGS))
    for c in df_raw.columns
]
df_agg = df_raw.loc[:, keep_column]

# Keep only normalized intensities, except for EDT: a 'mean' column is dropped
# whenever its 'mean_norm' counterpart is still present after the pruning above.
remaining_columns = set(df_agg.columns)
keep_column = [
    'mean' not in c or c.replace('mean', 'mean_norm') not in remaining_columns
    for c in df_agg.columns
]
df_agg = df_agg.loc[:, keep_column]

df_agg.columns

In [None]:
# Build the feature matrix / label vector and visualise the samples with t-SNE.
# NOTE(review): assumes the first two columns of df_agg are non-feature
# metadata (one of them being 'stage') -- confirm against the CSV layout.
feature_frame = df_agg.iloc[:, 2:]
xs = feature_frame.values
ys = df_agg['stage'].values
feature_names = np.array(feature_frame.columns)

# make numeric labels
ys_num = LabelEncoder().fit_transform(ys)

# impute missing values, then standardise every feature
xs_imputed = scale(SimpleImputer().fit_transform(xs))

# SimpleImputer silently drops columns that contain no observed values, which
# would misalign the matrix with feature_names; fail loudly if that happens.
assert xs_imputed.shape[1] == len(feature_names), \
    'imputation changed the number of feature columns'

# Fixed seed so the embedding is reproducible across kernel restarts.
ts = TSNE(perplexity=20, random_state=0).fit_transform(xs_imputed)

fig, ax = plt.subplots(figsize=(7, 7))
# x/y must be passed by keyword: recent seaborn releases reject positional data vectors.
sns.scatterplot(x=ts[:, 0], y=ts[:, 1], hue=ys, ax=ax)
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_title('t-SNE visualization')
plt.show()

In [None]:
# Classifier under evaluation; the commented-out lines are drop-in alternatives.
clf = LogisticRegression()
#clf = RandomForestClassifier(n_estimators=10)
#clf = DecisionTreeClassifier()

# Cross-validate with cross_val_score's default splitting and scoring.
cv = cross_val_score(clf, xs_imputed, ys_num)
print('Accuracy:', cv.mean(), '\tReplicates:', cv)

def one_off_scorer(estimator, X, y):
    """Score an estimator by the share of predictions within one label of the truth.

    Parameters
    ----------
    estimator : object exposing ``predict(X)``.
    X : samples handed to ``estimator.predict``.
    y : true numeric labels, one per sample in ``X``.

    Returns
    -------
    Fraction of samples for which ``|prediction - y| <= 1``.

    NOTE(review): only meaningful if consecutive label codes correspond to
    neighbouring classes -- confirm the label encoding is ordered that way.
    """
    label_error = np.abs(estimator.predict(X) - y)
    within_one = label_error <= 1
    return within_one.sum() / len(y)

# Same cross-validation, but scored with the tolerant "one-off" metric
# instead of the default score.
cv = cross_val_score(clf, xs_imputed, ys_num, scoring=one_off_scorer)
print('Accuracy (One-off):', cv.mean(), '\tReplicates:', cv)

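The feature-importance analysis below follows the scikit-learn example [Permutation Importance with Multicollinear or Correlated Features](https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-multicollinear-py).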

In [None]:
# Permutation feature importance of a logistic-regression model.
# The model is fitted on the complete data set; a held-out split is left disabled:
#X_train, X_test, y_train, y_test = train_test_split(xs_imputed, ys_num)

clf = LogisticRegression()
clf.fit(xs_imputed, ys_num)

# NOTE: importances are measured on the same samples the model was fitted on,
# so they show what the fitted model relies on rather than what generalises.
# Fixed seed so the shuffles (and therefore the ranking and plot) are reproducible.
result = permutation_importance(clf, xs_imputed, ys_num, n_repeats=25,
                                random_state=0)

# Ascending order: the most important feature ends up at the top of the plot.
perm_sorted_idx = result.importances_mean.argsort()

fig, ax = plt.subplots(figsize=(6, 10))
ax.set_title('Feature Importance')
ax.boxplot(result.importances[perm_sorted_idx].T, vert=False,
           labels=feature_names[perm_sorted_idx])
fig.tight_layout()
plt.show()


In [None]:
# Feature names ordered from most to least important (perm_sorted_idx is ascending).
list(feature_names[perm_sorted_idx][::-1])

In [None]:
def _boxplot_by_stage(frame, column, title):
    """Draw a box plot of `column` grouped by 'stage' on its own 6x4 figure.

    Parameters
    ----------
    frame : DataFrame holding a 'stage' column and `column`.
    column : name of the feature column plotted on the y axis.
    title : figure title.

    Returns
    -------
    The matplotlib Axes the box plot was drawn on.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.set_title(title)
    sns.boxplot(data=frame, x='stage', y=column, ax=ax)
    return ax


# Sort once so the stages appear in the same order in every figure.
df_by_stage = df_agg.sort_values(by=['stage'])

# (feature column, figure title) pairs, plotted in this order.
STAGE_BOXPLOTS = [
    ('edu_edt_mean_median', 'Distance from border'),
    ('edu_solidity_median', 'Solidity'),
    ('edu_edu_mean_norm_median', 'Normalized EdU intensity'),
    ('edu_num_sites', 'Number of replication sites'),
    ('edu_area_median', 'Size of EdU sites'),
]

for column, title in STAGE_BOXPLOTS:
    _boxplot_by_stage(df_by_stage, column, title)
plt.show()