In [None]:
import glob
import pickle
from functools import partial

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import peakutils
import stumpy
from joblib import delayed, Parallel
from scipy.signal import savgol_filter
from tslearn.clustering import KShape
from tslearn.clustering import KernelKMeans
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
%matplotlib widget

In [None]:
# Absolute location of the pickled feature table; every later cell reads it through `df`.
path = "/share/data/temp/athira/July17_features_combined_noLightStimuli.pickle"
# NOTE(review): unpickling can execute code stored in the file - only load pickles from a trusted source.
df = pd.read_pickle(path)

In [None]:
def get_curvature_data(f, source = None):
    """
    Extract the curvature time series of one recording.

    Parameters
    ----------
    f :
        Value matched against the "filename" column.
    source : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level `df`.

    Returns
    -------
    pandas.DataFrame
        The rows whose "filename" equals `f`, restricted to the columns whose
        name contains "curv". The result is an independent copy, so callers
        may modify it without touching (or getting warnings about) `source`.
    """
    frame = df if source is None else source
    curv_cols = [c for c in frame.columns if "curv" in c]
    # Select rows and columns in a single .loc call instead of chaining two
    # indexing steps, then detach the result from the parent frame.
    return frame.loc[frame["filename"] == f, curv_cols].copy()

In [None]:
def do_work(filename, window = 150, smooth = 10):
    """
    Extract motif windows from the curvature data of one recording.

    Parameters
    ----------
    filename :
        Recording identifier passed to `get_curvature_data`.
    window : int
        Subsequence length given to `stumpy.mstump`; also the minimum distance
        between detected peaks and the length of every returned motif.
    smooth : int
        Width of the rolling mean applied to each column. The first `smooth`
        rows are discarded afterwards.

    Returns
    -------
    list or tuple
        `[filename, motifs]` on success, where `motifs` stacks one
        `(window, n_columns)` slice per detected peak.
        `(filename, error message)` if anything raises. Callers tell the two
        apart by type (list vs. tuple), so the distinction must be kept.
    """
    try:
        raw = get_curvature_data(filename)
        # Smooth all columns at once into a new frame rather than assigning
        # column by column into a slice of the notebook-level frame.
        data = raw.rolling(smooth).mean().iloc[smooth:]
        mp, _ = stumpy.mstump(data, m = window)
        # NOTE(review): mp[:, -1] assumes the profile is laid out as
        # (subsequence, dimension); some stumpy versions return
        # (dimension, subsequence) - confirm against the installed version.
        # Peaks of 1 - profile, i.e. positions where the profile value is low.
        peaks = peakutils.indexes(1-mp[:,-1], min_dist = window, thres = 0.8)
        # np.stack raises when no peak is found; that lands in the except
        # branch and the file is reported as failed.
        motifs = np.stack([data.values[peak:peak+window, :] for peak in peaks])
        return [filename, motifs]
    except Exception as e:
        # Deliberate best effort: one bad recording must not abort the batch.
        return (filename, str(e))

In [None]:
# Distinct values of the "filename" column; each one becomes a separate do_work call.
to_do = df["filename"].unique()

In [None]:
# Motif length handed to do_work (150, or 30 for window=1sec).
window = 150

# Memory warning: 20 workers use up about 90gb of RAM. Don't use much more.
motifs = Parallel(n_jobs = 20, verbose = 5)(
    delayed(do_work)(f, window = window) for f in to_do
)


In [None]:
# do_work returns a list on success and a tuple on failure; keep the successes only.
true_motifs = list(filter(lambda entry: type(entry) == list, motifs))

In [None]:
# Number of entries that passed the list-type filter.
len(true_motifs)

In [None]:
# Persist the filtered results to motifs.pkl in the current working directory.
with open("motifs.pkl", "wb") as p:
    pickle.dump(true_motifs, p)

In [None]:
# From here on `motifs` holds only the filtered entries; the second name is cleared.
motifs = true_motifs
true_motifs = None

In [None]:
# Shape of the motif array stored in the first entry.
motifs[0][1].shape

In [None]:
# First entry of motifs.
# NOTE(review): no later visible cell reads `m` - presumably kept for interactive inspection.
m = motifs[0]

In [None]:
# One row per motif: the recording it came from and the motif array itself.
# The flat record list is turned into a frame once, instead of building a small
# frame per recording and concatenating them (which also fails on an empty list).
motif_df = pd.DataFrame(
    [[entry[0], motif] for entry in motifs for motif in entry[1]],
    columns = ["filename", "motifs"],
)

In [None]:
# Stack all motif arrays into a single array and rescale it with TimeSeriesScalerMeanVariance.
# np.stack requires every motif array to have the same shape.
motifs_scaled = TimeSeriesScalerMeanVariance().fit_transform(np.stack(motif_df["motifs"].values))

In [None]:
# Cluster the scaled motifs into 15 groups using DTW as the distance.
# random_state is fixed so that labels (and every figure derived from them)
# are the same on each Restart & Run All; change the seed to probe stability.
model = TimeSeriesKMeans(n_jobs = 30, n_clusters = 15, metric = "dtw", random_state = 0)
model.fit(motifs_scaled)

In [None]:
# Shape of the label array produced by the fitted model.
model.labels_.shape

In [None]:
# Store the fitted model's cluster labels as a column of the motif table.
motif_df["labels"] = model.labels_

In [None]:
# Save the labelled motif table to HDF5 under the key "data".
motif_df.to_hdf("motifs_150_meanvarnorm_rollingmean_labeled.hdf5", key = "data")

In [None]:
def find_drug(filename):
    """
    Return the "drug" value of the first row of `df` whose "filename" equals
    `filename`. Raises IndexError when no row matches.
    """
    matching = df.loc[df["filename"] == filename, "drug"]
    return matching.values[0]

In [None]:
import tqdm

In [None]:
# Map every motif's filename to its drug.
# The lookup table is built once from the first row of each filename (the same
# row the per-motif `.values[0]` scan picked), so the full frame is no longer
# scanned once per motif. An unknown filename still fails loudly (KeyError).
first_rows = df.drop_duplicates("filename")
drug_by_file = dict(zip(first_rows["filename"], first_rows["drug"]))
drugs_col = [drug_by_file[f] for f in motif_df["filename"]]
    

In [None]:
# Add the per-motif drug values collected in drugs_col as a column.
motif_df["drugs"] = drugs_col

In [None]:
# Write the table, now including the drugs column, to the HDF5 file under the key "data".
motif_df.to_hdf("motifs_150_meanvarnorm_rollingmean_labeled.hdf5", key = "data")

In [None]:
# Read the labelled motif table back from the HDF5 file.
motif_df = pd.read_hdf("motifs_150_meanvarnorm_rollingmean_labeled.hdf5")

In [None]:
# Column names of the motif table.
motif_df.columns

In [None]:
# Motif counts per cluster label (rows) and drug (columns).
# Rows are aligned on the label value itself: a cluster that never occurs for
# a drug gets an explicit 0 in its own row. Padding a positional list of counts
# with one trailing 0 would shift counts to the wrong label whenever the
# missing cluster is not the last one, and breaks if several are missing.
n_clusters = 15  # number of clusters the labels were produced with
perdrug = (
    pd.crosstab(motif_df["labels"], motif_df["drugs"])
    .reindex(index = range(n_clusters), fill_value = 0)
    # Drop the axis names so plots keep unlabeled axes/legend titles.
    .rename_axis(index = None, columns = None)
)
        
    

In [None]:
# Divide every column by its own total so that the entries of each column sum to 100.
perdrug_percentage = (perdrug / perdrug.sum(axis = 0))*100

In [None]:
# Close every open matplotlib figure before new ones are created.
plt.close("all")

In [None]:
# Stacked bar chart of perdrug_percentage: one bar per column, one segment per row.
fig, ax = plt.subplots(constrained_layout = True, figsize = (8, 4))
perdrug_percentage.transpose().plot(ax = ax, kind = "bar", stacked = True, cmap = "tab20")
legend = ax.legend(ncol = 2, bbox_to_anchor = (1., 1.))
ax.set(title = "window_150_rollingmean_15clusters", ylabel = "Percentage represented")
fig.savefig("percentages_window150_rollingmean_15clusters.png", dpi = 300)

In [None]:
# Stacked bar chart of the raw counts in perdrug: one bar per column, one segment per row.
fig, ax = plt.subplots(figsize = (8, 4), constrained_layout = True)
perdrug.T.plot(kind = "bar", stacked = True, cmap = "tab20", ax = ax)
legend = ax.legend(bbox_to_anchor = (1., 1.), ncol = 2)
ax.set_title("window_150_rollingmean_15clusters")
# This figure shows counts, not percentages, so the axis is labelled accordingly.
ax.set_ylabel("Number of motifs")
fig.savefig("absolute_window150_rollingmean_15clusters.png", dpi = 300)

In [None]:
# Same raw-count chart, leaving out any column whose name is "none" (case-insensitive).
fig, ax = plt.subplots(figsize = (8, 4), constrained_layout = True)
perdrug[[c for c in perdrug.columns if c.lower() != "none"]].T.plot(kind = "bar", stacked = True, cmap = "tab20", ax = ax)
legend = ax.legend(bbox_to_anchor = (1., 1.), ncol = 2)
ax.set_title("window_150_rollingmean_15clusters")
# This figure shows counts, not percentages, so the axis is labelled accordingly.
ax.set_ylabel("Number of motifs")
fig.savefig("absolute_nonone_window150_rollingmean_15clusters.png", dpi = 300)

In [None]:
import seaborn as sns

In [None]:
# Number of motifs per drug, split by cluster label (side-by-side bars).
# seaborn's barplot has no `stacked` option - the keyword is forwarded to
# matplotlib's bar(), which rejects it - and barplot would plot the *mean*
# label value rather than counts. countplot tallies the rows instead.
# NOTE(review): if a stacked layout is required, plot the `perdrug` count
# table with pandas (kind="bar", stacked=True) instead.
fig, ax = plt.subplots()
bars = sns.countplot(x = "drugs", hue = "labels", data = motif_df, ax = ax)

In [None]:
# Look up the drug for every motif's filename with find_drug, spread over 30 joblib workers.
# NOTE(review): this yields the same kind of list as the serial lookup that fills
# drugs_col earlier in the notebook - presumably an alternative; confirm both are needed.
drugs_col = Parallel(n_jobs = 30, verbose = 5)(delayed(find_drug)(f) for f in motif_df["filename"])

In [None]:
# Number of drug values returned by the lookup.
len(drugs_col)

In [None]:
# Store the looked-up drug values as the drugs column of the motif table.
motif_df["drugs"] = drugs_col