In [1]:
import numpy as np
import pandas as pd
import ms_feature_validation as mfv
import bokeh.plotting
bokeh.plotting.output_notebook()

In [2]:
data = mfv.fileio.read_progenesis("progenesis_data_matrix_20190918.csv")

# adding order and batch information
# Every sample name is split on "_": the first token is used as a date label
# and the second token as the (integer) run order of the sample.
temp = pd.Series(data=data.sample_metadata.index.str.split("_"),
                 index=data.sample_metadata.index)
dates = temp.map(lambda parts: parts[0])
order = temp.map(lambda parts: parts[1]).astype(int)
# dates are numbered 1, 2, ... in order of first appearance
dates_to_batch = {date: number
                  for number, date in enumerate(dates.unique(), start=1)}
batch = dates.map(dates_to_batch).astype(int)

def convert_to_global_run_order(order, batch):
    """
    Convert a per-batch run order into one run order spanning all batches.

    Every batch is shifted by the sum of the largest run orders of all the
    batches that sort before it, so run orders never overlap between batches.
    The first batch is left unshifted.

    Parameters
    ----------
    order : pd.Series
        Run order of each sample inside its own batch.
    batch : pd.Series
        Batch label of each sample, sharing the index of `order`. Batches are
        taken in ascending label order; labels do not need to be contiguous.

    Returns
    -------
    pd.Series
        Global run order, indexed like `order`.
    """
    max_order = order.groupby(batch).max()
    # Exclusive cumulative sum: total of the maxima of all earlier batches.
    # Shifting only by the previous batch's maximum makes batch 3 and later
    # collide with the batches before them.
    offset = max_order.cumsum() - max_order
    global_run_order = order + batch.map(offset)
    return global_run_order

# Store per-sample run order, batch number and sample id on the data container.
# The per-batch order is passed through convert_to_global_run_order before
# being stored.
data.order = convert_to_global_run_order(order, batch)
data.batch = batch
data.id = data.sample_metadata.index

# setup sample types
# Keys are sample types, values are the lists of labels assigned to each type
# (presumably matched against the sample class labels of the data -- confirm).
sample_mapping = {"qc": ["QC d2 v1", "QC d2 v2", "QC d1 v1", "QC d1 v2"],
                 "suitability": ["standards mixture"],
                 "blank": ["solvent blank", "Solvent"],
                 "zero": ["Zero"]}
data.mapping = sample_mapping

In [5]:
# PCA scores plot of the data set. show_order=True is forwarded to the plotting
# helper (presumably to visualise the run order -- confirm). The trailing ";"
# suppresses the textual repr of the returned object.
data.plot.pca_scores(show_order=True);

In [6]:
# PCA loadings plot of the data set, drawn with the helper's default options.
data.plot.pca_loadings()

In [7]:
# Take the first feature returned by select_features for the pair (203.0821, 128).
# NOTE(review): presumably an m/z value and a retention time for tryptophan
# ("trp") -- confirm the units and the matching tolerances used by the helper.
trp = data.select_features(203.0821, 128)[0]
# Plot the selected feature, with color_by="type" and a marker size of 10.
data.plot.feature(trp, color_by="type", scatter_params={"size": 10})

In [9]:
%%time
# Generate synthetic scores for six different categories. A locally seeded
# generator is used so that Restart & Run All reproduces the same figure
# without touching the global numpy random state.
cats = list("abcdef")
n_samples = 2000
rng = np.random.RandomState(0)
yy = rng.randn(n_samples)
g = rng.choice(cats, n_samples)
# shift the categories by 0, 0, 1, 1, 2, 2 so that their distributions differ
for i, label in enumerate(cats):
    yy[g == label] += i // 2
df = pd.DataFrame(dict(score=yy, group=g))

# find the quartiles and IQR for each category
groups = df.groupby('group')
q1 = groups.quantile(q=0.25)
q2 = groups.quantile(q=0.5)
q3 = groups.quantile(q=0.75)
iqr = q3 - q1
# fences at 1.5 * IQR beyond the first and third quartiles
upper = q3 + 1.5*iqr
lower = q1 - 1.5*iqr

# find the outliers for each category
def outliers(group):
    """Return the scores of `group` that fall outside its category's limits.

    `group.name` selects the row of the module-level `upper` and `lower`
    frames that holds the limits; scores strictly above the upper limit or
    strictly below the lower limit are returned.
    """
    cat = group.name
    upper_limit = upper.loc[cat, 'score']
    lower_limit = lower.loc[cat, 'score']
    scores = group.score
    outside = (scores > upper_limit) | (scores < lower_limit)
    return group[outside]['score']
out = groups.apply(outliers).dropna()

# prepare outlier data for plotting, we need coordinates for every outlier.
if not out.empty:
    # each index entry is a (category, row label) pair: the category is the
    # x coordinate and the score stored under that pair is the y coordinate
    outx = [category for category, _ in out.index]
    outy = [out.loc[category].loc[row] for category, row in out.index]

# Box plot of the score distribution of every category.
p = bokeh.plotting.figure(background_fill_color="#efefef",
                          x_range=cats)

# if no outliers, shrink lengths of stems to be no longer than the minimums or maximums
qmin = groups.quantile(q=0.00)
qmax = groups.quantile(q=1.00)
upper["score"] = np.minimum(upper["score"], qmax["score"])
lower["score"] = np.maximum(lower["score"], qmin["score"])

# stems
p.segment(x0=cats, y0=upper.score, x1=cats, y1=q3.score, line_color="black")
p.segment(x0=cats, y0=lower.score, x1=cats, y1=q1.score, line_color="black")

# boxes
p.vbar(x=cats, width=0.7, top=q2.score, bottom=q3.score,
       fill_color="#E08E79", line_color="black")
p.vbar(x=cats, width=0.7, top=q1.score, bottom=q2.score,
       fill_color="#3B8686", line_color="black")

# whiskers (almost-0 height rects simpler than segments)
p.rect(x=cats, y=lower.score, width=0.2, height=0.01, line_color="black")
p.rect(x=cats, y=upper.score, width=0.2, height=0.01, line_color="black")

# outliers
# scatter() draws circle markers by default; circle() with a `size` argument
# is deprecated in recent Bokeh releases.
if not out.empty:
    p.scatter(outx, outy, size=6, color="#F38630", fill_alpha=0.6)

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = "white"
p.grid.grid_line_width = 2
p.xaxis.major_label_text_font_size = "12pt"
bokeh.plotting.show(p)

CPU times: user 105 ms, sys: 0 ns, total: 105 ms
Wall time: 106 ms


In [8]:
# Display the feature metadata table of the loaded data set as the cell output.
data.feature_metadata

Unnamed: 0_level_0,Neutral mass (Da),mz,Charge,rt,Chromatographic peak width (min),Identifications,Max Fold Change,Highest Mean,Lowest Mean,Isotope Distribution,Maximum Abundance,Minimum CV%
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7.07_663.1141m/z,,663.114121,1,423.948,0.101483,0,Infinity,QC d1 v2,Zero,100 - 19.2,20.885235,79.47
7.07_514.1940m/z,,514.193963,1,423.948,0.050733,0,Infinity,QC d1 v1,Zero,100 - 69.2,21.568750,24.70
7.07_677.3182m/z,,677.318163,1,423.948,0.084567,0,Infinity,QC d1 v1,Zero,100 - 27,38.728473,11.85
7.07_849.5030m/z,,849.503049,1,423.948,0.067650,0,Infinity,QC d1 v2,Zero,100 - 12.4,23.914494,20.65
7.07_337.1398m/z,,337.139812,1,423.948,0.118400,0,Infinity,QC d2 v1,Zero,100,82.486405,6.46
...,...,...,...,...,...,...,...,...,...,...,...,...
14.99_180.8400m/z,,180.840037,1,899.319,4.077067,0,1.83838830582109,QC d2 v1,QC d1 v1,100,1530.375243,2.93
14.64_162.9211m/z,,162.921079,1,878.336,0.270600,0,1.81145489822758,QC d2 v1,QC d1 v1,100 - 3.97,247.951621,3.89
15.38_901.5212m/z,,901.521230,2,922.659,0.321350,0,1.79242942953014,QC d2 v2,QC d1 v1,100 - 6.02,702.042566,13.79
14.04_302.2425n,302.242496,337.233515,1,842.132,0.936183,0,1.75803337976152,QC d2 v1,QC d1 v2,100 - 10.5,1839.201053,3.02
