In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Append the parent of the current working directory to the import path,
# presumably so the `src` package imported on the next line resolves when the
# notebook is started from a sub-directory of the project.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.utils import load_config, seed_everything, calculate_cv_per_class, calculate_d_ratio, calculate_detection_rate, correct_blank

In [2]:
# Fixed seed, for reproducibility.
seed_everything(42)

# Project configuration; this notebook reads its 'paths', 'mapping',
# 'sns_params' and 'plt_params' sections.
config = load_config("../configs/config.yaml")

# Seaborn theme. `sns.set` is only a legacy alias of `sns.set_theme` and may be
# removed in a future seaborn release, so the preferred name is used here.
sns_params = config['sns_params']
sns.set_theme(style=sns_params['style'],
              palette=sns_params['palette'],
              font_scale=sns_params['font_scale'])

# Matplotlib rcParams overrides taken from the same config file.
plt.rcParams.update(config['plt_params'])

In [4]:
# Read the four input tables from the locations listed under config['paths'].
paths = config['paths']
data_mat = pd.read_csv(paths['data_mat_path'])
feat_meta = pd.read_csv(paths['feat_meta_path'])
sample_meta = pd.read_csv(paths['sample_meta_path'])
exog_stand = pd.read_csv(paths['exog_stan_path'])

In [5]:
from tidyms import DataContainer

# Each table is indexed by its identifier column ('sample' or 'feature')
# before being handed to the container; the class mapping comes from config.
data_cont = DataContainer(
    data_matrix=data_mat.set_index('sample'),
    feature_metadata=feat_meta.set_index('feature'),
    sample_metadata=sample_meta.set_index('sample'),
    mapping=config['mapping'],
)



In [17]:
# Count how many entries of the non-robust D-ratio result fall below 1.
dratio_nonrobust = data_cont.metrics.dratio(robust=False)
(dratio_nonrobust < 1).sum()

np.int64(17)

In [14]:
# Sample identifiers of the 'QC' class and of the Dunn / French / LMU classes.
is_qc = sample_meta['class'].isin(['QC'])
is_bio = sample_meta['class'].isin(['Dunn', 'French', 'LMU'])
qc_samples = sample_meta.loc[is_qc, 'sample']
bio_samples = sample_meta.loc[is_bio, 'sample']

# D-ratio table from the project helper, given the data matrix and both sample sets.
d_ratio = calculate_d_ratio(data_mat, qc_samples, bio_samples)
d_ratio

Unnamed: 0,Feature,QC_CV,Bio_CV,D_Ratio
0,FT-000,0.475538,0.107050,0.970626
1,FT-001,0.407507,0.101771,0.972748
2,FT-002,1.032250,0.281652,0.856159
3,FT-003,0.335532,0.099530,0.965852
4,FT-004,1.056872,0.282348,0.862934
...,...,...,...,...
247,FT-247,1.207309,0.460630,0.811324
248,FT-248,0.803797,0.199504,0.922324
249,FT-249,1.085948,0.296631,0.866866
250,FT-250,1.052998,0.215077,0.931349


In [19]:
# Count the rows of the helper's table whose D_Ratio is below 0.5.
d_ratio['D_Ratio'].lt(0.5).sum()

np.int64(9)

In [21]:
# Display the container's CV metric; the rendered table has one row per sample
# class and one column per feature.
data_cont.metrics.cv()

Unnamed: 0_level_0,FT-000,FT-001,FT-002,FT-003,FT-004,FT-005,FT-006,FT-007,FT-008,FT-009,...,FT-242,FT-243,FT-244,FT-245,FT-246,FT-247,FT-248,FT-249,FT-250,FT-251
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B,1.643249,1.529699,inf,2.0431,inf,inf,inf,inf,inf,inf,...,inf,3.316625,2.253248,3.316625,inf,inf,2.338963,inf,inf,inf
Dunn,0.083354,0.090343,0.312634,0.091189,0.310834,0.291509,0.31227,0.150186,0.141874,0.133381,...,0.347098,0.202888,0.201829,0.285717,0.383964,0.249434,0.143965,0.242138,0.161023,1.558218
French,0.113385,0.094959,0.214364,0.092923,0.216498,0.172595,0.200124,0.092619,0.091243,0.090788,...,0.279395,0.243327,0.241317,0.352181,0.208867,0.459429,0.130532,0.211294,0.136625,0.269884
LMU,0.11932,0.115566,0.30954,0.110299,0.307217,0.264032,0.293647,0.144113,0.142729,0.136453,...,0.232675,0.262654,0.198125,0.308745,2.713183,0.430967,0.175193,0.2599,0.176652,1.54299
QC,0.475538,0.407507,1.03225,0.335532,1.056872,1.052403,1.058605,1.019034,1.039791,1.040915,...,0.99223,1.009704,1.066782,0.985666,1.531976,1.207309,0.803797,1.085948,1.052998,1.432441
SS,0.320862,0.34879,1.414214,0.263458,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214,...,inf,inf,inf,inf,1.414214,1.414214,inf,inf,inf,1.414214
dQC,0.316362,0.31009,1.14973,0.304755,1.168879,1.152869,1.167743,1.019173,1.052552,1.054757,...,1.077021,1.180627,1.095578,1.118832,3.022084,1.064716,0.82706,1.111774,1.10426,2.782772


In [27]:
# Reload the sample metadata and keep only the rows belonging to batch 1.
sample_meta = (
    pd.read_csv(config['paths']['sample_meta_path'])
    .loc[lambda meta: meta['batch'] == 1]
)

# Reload the data matrix and keep only the rows for those batch-1 samples.
batch_samples = sample_meta['sample'].values.tolist()
data_mat = (
    pd.read_csv(config['paths']['data_mat_path'])
    .loc[lambda mat: mat['sample'].isin(batch_samples)]
)

# Feature metadata is reloaded without any filtering.
feat_meta = pd.read_csv(config['paths']['feat_meta_path'])

In [28]:
from tidyms import DataContainer

# Rebuild the container from the current data_mat / feat_meta / sample_meta.
# The class mapping is read from the config instead of being repeated as a
# literal, so it cannot drift from the value used by the rest of the notebook.
data_cont = DataContainer(
    data_matrix=data_mat.set_index('sample'),
    feature_metadata=feat_meta.set_index('feature'),
    sample_metadata=sample_meta.set_index('sample'),
    mapping=config['mapping'],
)

In [6]:
# Show the class mapping defined in the config (keys: blank / sample / qc).
config['mapping']

{'blank': ['B'], 'sample': ['Dunn', 'French', 'LMU'], 'qc': ['QC', 'dQC']}

In [7]:
# Batch correction is currently disabled (call left commented out):
#data_cont.preprocess.correct_batches(min_qc_dr=0.8, verbose=False)

In [8]:
import tidyms as ms


In [9]:
# Filter that removes the samples of class "B" (the blank class in the mapping).
# NOTE(review): created here but NOT included in `processors` below.
class_filter = ms.filter.ClassRemover(["B"])
# Variation filter with upper bound 0.3 (i.e. 30 %), non-robust estimate.
# Per the tidyms docs this bounds the CV measured in the QC samples — TODO confirm.
vf = ms.filter.VariationFilter(ub=0.3, robust=False)
# Prevalence filter with lower bound 0.7 and detection threshold 0; presumably
# keeps features detected in at least 70 % of the samples — TODO confirm.
pf = ms.filter.PrevalenceFilter(lb=0.7, threshold=0)
# D-ratio filter with upper bound 0.1, robust estimate.
# NOTE(review): created here but NOT included in `processors` below.
drf= ms.filter.DRatioFilter(ub=0.1, robust=True)

# Build and apply the data curation pipeline; only the variation and prevalence
# filters are run, in that order, on `data_cont`.
processors = [vf, pf]
pipeline = ms.filter.Pipeline(processors, verbose=True)
pipeline.process(data_cont)

Applying Variation Filter: 59 features removed, 0 samples removed, Mean CV reduced by 1.21 %.
Applying Prevalence Filter: 0 features removed, 0 samples removed, Mean CV reduced by 0.00 %.


In [29]:
# Keep the container's CV metric table for the inspection cells that follow.
q = data_cont.metrics.cv()

In [30]:
# List the row labels (sample classes) of the CV table.
q.index

Index(['B', 'Dunn', 'French', 'LMU', 'QC', 'SS', 'dQC'], dtype='object', name='class')

In [32]:
# Count the columns whose CV is below 0.3 in every row labelled 'QC' or 'dQC'.
q.loc[q.index.isin(['QC', 'dQC'])].lt(0.3).all().sum()

np.int64(158)