In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Make the parent of the current working directory importable so that the
# `src` package resolves. The membership check keeps this cell idempotent:
# re-running it no longer appends the same entry to sys.path again.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.utils import (
    load_config,
    seed_everything,
    calculate_cv_per_class,
    calculate_d_ratio,
    calculate_detection_rate,
    correct_blank,
    get_low_cv_features,
    get_high_detection_rate_features,
    get_high_mz_features,
)

In [2]:
# Seed via the project helper so repeated runs are reproducible
seed_everything(42)

# Load the notebook settings from the config file
config = load_config("../configs/config.yaml")

# Seaborn theme: style, palette and font scale all come from the config
sns_params = config['sns_params']
sns.set(
    style=sns_params['style'],
    palette=sns_params['palette'],
    font_scale=sns_params['font_scale'],
)

# Apply the Matplotlib rcParams overrides listed in the config
plt.rcParams.update(config['plt_params'])

### Read files 

In [3]:
# Sample metadata, restricted to the rows where batch == 1
sample_meta = pd.read_csv(config['paths']['sample_meta_path'])
in_batch_one = sample_meta['batch'] == 1
sample_meta = sample_meta.loc[in_batch_one]

# Data matrix, keeping only rows whose sample appears in the filtered metadata
data_mat = pd.read_csv(config['paths']['data_mat_path'])
kept_samples = sample_meta['sample'].values.tolist()
data_mat = data_mat.loc[data_mat['sample'].isin(kept_samples)]

# Feature metadata is loaded as-is (no filtering here)
feat_meta = pd.read_csv(config['paths']['feat_meta_path'])

# Feature columns of the data matrix: those whose name starts with 'FT'
ft_columns = [name for name in data_mat.columns if name.startswith('FT')]

### Filters

In [4]:
# Filter 1 -- CV filter with a 0.3 threshold (intended as CV < 0.3;
# the exact comparison is implemented inside get_low_cv_features)
low_cv_features = get_low_cv_features(
    data_mat,
    sample_meta,
    ft_columns,
    threshold=0.3,
)
len(low_cv_features)


193

In [5]:
# Filter 2 -- detection-rate filter with a 0.7 threshold (intended as
# detection rate > 0.7), evaluated for the listed bio_samples groups
high_detection_rate_features = get_high_detection_rate_features(
    data_mat,
    sample_meta,
    ft_columns,
    bio_samples=['LMU', 'French', 'Dunn'],
    threshold=0.7,
)
len(high_detection_rate_features)

252

In [6]:
# Filter 3 -- m/z filter with a threshold of 500 (intended as mz > 500),
# computed from the feature metadata table
features_in_mass_range = get_high_mz_features(
    feat_meta,
    mz_threshold=500,
)
len(features_in_mass_range)

248

In [None]:
# Combine the filters: keep only the features returned by all three of them
pass_features = set(low_cv_features).intersection(
    high_detection_rate_features,
    features_in_mass_range,
)
len(pass_features)