# Manual for metabolomics data filtering using phloemfinder

In [None]:
import numpy as np

from phloemfinder.metabolome_analysis import MetaboliteAnalysis


## Importing the unfiltered data

In [None]:
# Build the analysis object from the unfiltered CSV; feature identifiers are
# read from the "feature_id" column.
# NOTE(review): the CSV path looks like a placeholder — point it at your own file.
negative_mode_set1 = MetaboliteAnalysis(
    metabolome_csv="../.path/to/data/negative_mode_set1_data.csv",
    metabolome_feature_id_col="feature_id")

# Validate the loaded table before any filtering is applied.
negative_mode_set1.validate_input_metabolome_df()

# Baseline: shape[0] (the row count) is reported as the number of features.
print("Number of total features before blank filtering = {0}".format(negative_mode_set1.metabolome.shape[0]))

## Filtering features
There are several filtering options.

### Filter features based on blanks
The first is to remove all features that are present in the blanks. This is a recommended step, because these features are most likely background noise or components from the sample collection buffer.

In [None]:
# Remove every feature detected in the blank samples; blanks are selected via
# the "blank" string passed as blank_sample_contains (presumably matched
# against the sample names — confirm in the phloemfinder documentation).
negative_mode_set1.discard_features_detected_in_blanks(blank_sample_contains="blank")
# Report how many features (rows) remain after this step.
print("number of total features after blank filtering = {0}".format(negative_mode_set1.metabolome.shape[0]))

### Density plots per group
To get an idea of the distribution of your data, you can visualise it with density plots.

In [None]:
# Draw the density plots for the current state of the data (i.e. after
# whichever filtering steps have been run so far).
negative_mode_set1.create_density_plot()

### Filter features by percentile
An optional filtering step is filtering by percentile. With this step, only the most abundant features per genotype will remain.

In [None]:
# Optional step: per-group percentile filter with the cut-off set to 95.
# NOTE(review): the exact meaning of the threshold (what is kept vs. dropped)
# is defined by phloemfinder — confirm before changing the value.
negative_mode_set1.filter_features_per_group_by_percentile(percentile=95)
# Report how many features (rows) remain after this step.
print("number of features after feature filtering by percentile= {0}".format(negative_mode_set1.metabolome.shape[0]))

### Filter unreliable features
Another option is to filter out the unreliable features. This means that you remove all features that are not present in at least n samples of at least 1 genotype.

In [None]:
# Drop unreliable features, using a detection threshold of n = 4
# (nb_times_detected): the minimum number of samples in which a feature
# must be detected to be kept.
negative_mode_set1.filter_out_unreliable_features(nb_times_detected=4)
# Report how many features (rows) remain after this step.
print("number of total features after unreliable feature filtering = {0}".format(negative_mode_set1.metabolome.shape[0]))

## Saving the filtered data
If you are happy with the filtered data, you can save it as a .csv file.

In [None]:
# Write the filtered data to a CSV file in the current working directory.
# The file name appears to record the settings used above
# (n4 = nb_times_detected=4, p95 = percentile=95) — rename it if you change them.
negative_mode_set1.write_clean_metabolome_to_csv(path_of_cleaned_csv="./clean_negative_set1_n4_p95.csv")

## Data visualization
### PCA

In [None]:
# Run a PCA on the filtered data, requesting 10 principal components.
negative_mode_set1.compute_pca_on_metabolites(n_principal_components=10)
# NOTE(review): this bare attribute access is not the last expression of the
# cell, so a notebook will not display its value — presumably intended as a
# check that the PCA has run; confirm, or move it to its own cell / print it.
negative_mode_set1.pca_performed
# Scree plot for the PCA computed above.
negative_mode_set1.create_scree_plot()

In [None]:
# Show the dimensions of the PCA-reduced data (displayed as the cell output).
negative_mode_set1.metabolome_pca_reduced.shape

In [None]:
# Sample score plot with principal component 1 on the x-axis and principal
# component 2 on the y-axis; 'genotype' is passed as the grouping variable.
negative_mode_set1.create_sample_score_plot(
    pc_x_axis=1,
    pc_y_axis=2,
    name_grouping_var='genotype')

### UpSet plot
The current version of this function transforms your data, so be careful if you use it before saving the filtered data or finishing the filtering steps.

In [None]:
# Caution: per the note in this section, this call transforms the data held by
# the object, so run it only after filtering is finished and the CSV is saved.
negative_mode_set1.plot_features_in_upset_plot()