### Analysis with Longitudinal

Create a metadata file that contains only patients with samples from both before and after the abduction (cohorts 1 and 2), so that a paired comparison is possible.

In [None]:
# Restrict the metadata to patients that were sampled in both cohort 1 (pre) and cohort 2 (post)

# Read the full sample metadata
metadata = pd.read_csv(f"{data_dir}/metadata.tsv", sep='\t')

# One row per patient, holding the set of cohort numbers that patient appears in
valid_patients = (
    metadata
    .groupby('Patient_ID')['Cohort_Number']
    .apply(set)
    .reset_index()
)
# Keep the patients whose cohort set is a superset of {1, 2}
covers_both_cohorts = valid_patients['Cohort_Number'].apply(lambda cohorts: cohorts >= {1, 2})
valid_patients = valid_patients[covers_both_cohorts]

# Samples belonging to the retained patients
metadata_pre_post = metadata.loc[metadata['Patient_ID'].isin(valid_patients['Patient_ID'])]

# `#q2:types` row that is placed directly below the header; it declares a type for each listed column.
# Key order matters: it determines the leading column order of the concatenated table.
new_row = {'sample-id': '#q2:types'}
new_row.update(dict.fromkeys(['Patient_ID', 'Stool_Consistency', 'Patient_Sex'], 'categorical'))
new_row.update(dict.fromkeys(['Sample_Day', 'Recovery_Day', 'Cohort_Number'], 'numeric'))

# Single-row frame holding the type declarations
new_row_df = pd.DataFrame([new_row])

# Put the type row first, followed by the filtered samples
metadata_pre_post = pd.concat([new_row_df, metadata_pre_post], ignore_index=True)

# Write the table to disk and read it back to display what was saved
pre_post_path = f"{data_dir}/metadata_pre_post.tsv"
metadata_pre_post.to_csv(pre_post_path, sep='\t', index=False)

pd.read_csv(pre_post_path, sep='\t')

In [None]:
# Attach the per-sample alpha diversity values (Shannon entropy, Faith PD) to the sample metadata

# Per-sample tables exported from the Shannon and Faith PD group-significance results
Shannon_categorical = pd.read_csv(f'{data_dir}/core-metrics-results-bt/shannon-group-significance_exported/metadata.tsv', sep='\t')
FaithPD_categorical = pd.read_csv(f'{data_dir}/core-metrics-results-bt/faith-pd-group-significance_exported/metadata.tsv', sep='\t')
metadata = pd.read_csv(f"{data_dir}/metadata.tsv", sep="\t")

# Columns shared by the metadata and the exported alpha diversity tables (apart from the sample id)
shared_columns = ['Patient_ID', 'Patient_Sex', 'Stool_Consistency']

# Merge both alpha diversity tables into one table with numeric metric columns
categorical = (
    pd.merge(Shannon_categorical, FaithPD_categorical, how='inner',
             on=['Patient_ID', 'id', 'Patient_Sex', 'Stool_Consistency'])
    # Drop the `#q2:types` directive row by its identifier instead of by position, so that a real
    # sample can never be removed if that row is missing or not first
    .loc[lambda df: df['id'] != '#q2:types']
    .sort_values(by="Patient_ID", ascending=True)
    # The directive row made pandas read the metric columns as text; convert them back to numbers
    .assign(
        shannon_entropy=lambda df: pd.to_numeric(df['shannon_entropy'], errors='coerce'),
        faith_pd=lambda df: pd.to_numeric(df['faith_pd'], errors='coerce'),
    )
    # Use the same sample identifier column name as metadata.tsv
    .rename(columns={'id': 'sample-id'})
)

# Full metadata + alpha diversity (left join keeps samples without alpha diversity values)
metadata_alpha = pd.merge(metadata, categorical, how='left', on=['Patient_ID', 'sample-id'] + shared_columns[1:])
metadata_alpha.to_csv(f"{data_dir}/metadata_alpha.tsv", sep="\t", index=False)

# Pre/post metadata (built in the previous cell, including its `#q2:types` row) + alpha diversity
metadata_pre_post_alpha = pd.merge(metadata_pre_post, categorical, how='left', on=['Patient_ID', 'sample-id'] + shared_columns[1:])
metadata_pre_post_alpha.to_csv(f"{data_dir}/metadata_pre_post_alpha.tsv", sep="\t", index=False)

# Read the saved file back to display what was written
pd.read_csv(f"{data_dir}/metadata_pre_post_alpha.tsv", sep='\t')

In [None]:
# Convert the filtered feature table into a FeatureTable[RelativeFrequency];
# this is the table passed via --i-table to `qiime longitudinal pairwise-differences`
! qiime feature-table relative-frequency \
  --i-table $data_dir/table-filtered.qza \
  --o-relative-frequency-table $data_dir/relative-frequency-table.qza

### Testing alpha diversity differences between cohorts pairwise

In [None]:
# Paired comparison of Shannon entropy between cohort 1 and cohort 2 within each patient, grouped by Patient_Sex.
# The `shannon_entropy` column is part of metadata_pre_post_alpha.tsv.
# NOTE(review): because the metric is a metadata column, --i-table is probably not needed here
# (the Faith PD call omits it) — confirm against the q2-longitudinal documentation.
# NOTE(review): `--p-replicate-handling random` presumably resolves replicate samples by random choice,
# so repeated runs may give different results — confirm.
! qiime longitudinal pairwise-differences \
  --i-table $data_dir/relative-frequency-table.qza \
  --m-metadata-file $data_dir/metadata_pre_post_alpha.tsv \
  --p-state-column Cohort_Number \
  --p-state-1 1 \
  --p-state-2 2 \
  --p-group-column Patient_Sex \
  --p-individual-id-column Patient_ID \
  --p-replicate-handling random \
  --p-metric shannon_entropy \
  --o-visualization $data_dir/pairwise_differences_pre_post_shannon.qzv
 

In [None]:
# View the Shannon entropy pairwise-differences visualization
! qiime tools view $data_dir/pairwise_differences_pre_post_shannon.qzv

This might mean that, within each patient, alpha diversity is significantly lower in cohort 2 than in cohort 1.

In [None]:
# Paired comparison of Faith PD between cohort 1 and cohort 2 within each patient (no grouping column).
# The `faith_pd` column is part of metadata_pre_post_alpha.tsv, so no feature table is passed.
! qiime longitudinal pairwise-differences \
  --m-metadata-file $data_dir/metadata_pre_post_alpha.tsv \
  --p-metric faith_pd \
  --p-state-column Cohort_Number \
  --p-state-1 1 \
  --p-state-2 2 \
  --p-individual-id-column Patient_ID \
  --p-replicate-handling random \
  --o-visualization $data_dir/pairwise_differences_pre_post_faith_pd.qzv

# Optional: add the following line to the command above to compare groups of patients
# --p-group-column *insert grouping column* \

In [None]:
# View the Faith PD pairwise-differences visualization
! qiime tools view $data_dir/pairwise_differences_pre_post_faith_pd.qzv

### Testing changes of feature abundance on a single patient level

In [None]:
# Attach the per-sample alpha diversity values (Shannon entropy, Faith PD) to the sample metadata.
# This repeats the merge done earlier in the notebook so that this section can be run on its own.

# Per-sample tables exported from the Shannon and Faith PD group-significance results
Shannon_categorical = pd.read_csv(f'{data_dir}/core-metrics-results-bt/shannon-group-significance_exported/metadata.tsv', sep='\t')
FaithPD_categorical = pd.read_csv(f'{data_dir}/core-metrics-results-bt/faith-pd-group-significance_exported/metadata.tsv', sep='\t')
metadata = pd.read_csv(f"{data_dir}/metadata.tsv", sep="\t")

# Merge both alpha diversity tables into one table with numeric metric columns
categorical = (
    pd.merge(Shannon_categorical, FaithPD_categorical, how='inner',
             on=['Patient_ID', 'id', 'Patient_Sex', 'Stool_Consistency'])
    # Drop the `#q2:types` directive row by its identifier instead of by position, so that a real
    # sample can never be removed if that row is missing or not first
    .loc[lambda df: df['id'] != '#q2:types']
    .sort_values(by="Patient_ID", ascending=True)
    # The directive row made pandas read the metric columns as text; convert them back to numbers
    .assign(
        shannon_entropy=lambda df: pd.to_numeric(df['shannon_entropy'], errors='coerce'),
        faith_pd=lambda df: pd.to_numeric(df['faith_pd'], errors='coerce'),
    )
    # Use the same sample identifier column name as metadata.tsv
    .rename(columns={'id': 'sample-id'})
)

# Full metadata + alpha diversity (left join keeps samples without alpha diversity values)
metadata_alpha = pd.merge(metadata, categorical, how='left', on=['Patient_ID', 'sample-id', 'Patient_Sex', 'Stool_Consistency'])
metadata_alpha.to_csv(f"{data_dir}/metadata_alpha.tsv", sep="\t", index=False)
metadata_alpha

In [None]:
# Keep only features that are present in at least 10 samples
# (--p-min-samples counts samples, not patients)
! qiime feature-table filter-features \
  --i-table $data_dir/table-filtered.qza \
  --p-min-samples 10 \
  --o-filtered-table $data_dir/table-filtered-min-abund.qza


In [None]:
# Feature volatility: run `qiime longitudinal feature-volatility` on the min-samples-filtered table,
# with Cohort_Number as the state column and Patient_ID identifying the individuals.
# --p-random-state sets a fixed seed for the run.
# NOTE(review): --p-n-estimators 10 is a small number of estimators — confirm this is intentional.

! qiime longitudinal feature-volatility \
  --i-table $data_dir/table-filtered-min-abund.qza  \
  --m-metadata-file $data_dir/metadata_alpha.tsv \
  --p-state-column Cohort_Number \
  --p-individual-id-column Patient_ID \
  --p-n-estimators 10 \
  --p-random-state 17 \
  --output-dir $data_dir/feat-volatility-min

In [None]:
# View the volatility plot written to the feature-volatility output directory
! qiime tools view $data_dir/feat-volatility-min/volatility_plot.qzv

In [None]:
# Export the contents of the volatility plot visualization so its data files can be read with pandas
! qiime tools export \
    --input-path $data_dir/feat-volatility-min/volatility_plot.qzv \
    --output-path $data_dir/feat-volatility-min/volatility_plot-exported

In [None]:
# Load the data table exported from the volatility plot and preview its first rows
volatility_path = f"{data_dir}/feat-volatility-min/volatility_plot-exported/data.tsv"
volatility = pd.read_csv(volatility_path, sep='\t')

volatility.head()

In [None]:
# Per-feature metadata exported alongside the volatility plot
feature_metadata_path = f"{data_dir}/feat-volatility-min/volatility_plot-exported/feature_metadata.tsv"
change = pd.read_csv(feature_metadata_path, sep='\t')

# Coerce `importance` to numbers; entries that cannot be parsed become NaN
change = change.assign(importance=pd.to_numeric(change['importance'], errors='coerce'))

# Show the features with a strictly positive importance (NaN never compares greater than 0)
change.loc[change['importance'].gt(0)]


### Testing differences in relative abundance of features between cohorts pairwise

In [None]:
# Convert the min-samples-filtered feature table into a FeatureTable[RelativeFrequency];
# this is the table passed via --i-table to `qiime longitudinal pairwise-differences`
! qiime feature-table relative-frequency \
  --i-table $data_dir/table-filtered-min-abund.qza \
  --o-relative-frequency-table $data_dir/relative-frequency-table-min-abund.qza

Collects the feature names of both tables and reports how many features remain after filtering

In [None]:
# Compare the number of features before and after the min-samples filter

# Unfiltered FeatureTable[RelativeFrequency]: view it as a DataFrame and list its feature IDs (the columns)
feature_table = Artifact.load(f"{data_dir}/relative-frequency-table.qza")
table = feature_table.view(pd.DataFrame)
features = table.columns.tolist()

# Filtered FeatureTable[RelativeFrequency]: same steps
feature_table_filtered = Artifact.load(f"{data_dir}/relative-frequency-table-min-abund.qza")
table_filtered = feature_table_filtered.view(pd.DataFrame)
features_filtered = table_filtered.columns.tolist()

# Report both counts in one sentence
print(f"The feature table originally contained {len(features)} features and after filtering for features that are present in at least 10 samples {len(features_filtered)} features remain")

Pairwise analysis of whether the relative abundance of a feature changes between the pre- and post-abduction samples of each patient

In [None]:
# Check whether the pre/post difference in relative abundance of one specific feature is actually significant.
# Set `feature_id` to the feature (a column of the relative-frequency table) that should be tested.
feature_id = "3bb5ef006f820704249b8e9ca9597079"

! qiime longitudinal pairwise-differences \
  --i-table $data_dir/relative-frequency-table-min-abund.qza \
  --m-metadata-file $data_dir/metadata_pre_post_alpha.tsv \
  --p-metric $feature_id \
  --p-state-column Cohort_Number \
  --p-state-1 1 \
  --p-state-2 2 \
  --p-individual-id-column Patient_ID \
  --p-replicate-handling random \
  --o-visualization $data_dir/pairwise_differences_pre_post_feature.qzv

# View the resulting visualization
! qiime tools view $data_dir/pairwise_differences_pre_post_feature.qzv