In [4]:
import pandas as pd
import numpy as np

In [5]:
def filter_microbiome_data(abundance_data,
                          min_library_size=1000,
                          min_taxa_per_sample=10,
                          min_prevalence=0.2,
                          min_abundance_threshold=0.002,
                          min_counts=2,
                          min_samples_with_counts=2,
                          verbose=True):
    """Filter a count table by sample quality, then by taxon support.

    Rows of ``abundance_data`` are treated as samples and columns as taxa.
    Filtering happens in two stages:

    1. Samples are kept when their row sum is at least ``min_library_size``
       and they have at least ``min_taxa_per_sample`` non-zero taxa.
    2. On the surviving samples, a taxon is kept only when it passes all of:
       - prevalence (fraction of samples with a non-zero count)
         ``>= min_prevalence``;
       - maximum relative abundance across samples
         ``>= min_abundance_threshold``;
       - number of samples with a count ``>= min_counts`` is
         ``>= min_samples_with_counts``.

    Parameters
    ----------
    abundance_data : pandas.DataFrame
        Numeric count table (samples x taxa).
    min_library_size : number
        Minimum row sum for a sample to be kept.
    min_taxa_per_sample : int
        Minimum number of non-zero taxa for a sample to be kept.
    min_prevalence : float
        Minimum fraction of kept samples in which a taxon is non-zero.
    min_abundance_threshold : float
        Minimum value of a taxon's maximum relative abundance.
    min_counts : number
        Count a sample must reach for a taxon to be "supported" in it.
    min_samples_with_counts : int
        Minimum number of supporting samples per taxon.
    verbose : bool
        When True, print shape/sparsity summaries and per-filter pass counts.

    Returns
    -------
    pandas.DataFrame
        The rows and columns of ``abundance_data`` that survive both stages
        (original index and column labels are preserved).
    """

    def _zero_fraction(frame):
        # Fraction of cells equal to zero; NaN for an empty frame so the
        # summary can still be printed instead of dividing by zero.
        if frame.size == 0:
            return float("nan")
        return (frame == 0).sum().sum() / frame.size

    if verbose:
        print(f"Initial data shape: {abundance_data.shape}")
        print(f"Initial sparsity: {_zero_fraction(abundance_data):.1%}")

    # ---- Stage 1: sample-level filtering --------------------------------
    sample_library_sizes = abundance_data.sum(axis=1)
    sample_taxa_richness = (abundance_data > 0).sum(axis=1)

    keep_samples = (
        (sample_library_sizes >= min_library_size)
        & (sample_taxa_richness >= min_taxa_per_sample)
    )

    if verbose:
        print(f"\nSample filtering:")
        print(f"  Removing {int((~keep_samples).sum())} samples")
        print(f"  Keeping {int(keep_samples.sum())} samples")

    filtered_data = abundance_data.loc[keep_samples, :]

    # ---- Stage 2: taxon-level filtering ---------------------------------
    # Row totals of the kept samples are already known; reuse them to
    # convert counts to per-sample relative abundances.
    rel_abundance = filtered_data.div(sample_library_sizes[keep_samples], axis=0)

    taxa_prevalence = (filtered_data > 0).mean(axis=0)
    taxa_max_abundance = rel_abundance.max(axis=0)
    taxa_min_counts = (filtered_data >= min_counts).sum(axis=0)

    passes_prevalence = taxa_prevalence >= min_prevalence
    passes_abundance = taxa_max_abundance >= min_abundance_threshold
    passes_counts = taxa_min_counts >= min_samples_with_counts
    keep_taxa = passes_prevalence & passes_abundance & passes_counts

    if verbose:
        print(f"\nTaxa filtering:")
        print(f"  Prevalence filter: {int(passes_prevalence.sum())} taxa pass")
        print(f"  Abundance filter: {int(passes_abundance.sum())} taxa pass")
        print(f"  Count filter: {int(passes_counts.sum())} taxa pass")
        print(f"  Combined: keeping {int(keep_taxa.sum())} taxa, removing {int((~keep_taxa).sum())} taxa")

    final_data = filtered_data.loc[:, keep_taxa]

    if verbose:
        # Guard the ratio: an empty input table has size 0.
        if abundance_data.size == 0:
            reduction = float("nan")
        else:
            reduction = 1 - final_data.size / abundance_data.size
        print(f"\nFinal data shape: {final_data.shape}")
        print(f"Final sparsity: {_zero_fraction(final_data):.1%}")
        print(f"Data reduction: {reduction:.1%}")

    return final_data




# means = filtered_df.mean()
# variances = filtered_df.var()
# dispersion = variances / means
# results_df = pd.DataFrame({
#     'Species': means.index,
#     'Mean': means.round(2),
#     'Variance': variances.round(2),
#     'Phi': dispersion.round(2)
# })

# print(results_df)

## HMPv13_otu_table

In [6]:
# HMPv13 sample labels are dot-separated and are split below into four parts:
#   Project ID    : 1928
#   Subject ID    : SRS...
#   Experiment ID : SRX...
#   Run ID        : SRR...

# Load the OTU table and transpose it so that the dot-separated labels end up
# in the row index.
df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/HMPv13_otu_table.csv').T
# Alternative input table, currently disabled:
#df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/HMPv35_otu_table.csv').T

# Split each index label on '.' into its four identifier components.
id_df = df.index.to_series().str.split('.', expand=True)
id_df.columns = ['Project_ID', 'SRS_ID', 'SRX_ID', 'SRR_ID']

# Append the identifier columns, order rows by experiment (SRX), keep only the
# experiment ID as the grouping label, and recode it as consecutive integers
# starting at 1.
df = (
    df.join(id_df)
    .reset_index(drop=True)
    .sort_values('SRX_ID')
    .drop(columns=['Project_ID', 'SRS_ID', 'SRR_ID'])
    .rename(columns={'SRX_ID': 'Group_ID'})
    .reset_index(drop=True)
    .assign(Group_ID=lambda frame: pd.factorize(frame['Group_ID'])[0] + 1)
)

# Everything except Group_ID is abundance data.
abundance_cols = df.columns[~(df.columns == 'Group_ID')]
abundance_data = df.loc[:, abundance_cols]

# Filter samples and taxa with the default thresholds.
filtered_abundance = filter_microbiome_data(abundance_data)

# Re-attach Group_ID (as the first column) for the samples that survived.
filtered_df = pd.concat(
    [
        df.loc[filtered_abundance.index, ['Group_ID']],
        filtered_abundance,
    ],
    axis=1,
)

# Optional log-transformed export of the unfiltered table (disabled):
# df = df.drop(['Group_ID'], axis=1)
# df = np.log(df + 1)
# df.to_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/HMPv13_otu_table_log.csv')
df

Initial data shape: (3285, 5830)
Initial sparsity: 98.2%

Sample filtering:
  Removing 670 samples
  Keeping 2615 samples

Taxa filtering:
  Prevalence filter: 182 taxa pass
  Abundance filter: 2519 taxa pass
  Count filter: 3099 taxa pass
  Combined: keeping 182 taxa, removing 5648 taxa

Final data shape: (2615, 182)
Final sparsity: 64.2%
Data reduction: 97.5%


Unnamed: 0,4213913,4330849,4400869,4457085,4358020,607045,290144,4440970,401717,39443,...,199091,368261,2123717,198945,302494,191112,174415,573110,1868703,Group_ID
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,71
3281,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,71
3282,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,71
3283,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,71


In [8]:
# Write the filtered HMPv13 table (Group_ID + kept taxa) without the row index.
filtered_df.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/PLN/pln_eval/data/HMPv13_filtered.csv',
    index=False,
)

## MovingPictures_otu_table

In [None]:
# Load the MovingPictures OTU table and transpose it so that the dot-separated
# labels end up in the row index.
df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/MovingPictures_otu_table.csv').T

# Each label is split on '.' into five components; only Group_ID is kept.
id_df = df.index.to_series().str.split('.', expand=True)
id_df.columns = ['Project_ID', 'Sample_ID', 'S_Constant', 'Group_ID', 'Sequence_Keyword']

df = (
    df.join(id_df)
    .reset_index(drop=True)
    .sort_values('Group_ID')
    .drop(columns=['Project_ID', 'Sample_ID', 'S_Constant', 'Sequence_Keyword'])
    .reset_index(drop=True)
)

# Optional log-transformed export (disabled):
# df = df.drop(['Group_ID'], axis=1)
# df = np.log(df + 1)
# df.to_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/MovingPictures_otu_table_log.csv')
df

In [None]:
# NOTE(review): TOP_COLS and CV are not defined in the visible part of this
# notebook; confirm a config cell sets them before this cell runs.

# Group labels were parsed from strings; make them integers.
df['Group_ID'] = df['Group_ID'].astype(int)
#df = df[df['Group_ID'] <= 3]

# Rank taxa by total count. Group_ID is excluded from the ranking so the label
# column can never take one of the TOP_COLS feature slots.
column_sums = df.drop(columns='Group_ID').sum(axis=0).sort_values(ascending=False)
top_20_columns = column_sums.head(TOP_COLS).index

# log10(x + 1) transform of the selected taxa, then re-attach the raw labels.
df_selected = np.log10(df[top_20_columns] + 1)
df_selected['Group_ID'] = df['Group_ID']

# Size of the smallest group, rounded down to a multiple of CV
# (the original note says the CV constant is 5 - confirm).
# A result of 0 (smallest group < CV) yields an empty table.
min_group_size = df_selected['Group_ID'].value_counts().min()
min_group_size = min_group_size - (min_group_size % CV)

# Draw the same number of rows from every group. GroupBy.sample keeps the
# Group_ID column and does not depend on groupby.apply passing the grouping
# column to the function; the fixed seed makes the subsample reproducible.
equal_samples_df = (
    df_selected.groupby('Group_ID')
    .sample(n=int(min_group_size), random_state=42)
    .reset_index(drop=True)
)
equal_samples_df

In [None]:
# Write the balanced MovingPictures table without the row index.
equal_samples_df.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/MovingPictures_11_15.csv',
    index=False,
)

## qa10394_otu_table

In [None]:
# Load the qa10394 OTU table and transpose it so that the dot-separated labels
# end up in the row index.
df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/qa10394_otu_table.csv').T

# The second dot-separated component of each label (position 1) is used as
# the group label.
id_df = (
    df.index.to_series()
    .str.split('.', expand=True)
    .iloc[:, 1]
    .rename('Group_ID')
)

df = df.join(id_df).reset_index(drop=True).sort_values('Group_ID')

# Discard rows labelled "BLANK" or "mistake".
is_excluded = (df['Group_ID'] == "BLANK") | (df['Group_ID'] == "mistake")
df = df[~is_excluded].reset_index(drop=True)


# Optional log-transformed export (disabled):
#df = df.drop(['Group_ID'], axis=1)
#df = np.log(df + 1)
#df.to_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/qa10394_otu_table_log.csv')
df

In [None]:
# NOTE(review): TOP_COLS is not defined in the visible part of this notebook;
# confirm a config cell sets it before this cell runs.

# Recode the string group labels as consecutive integers starting at 1.
df['Group_ID'] = (pd.factorize(df['Group_ID'])[0] + 1)

df['Group_ID'] = df['Group_ID'].astype(int)
#df = df[df['Group_ID'] <= TOP_GRPS]

# Rank taxa by total count. Group_ID is excluded from the ranking so the label
# column can never take one of the TOP_COLS feature slots.
column_sums = df.drop(columns='Group_ID').sum(axis=0).sort_values(ascending=False)
top_20_columns = column_sums.head(TOP_COLS).index

# log10(x + 1) transform of the selected taxa, then re-attach the raw labels.
df_selected = np.log10(df[top_20_columns] + 1)
df_selected['Group_ID'] = df['Group_ID']
df_selected

In [None]:
# NOTE(review): CV is not defined in the visible part of this notebook;
# confirm a config cell sets it before this cell runs.

# Size of the smallest group, rounded down to a multiple of CV
# (the original note says the CV constant is 5 - confirm).
# A result of 0 (smallest group < CV) yields an empty table.
min_group_size = df_selected['Group_ID'].value_counts().min()
min_group_size = min_group_size - (min_group_size % CV)

# Draw the same number of rows from every group. GroupBy.sample keeps the
# Group_ID column and does not depend on groupby.apply passing the grouping
# column to the function; the fixed seed makes the subsample reproducible.
equal_samples_df = (
    df_selected.groupby('Group_ID')
    .sample(n=int(min_group_size), random_state=42)
    .reset_index(drop=True)
)
equal_samples_df

In [None]:
# Write the balanced qa10394 table without the row index.
equal_samples_df.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/qa10394_11_15.csv',
    index=False,
)

## TwinsUK_otu_table

In [None]:
# NOTE(review): TOP_COLS is not defined in the visible part of this notebook;
# confirm a config cell sets it before this cell runs.

# Load the TwinsUK OTU table and transpose it so that the dot-separated labels
# end up in the row index.
df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/TwinsUK_otu_table.csv').T

# The second dot-separated component of each label (position 1) is used as
# the group label.
id_df = df.index.to_series().str.split('.', expand=True)
id_df = id_df.iloc[:, 1]
id_df = id_df.rename('Group_ID')
df = df.join(id_df).reset_index(drop=True)
df = df.sort_values('Group_ID').reset_index(drop=True)


# Optional log-transformed export (disabled):
#df = df.drop(['Group_ID'], axis=1)
#df = np.log(df + 1)
#df.to_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/TwinsUK_otu_table_log.csv')


# Recode the string group labels as consecutive integers starting at 1.
df['Group_ID'] = (pd.factorize(df['Group_ID'])[0] + 1)
df['Group_ID'] = df['Group_ID'].astype(int)
# df = df[df['Group_ID'] <= TOP_GRPS]


# Rank taxa by total count. Group_ID is excluded from the ranking so the label
# column can never take one of the TOP_COLS feature slots.
column_sums = df.drop(columns='Group_ID').sum(axis=0).sort_values(ascending=False)
top_20_columns = column_sums.head(TOP_COLS).index

# log10(x + 1) transform of the selected taxa, then re-attach the raw labels.
df_selected = np.log10(df[top_20_columns] + 1)
df_selected['Group_ID'] = df['Group_ID']
df_selected

In [None]:
# NOTE(review): CV is not defined in the visible part of this notebook;
# confirm a config cell sets it before this cell runs.

# Size of the smallest group, rounded down to a multiple of CV
# (the original note says the CV constant is 5 - confirm).
# A result of 0 (smallest group < CV) yields an empty table.
min_group_size = df_selected['Group_ID'].value_counts().min()
min_group_size = min_group_size - (min_group_size % CV)

# Draw the same number of rows from every group. GroupBy.sample keeps the
# Group_ID column and does not depend on groupby.apply passing the grouping
# column to the function; the fixed seed makes the subsample reproducible.
equal_samples_df = (
    df_selected.groupby('Group_ID')
    .sample(n=int(min_group_size), random_state=42)
    .reset_index(drop=True)
)
equal_samples_df

In [None]:
# Write the balanced TwinsUK table without the row index.
equal_samples_df.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/TwinsUK_11_15.csv',
    index=False,
)

## NECROMASS

In [None]:
# NOTE(review): TOP_COLS is not defined in the visible part of this notebook;
# confirm a config cell sets it before this cell runs.

# Load the necromass table and order rows by group.
df = (
    pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/necromass_sub_log_update.csv')
    .sort_values('Group_ID')
    .reset_index(drop=True)
)

# Rank every column except Group_ID by its column total, largest first.
numeric_columns = df.drop(columns='Group_ID')
column_sums = numeric_columns.sum(axis=0).sort_values(ascending=False)

# Keep the TOP_COLS highest-ranked columns, with Group_ID appended last.
columns_to_keep = [*column_sums.head(TOP_COLS).index, 'Group_ID']
df_selected = df.loc[:, columns_to_keep].copy()  # independent copy of the selection

df_selected

In [None]:
# Write the selected necromass columns without the row index.
df_selected.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/necromass_11_15.csv',
    index=False,
)