In [1]:
import pandas as pd
import numpy as np

In [2]:
# The "cv constant": per-group sample counts are rounded down to a multiple of
# this value (presumably the number of cross-validation folds -- confirm).
CV: int = 5
# Column budget; not referenced by any active code in the visible cells.
TOP_COLS: int = 30
# How many of the least-sparse groups each filtered dataset keeps.
TOP_GRPS: int = 20

## HMPv13_otu_table / HMPv35_otu_table

In [None]:
# Row labels are dot-separated identifiers with four parts:
#   Project ID (1928) . Subject ID (SRS...) . Experiment ID (SRX...) . Run ID (SRR...)

# Load the OTU table and transpose it so that the identifiers become the row index.
# To process the other table, read HMPv13_otu_table.csv here instead.
df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/HMPv35_otu_table.csv').T

# Split each row label into its four ID components.
id_df = df.index.to_series().str.split('.', expand=True)
id_df.columns = ['Project_ID', 'SRS_ID', 'SRX_ID', 'SRR_ID']

# Attach the IDs, order rows by experiment ID, and keep only that ID as the grouping key.
df = df.join(id_df).reset_index(drop=True)
df = df.sort_values('SRX_ID')
df = df.drop(['Project_ID', 'SRS_ID', 'SRR_ID'], axis=1)
df = df.rename(columns={'SRX_ID': 'Group_ID'}).reset_index(drop=True)

# Replace the SRX strings with integer group labels (numbering starts at 1).
df['Group_ID'] = (pd.factorize(df['Group_ID'])[0] + 1).astype(int)

# log10(x + 1) transform of every feature column; Group_ID is re-attached untransformed.
df_selected = np.log10(df.drop(['Group_ID'], axis=1) + 1)
df_selected['Group_ID'] = df['Group_ID']

# Target per-group sample count: the MEAN group size (not the minimum),
# rounded down to a multiple of CV.
min_group_size = int(df_selected['Group_ID'].value_counts().mean())
min_group_size -= min_group_size % CV
if min_group_size < 1:
    # Without this guard every group would be "sampled" down to zero rows.
    raise ValueError(
        f"Mean group size is below CV={CV}; cannot draw a non-empty multiple of CV samples per group."
    )

# Discard groups that cannot supply min_group_size rows.
df_selected = df_selected.groupby('Group_ID').filter(lambda x: len(x) >= min_group_size)

# Draw exactly min_group_size rows from each remaining group.
# - GroupBy.sample keeps the Group_ID column (apply + reset_index(drop=True) relies on
#   deprecated behaviour of including the grouping column in each chunk).
# - random_state is fixed so that re-running the cell reproduces the same subset.
equal_samples_df = (
    df_selected.groupby('Group_ID')
    .sample(n=min_group_size, random_state=0)
    .sort_values('Group_ID')
    .reset_index(drop=True)
)
equal_samples_df

In [36]:
# Persist the balanced, log-transformed HMPv35 subset without the row index.
# (When the HMPv13 table is processed instead, the matching file name is HMPv13_sub_log_update.csv.)
equal_samples_df.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/HMPv35_sub_log_update.csv',
    index=False,
)

In [6]:
import pandas as pd
import numpy as np

# Load the HMPv35 OTU table, transposed so the dot-separated identifiers form the row index.
# (HMPv13_otu_table.csv in the same directory is the alternative input.)
df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/HMPv35_otu_table.csv').T

# Break each row label into its four ID parts.
id_df = df.index.to_series().str.split('.', expand=True)
id_df.columns = ['Project_ID', 'SRS_ID', 'SRX_ID', 'SRR_ID']

# Attach the IDs, sort by experiment ID, and keep that ID alone under the name Group_ID.
df = (
    df.join(id_df)
    .reset_index(drop=True)
    .sort_values('SRX_ID')
    .drop(['Project_ID', 'SRS_ID', 'SRR_ID'], axis=1)
    .rename(columns={'SRX_ID': 'Group_ID'})
    .reset_index(drop=True)
)

# Encode the group strings as integers starting at 1 (no group filtering happens here).
df['Group_ID'] = (pd.factorize(df['Group_ID'])[0] + 1).astype(int)


def filter_informative_columns(df, zero_threshold=0.6, n_top_columns=30):
    """
    Keep the least-sparse feature columns and log-transform them.

    Steps:
    1. Discard every feature column whose fraction of zero entries is
       `zero_threshold` or higher.
    2. Rank the remaining columns by their fraction of non-zero entries and
       keep the first `n_top_columns` of them.
    3. Apply log10(x + 1) to the kept columns and re-attach 'Group_ID'.

    Parameters:
    - df: DataFrame containing a 'Group_ID' column; every other column is
      treated as a feature. The input frame is not modified.
    - zero_threshold: a column is kept only if its fraction of zeros is
      strictly below this value (default 0.6)
    - n_top_columns: maximum number of columns to keep (default 30)

    Returns:
    - New DataFrame holding the selected columns (most non-zero first),
      log10(x + 1) transformed, followed by the original 'Group_ID' column.

    Raises:
    - ValueError: if n_top_columns is negative.
    - KeyError: if df has no 'Group_ID' column.
    """
    if n_top_columns < 0:
        # head() would interpret a negative count as "all but the last k".
        raise ValueError(f"n_top_columns must be non-negative, got {n_top_columns}")

    # Separate Group_ID from the feature columns
    group_id = df['Group_ID']
    data_cols = df.drop(columns='Group_ID')

    # Fraction of exact zeros in each feature column
    zero_fractions = (data_cols == 0).mean()

    # Columns sparse enough to pass the threshold (strict inequality)
    mask = zero_fractions < zero_threshold

    # Score = fraction of non-zero entries; keep the highest-scoring columns
    scores = 1 - zero_fractions[mask]
    top_cols = scores.sort_values(ascending=False).head(n_top_columns).index

    # np.log10 returns a new frame, so adding Group_ID below does not touch `df`
    df_filtered = np.log10(data_cols[top_cols] + 1)
    df_filtered['Group_ID'] = group_id

    # Report how many columns survived each stage
    print(f"Original number of columns: {len(data_cols.columns)}")
    print(f"Columns after zero threshold: {len(scores)}")
    print(f"Final number of columns: {len(top_cols)}")

    return df_filtered

# Keep the least-sparse feature columns (log10(x + 1) transformed) plus Group_ID.
df_selected = filter_informative_columns(df)

# Target per-group sample count: the MEAN group size (not the minimum),
# rounded down to a multiple of CV.
min_group_size = int(df_selected['Group_ID'].value_counts().mean())
min_group_size -= min_group_size % CV
if min_group_size < 1:
    # Without this guard every group would be "sampled" down to zero rows.
    raise ValueError(
        f"Mean group size is below CV={CV}; cannot draw a non-empty multiple of CV samples per group."
    )

# Discard groups that cannot supply min_group_size rows.
df_selected = df_selected.groupby('Group_ID').filter(lambda x: len(x) >= min_group_size)

# Draw exactly min_group_size rows per group. GroupBy.sample keeps the Group_ID
# column, and the fixed random_state makes the subset reproducible across runs.
equal_samples_df = (
    df_selected.groupby('Group_ID')
    .sample(n=min_group_size, random_state=0)
    .sort_values('Group_ID')
    .reset_index(drop=True)
)

# Per-group sparsity = mean over feature columns of the per-column fraction of zeros.
# Computed on the feature columns only, so it does not depend on whether
# groupby passes the grouping column into each chunk.
feature_is_zero = equal_samples_df.drop(columns='Group_ID') == 0
group_sparsity = (
    feature_is_zero.groupby(equal_samples_df['Group_ID'])
    .mean()
    .mean(axis=1)
    .sort_values()  # least sparse groups first
)

# Keep only the TOP_GRPS least sparse groups.
top_groups = group_sparsity.head(TOP_GRPS).index
equal_samples_df = equal_samples_df[equal_samples_df['Group_ID'].isin(top_groups)]
equal_samples_df

Original number of columns: 10730
Columns after zero threshold: 35
Final number of columns: 30


Unnamed: 0,879972,341460,858896,561636,967427,949789,98605,4309323,4438988,4422456,...,4346977,4447394,4374753,3864823,1082539,12574,1089121,3801267,4405869,Group_ID
175,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6
176,2.309630,2.710963,1.903090,1.447158,1.643453,1.875061,0.000000,0.301030,0.477121,1.579784,...,0.602060,0.301030,0.000000,0.000000,0.000000,0.301030,1.113943,1.924279,0.000000,6
177,3.031408,1.255273,2.033424,1.612784,1.431364,0.000000,0.301030,0.000000,0.301030,0.301030,...,0.698970,0.000000,0.000000,0.000000,0.845098,0.000000,0.000000,0.000000,0.000000,6
178,2.252853,2.675778,1.643453,1.230449,1.690196,1.724276,2.086360,1.949390,0.301030,1.875061,...,0.301030,0.698970,2.518514,1.477121,1.146128,0.000000,0.698970,2.025306,0.477121,6
179,2.113943,2.235528,0.778151,0.301030,0.845098,0.698970,0.845098,1.556303,0.301030,1.819544,...,0.000000,0.000000,1.991226,0.778151,0.000000,0.778151,0.477121,2.227887,0.698970,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1885,3.595165,2.225309,2.599883,2.139879,0.602060,0.477121,0.301030,0.477121,0.301030,0.602060,...,1.748188,0.000000,1.204120,0.301030,0.000000,0.000000,1.301030,0.000000,2.093422,130
1886,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,130
1887,3.161068,2.633468,2.610660,1.707570,2.267172,2.107210,1.204120,1.361728,1.732394,2.068186,...,0.845098,0.845098,1.913814,0.602060,0.602060,1.491362,2.123852,2.294466,0.301030,130
1888,3.296007,2.600973,2.702431,1.924279,2.418301,2.220108,1.531479,1.770852,1.732394,2.230449,...,1.204120,0.000000,1.556303,0.845098,1.000000,1.806180,2.610660,2.518514,0.602060,130


In [7]:
# Write the current equal_samples_df to disk without the row index.
equal_samples_df.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/HMPv35_11_15.csv',
    index=False,
)

In [None]:
# Display whatever equal_samples_df currently holds in the kernel.
equal_samples_df

## MovingPictures_otu_table

In [None]:
# Load the OTU table, transposed so the dot-separated identifiers form the row index.
df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/MovingPictures_otu_table.csv').T

# Each row label splits into five parts; only the fourth is kept, as Group_ID.
id_df = df.index.to_series().str.split('.', expand=True)
id_df.columns = ['Project_ID', 'Sample_ID', 'S_Constant', 'Group_ID', 'Sequence_Keyword']
df = df.join(id_df).reset_index(drop=True)
df = df.sort_values('Group_ID')
df = df.drop(['Project_ID', 'Sample_ID', 'S_Constant', 'Sequence_Keyword'], axis=1).reset_index(drop=True)

# Encode the group strings as integers starting at 1.
df['Group_ID'] = (pd.factorize(df['Group_ID'])[0] + 1).astype(int)

# Keep the least-sparse feature columns (log10(x + 1) transformed) plus Group_ID.
df_selected = filter_informative_columns(df)

# Target per-group sample count: the MEAN group size (not the minimum),
# rounded down to a multiple of CV.
min_group_size = int(df_selected['Group_ID'].value_counts().mean())
min_group_size -= min_group_size % CV
if min_group_size < 1:
    # Without this guard every group would be "sampled" down to zero rows.
    raise ValueError(
        f"Mean group size is below CV={CV}; cannot draw a non-empty multiple of CV samples per group."
    )

# Discard groups that cannot supply min_group_size rows.
df_selected = df_selected.groupby('Group_ID').filter(lambda x: len(x) >= min_group_size)

# Draw exactly min_group_size rows per group. GroupBy.sample keeps the Group_ID
# column, and the fixed random_state makes the subset reproducible across runs.
equal_samples_df = (
    df_selected.groupby('Group_ID')
    .sample(n=min_group_size, random_state=0)
    .sort_values('Group_ID')
    .reset_index(drop=True)
)

# Per-group sparsity = mean over feature columns of the per-column fraction of zeros.
# Computed on the feature columns only, so it does not depend on whether
# groupby passes the grouping column into each chunk.
feature_is_zero = equal_samples_df.drop(columns='Group_ID') == 0
group_sparsity = (
    feature_is_zero.groupby(equal_samples_df['Group_ID'])
    .mean()
    .mean(axis=1)
    .sort_values()  # least sparse groups first
)

# Keep only the TOP_GRPS least sparse groups.
top_groups = group_sparsity.head(TOP_GRPS).index
equal_samples_df = equal_samples_df[equal_samples_df['Group_ID'].isin(top_groups)]

# The superseded unfiltered pipeline used to sit here inside a string literal.
# As the cell's last expression, it was echoed instead of this frame.
equal_samples_df

In [None]:
# Display whatever equal_samples_df currently holds in the kernel.
equal_samples_df

In [8]:
# Write the current equal_samples_df to disk without the row index.
equal_samples_df.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/MovingPictures_filtered.csv',
    index=False,
)

## qa10394_otu_table

In [None]:
# Load the OTU table, transposed so the dot-separated identifiers form the row index.
df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/qa10394_otu_table.csv').T

# The second dot-separated field of each row label is used as the group key.
id_df = df.index.to_series().str.split('.', expand=True)
id_df = id_df.iloc[:, 1].rename('Group_ID')
df = df.join(id_df).reset_index(drop=True)
df = df.sort_values('Group_ID')

# Drop rows whose group key is one of the two non-sample markers.
df = df[(df.Group_ID != "BLANK") & (df.Group_ID != "mistake")]
df = df.reset_index(drop=True)

# Encode the group strings as integers starting at 1.
df['Group_ID'] = (pd.factorize(df['Group_ID'])[0] + 1).astype(int)

# Keep the least-sparse feature columns (log10(x + 1) transformed) plus Group_ID.
df_selected = filter_informative_columns(df)

# Target per-group sample count: the MEAN group size (not the minimum),
# rounded down to a multiple of CV.
min_group_size = int(df_selected['Group_ID'].value_counts().mean())
min_group_size -= min_group_size % CV
if min_group_size < 1:
    # Without this guard every group would be "sampled" down to zero rows.
    raise ValueError(
        f"Mean group size is below CV={CV}; cannot draw a non-empty multiple of CV samples per group."
    )

# Discard groups that cannot supply min_group_size rows.
df_selected = df_selected.groupby('Group_ID').filter(lambda x: len(x) >= min_group_size)

# Draw exactly min_group_size rows per group. GroupBy.sample keeps the Group_ID
# column, and the fixed random_state makes the subset reproducible across runs.
equal_samples_df = (
    df_selected.groupby('Group_ID')
    .sample(n=min_group_size, random_state=0)
    .sort_values('Group_ID')
    .reset_index(drop=True)
)

# Per-group sparsity = mean over feature columns of the per-column fraction of zeros.
# Computed on the feature columns only, so it does not depend on whether
# groupby passes the grouping column into each chunk.
feature_is_zero = equal_samples_df.drop(columns='Group_ID') == 0
group_sparsity = (
    feature_is_zero.groupby(equal_samples_df['Group_ID'])
    .mean()
    .mean(axis=1)
    .sort_values()  # least sparse groups first
)

# Keep only the TOP_GRPS least sparse groups.
top_groups = group_sparsity.head(TOP_GRPS).index
equal_samples_df = equal_samples_df[equal_samples_df['Group_ID'].isin(top_groups)]

# The superseded unfiltered pipeline used to sit here inside a string literal.
# As the cell's last expression, it was echoed instead of this frame.
equal_samples_df

In [None]:
# Display whatever equal_samples_df currently holds in the kernel.
equal_samples_df

In [12]:
# Write the current equal_samples_df to disk without the row index.
equal_samples_df.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/qa10394_filtered.csv',
    index=False,
)

## TwinsUK_otu_table

In [8]:
# Load the OTU table, transposed so the dot-separated identifiers form the row index.
df = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/TwinsUK_otu_table.csv').T

# The second dot-separated field of each row label is used as the group key.
id_df = df.index.to_series().str.split('.', expand=True)
id_df = id_df.iloc[:, 1].rename('Group_ID')
df = df.join(id_df).reset_index(drop=True)
df = df.sort_values('Group_ID').reset_index(drop=True)

# Encode the group strings as integers starting at 1.
df['Group_ID'] = (pd.factorize(df['Group_ID'])[0] + 1).astype(int)

# Keep the least-sparse feature columns (log10(x + 1) transformed) plus Group_ID.
df_selected = filter_informative_columns(df)

# Target per-group sample count: the MEAN group size (not the minimum),
# rounded down to a multiple of CV.
min_group_size = int(df_selected['Group_ID'].value_counts().mean())
min_group_size -= min_group_size % CV
if min_group_size < 1:
    # Without this guard every group would be "sampled" down to zero rows.
    raise ValueError(
        f"Mean group size is below CV={CV}; cannot draw a non-empty multiple of CV samples per group."
    )

# Discard groups that cannot supply min_group_size rows.
df_selected = df_selected.groupby('Group_ID').filter(lambda x: len(x) >= min_group_size)

# Draw exactly min_group_size rows per group. GroupBy.sample keeps the Group_ID
# column, and the fixed random_state makes the subset reproducible across runs.
equal_samples_df = (
    df_selected.groupby('Group_ID')
    .sample(n=min_group_size, random_state=0)
    .sort_values('Group_ID')
    .reset_index(drop=True)
)

# Per-group sparsity = mean over feature columns of the per-column fraction of zeros.
# Computed on the feature columns only, so it does not depend on whether
# groupby passes the grouping column into each chunk.
feature_is_zero = equal_samples_df.drop(columns='Group_ID') == 0
group_sparsity = (
    feature_is_zero.groupby(equal_samples_df['Group_ID'])
    .mean()
    .mean(axis=1)
    .sort_values()  # least sparse groups first
)

# Keep only the TOP_GRPS least sparse groups.
top_groups = group_sparsity.head(TOP_GRPS).index
equal_samples_df = equal_samples_df[equal_samples_df['Group_ID'].isin(top_groups)]
equal_samples_df

Original number of columns: 8480
Columns after zero threshold: 948
Final number of columns: 30


Unnamed: 0,4463892,289734,294606,303274,4381553,287445,4447072,3327894,311820,146672,...,301910,287608,329668,337403,4449054,308907,339087,317135,4425214,Group_ID
120,2.797268,2.521138,2.962843,2.622214,2.487138,1.857332,0.778151,3.276232,3.295567,2.068186,...,2.079181,2.585461,2.648360,4.105817,0.477121,2.940018,2.195900,1.869232,2.103804,4
121,1.602060,1.785330,2.178977,2.419956,4.035190,1.785330,0.954243,3.669782,4.045245,2.060698,...,1.662758,0.698970,2.806858,0.954243,2.348305,2.989450,3.068928,2.287802,2.705864,4
122,0.954243,3.288696,3.130012,3.885022,1.322219,0.954243,3.396722,2.004321,3.622421,3.379487,...,0.778151,3.614897,0.845098,1.113943,0.903090,3.115278,3.497206,2.824126,3.153815,4
123,3.571942,2.369216,1.462398,2.588832,1.113943,1.897627,3.673021,2.089905,2.238046,1.924279,...,1.832509,2.775246,2.494155,3.801404,2.060698,0.301030,2.367356,1.602060,0.903090,4
124,3.626443,3.052694,3.166134,0.301030,1.113943,3.658107,2.133539,2.739572,0.954243,2.686636,...,3.235781,3.519959,3.205746,1.146128,2.456366,1.176091,2.718502,1.491362,3.388456,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,2.983626,3.082426,2.726727,3.427324,0.954243,3.054613,3.261976,2.806858,3.285332,2.777427,...,2.547775,1.278754,3.215638,0.698970,2.708421,1.863323,2.712650,1.716003,1.041393,14
596,2.913814,3.949195,2.875061,2.812245,2.958564,2.662758,2.397940,2.525045,3.006038,3.669224,...,2.546543,1.812913,2.431364,3.525045,2.823474,2.521138,2.100371,1.939519,1.000000,14
597,3.804071,2.950851,2.155336,3.044540,2.285557,2.699838,3.187521,2.563481,3.188366,2.911690,...,2.167317,2.247973,2.418301,0.954243,2.890421,2.454845,2.737987,1.681241,0.602060,14
598,2.245513,2.589950,2.571709,2.758912,3.407221,3.317227,3.122216,3.557026,3.571709,3.485011,...,2.665581,3.231470,3.386677,2.466868,1.944483,1.944483,2.636488,1.826075,1.763428,14


In [12]:
# Write the current equal_samples_df to disk without the row index.
equal_samples_df.to_csv(
    path_or_buf='/projects/genomic-ml/da2343/ml_project_1/data/microbe_ds/TwinsUK_filtered.csv',
    index=False,
)