In [1]:
import os
import json
import pandas as pd
import requests
from pandas.core.frame import DataFrame

import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
df_pr1 = pd.read_csv('Data/all_runs_in_project_PRJEB11419.tsv', sep="	" , index_col=False)
df_pr1.describe()

Unnamed: 0,nr. reads sequenced,host age,BMI,longitude,lattitude,QC status
count,14461.0,14461.0,14461.0,14461.0,14461.0,14459.0
mean,32905.3,45.106839,43.835698,-56.134106,37.716693,0.540978
std,39187.68,18.456431,566.117004,64.918977,18.576356,0.498335
min,0.0,0.0,0.0,-166.5,-45.9,0.0
25%,16246.0,34.0,20.78,-110.7,34.1,0.0
50%,24507.0,47.0,23.4,-76.9,40.7,1.0
75%,35872.0,60.0,26.5,-1.0,50.9,1.0
max,1545243.0,96.0,42500.0,175.7,64.9,1.0


In [3]:
df_QC1 = df_pr1.loc[df_pr1['QC status'] == 1.0]
df_phen_depr = df_QC1.loc[df_QC1['associated phenotype'] == 'D003863'] #depr
df_phen_depr.head()

Unnamed: 0,project ID,run ID,experiment type,instrument model,nr. reads sequenced,host age,sex,BMI,country,longitude,lattitude,associated phenotype,QC status
1,PRJEB11419,ERR1072629,Amplicon,Illumina MiSeq,23830,64,Male,17.67,United States of America,-70.3,41.6,D003863,1.0
42,PRJEB11419,ERR1072937,Amplicon,Illumina MiSeq,27504,53,Male,26.51,United States of America,-116.2,43.6,D003863,1.0
50,PRJEB11419,ERR1073023,Amplicon,Illumina MiSeq,24706,35,Female,34.72,United States of America,-93.3,45.0,D003863,1.0
81,PRJEB11419,ERR1073394,Amplicon,Illumina MiSeq,43579,27,Female,22.41,United States of America,-117.2,32.8,D003863,1.0
88,PRJEB11419,ERR1073395,Amplicon,Illumina MiSeq,45092,27,Female,22.41,United States of America,-117.2,32.8,D003863,1.0


In [4]:
df_runs = df_phen_depr['run ID']
df_runs.describe()

count            354
unique           354
top       ERR1072629
freq               1
Name: run ID, dtype: object

In [5]:
df_only_runs = pd.DataFrame({'Runs': df_runs.values})

In [6]:
df_only_runs.describe()

Unnamed: 0,Runs
count,354
unique,354
top,ERR1072629
freq,1


In [7]:
all_flattened_runs = []

In [9]:


for index, run_id in df_only_runs["Runs"].items():
    try:
        query = {"run_id":run_id}  
        url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
        data = requests.post(url, data=json.dumps(query)).json()

        ## --get run List
        run = data.get("run")

        ## --get DataFrames
        species = DataFrame(data.get("species"))
        species = species.drop(columns = ['taxon_rank_level', 'relative_abundance']) #drop useless info

        #species = species[species['ncbi_taxon_id'] != -1]   #drop missing taxons
        flat = species.set_index('ncbi_taxon_id')['scientific_name'].T
        flat = flat.rename_axis(None).to_frame().T
        flat.insert(0, 'Run', run_id)
        all_flattened_runs.append(species)
    except Exception as e:
        print(f"Error processing run {run}: {e}")



Error processing run {'project_id': 'PRJEB11419', 'original_sample_description': 'American Gut Project Stool Sample which has Depression,Bipolar Disorder,Schizophrenia,Irritable Bowel Syndrome,Diarrhea', 'run_id': 'ERR1089710', 'experiment_type': 'Amplicon', 'instrument_model': 'Illumina MiSeq', 'nr_reads_sequenced': 37941, 'host_age': 63, 'sex': 'Female', 'BMI': 9.4, 'country': 'United States of America', 'longitude': -82.3, 'latitude': 36.3, 'loaded_uid': 38457, 'QCStatus': 1, 'QCMessage': '', 'Original_Project_description': 'The American Gut project is the largest crowdsourced citizen science project to date. Fecal, oral, skin, and other body site samples collected from thousands of participants represent the largest human microbiome cohort in existence. Detailed health and lifestyle and diet data associated with each sample is enabling us to deeply examine associations between the human microbiome and factors such as diet (from vegan to near carnivore and everything in between), se

In [10]:
final_df = pd.concat(all_flattened_runs, ignore_index=True)


In [11]:
final_df.head()

Unnamed: 0,ncbi_taxon_id,scientific_name
0,821,Bacteroides vulgatus
1,328813,Alistipes onderdonkii
2,823,Parabacteroides distasonis
3,817,Bacteroides fragilis
4,818,Bacteroides thetaiotaomicron


In [12]:
df = final_df.drop_duplicates(subset='ncbi_taxon_id', keep='first')

In [13]:
df.head()

Unnamed: 0,ncbi_taxon_id,scientific_name
0,821,Bacteroides vulgatus
1,328813,Alistipes onderdonkii
2,823,Parabacteroides distasonis
3,817,Bacteroides fragilis
4,818,Bacteroides thetaiotaomicron


In [20]:
from collections import Counter

def features_in_k_arrays(arrays, k):
    if not arrays:
        return []

    counter = Counter()
    
    for array in arrays:
        unique_strings = set(array)  
        counter.update(unique_strings)
    
    return [string for string, count in counter.items() if count >= k]

In [14]:
five_folds = ['1692', '1536', '66219', '1580', '78344', '831', '947013', '1495', '339862', '45076', '1304158', '392736', '255723'] 

four_folds =  ['36854', '40091', '1692', '1536', '66219', '1309', '1580', '47877', '232270', '78344', '40324', '831', '658457', '947013', '544645', '1670', '183', '1495', '863', '314319', '339862', '45076', '1304158', '395922', '392736', '363832', '255723', '36842'] 

three_folds = ['36854', '29346', '40091', '1692', '1536', '66219', '1309', '1580', '47877', '232270', '78344', '1776082', '40324', '831', '658457', '947013', '46506', '544645', '332163', '156974', '1670', '52769', '183', '1495', '863', '53462', '314319', '745368', '28108', '339862', '45076', '216940', '1304158', '371601', '395922', '392736', '68274', '363832', '255723', '28038', '1717', '2137', '736', '234908', '1724', '587', '46867', '1530', '36842', '148814', '1586267'] 



five_folds_2 = ['831', '339862', '376805', '66219', '947013', '78344', '40091', '36842', '36854', '392736', '1495', '1692', '1736'] 

four_folds_2 = ['85831', '831', '339862', '376805', '66219', '148814', '947013', '78344', '40091', '36842', '28134', '36854', '183', '392736', '1343', '1309', '1495', '255723', '1692', '1736', '1530', '1586267', '1304158', '1776082', '45076', '232270'] 

three_folds_2 = ['68274', '53462', '85831', '831', '339862', '47246', '376805', '1670', '66219', '148814', '947013', '78344', '2137', '40091', '36842', '28134', '36854', '183', '392736', '1343', '1309', '363832', '29346', '1495', '216940', '255723', '1692', '1736', '1530', '1453594', '1586267', '46867', '1304158', '1776082', '45076', '310298', '232270', '28133', '745368', '47877', '863'] 


In [17]:
filtered_df = df[df['ncbi_taxon_id'].isin(five_folds)]

print(filtered_df)

Empty DataFrame
Columns: [ncbi_taxon_id, scientific_name]
Index: []


In [27]:
filtered_df = df[df['ncbi_taxon_id'].isin(four_folds)]

print(filtered_df)

      ncbi_taxon_id                    scientific_name
90            36842             Clostridium halophilum
230          658457               Pseudomonas composti
241           40324       Stenotrophomonas maltophilia
379          339862         Desulfosporosinus youngiae
418           36854    Desulfitobacterium dehalogenans
908          232270             Effusibacillus pohliae
919            1495         Clostridium cylindrosporum
920          363832               Solirubrobacter soli
922             863              Syntrophomonas wolfei
952          314319       Prolixibacter bellariivorans
957          392736     Uliginosibacterium gangwonense
1046         395922       Coraliomargarita akajimensis
1180           1580               Lactobacillus brevis
1294          47877               Pseudomonas amygdali
1312          45076            Legionella worsleiensis
1494           1309               Streptococcus mutans
1879           1670               Arthrobacter citreus
2308      

In [29]:
filtered_df = df[df['ncbi_taxon_id'].isin(three_folds)]

print(filtered_df)

      ncbi_taxon_id                    scientific_name
33           371601          Bacteroides xylanisolvens
59            46506              Bacteroides stercoris
90            36842             Clostridium halophilum
205           28038             Lactobacillus curvatus
208             736   Haemophilus paraphrohaemolyticus
230          658457               Pseudomonas composti
241           40324       Stenotrophomonas maltophilia
347           46867               Clostridium chauvoei
379          339862         Desulfosporosinus youngiae
391           29346            [Clostridium] paradoxum
418           36854    Desulfitobacterium dehalogenans
589           28108              Alteromonas macleodii
906          148814              Lactobacillus kunkeei
908          232270             Effusibacillus pohliae
919            1495         Clostridium cylindrosporum
920          363832               Solirubrobacter soli
922             863              Syntrophomonas wolfei
952       

In [22]:
arrays = [five_folds, five_folds_2]
test = features_in_k_arrays(arrays, 2)
print(test)

['66219', '78344', '831', '1495', '392736', '947013', '339862', '1692']


In [24]:
arrays = [four_folds, four_folds_2]
test_2= features_in_k_arrays(arrays, 2)
print(test_2)

['36854', '1692', '831', '947013', '183', '339862', '1495', '45076', '36842', '40091', '1304158', '66219', '78344', '255723', '392736', '232270', '1309']
