In [10]:
import os
import json
import pandas as pd
import requests
from pandas.core.frame import DataFrame

import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline

In [11]:
df_pr1 = pd.read_csv('Data/all_runs_in_project_PRJEB11419.tsv', sep="	" , index_col=False)
df_pr1.describe()

Unnamed: 0,nr. reads sequenced,host age,BMI,longitude,lattitude,QC status
count,14461.0,14461.0,14461.0,14461.0,14461.0,14459.0
mean,32905.3,45.106839,43.835698,-56.134106,37.716693,0.540978
std,39187.68,18.456431,566.117004,64.918977,18.576356,0.498335
min,0.0,0.0,0.0,-166.5,-45.9,0.0
25%,16246.0,34.0,20.78,-110.7,34.1,0.0
50%,24507.0,47.0,23.4,-76.9,40.7,1.0
75%,35872.0,60.0,26.5,-1.0,50.9,1.0
max,1545243.0,96.0,42500.0,175.7,64.9,1.0


In [12]:
df_QC1 = df_pr1.loc[df_pr1['QC status'] == 1.0]
df_phen_depr = df_QC1.loc[df_QC1['associated phenotype'] == 'D012559'] #schizo
df_phen_depr.head()

Unnamed: 0,project ID,run ID,experiment type,instrument model,nr. reads sequenced,host age,sex,BMI,country,longitude,lattitude,associated phenotype,QC status
4,PRJEB11419,ERR1072629,Amplicon,Illumina MiSeq,23830,64,Male,17.67,United States of America,-70.3,41.6,D012559,1.0
43,PRJEB11419,ERR1072937,Amplicon,Illumina MiSeq,27504,53,Male,26.51,United States of America,-116.2,43.6,D012559,1.0
51,PRJEB11419,ERR1073023,Amplicon,Illumina MiSeq,24706,35,Female,34.72,United States of America,-93.3,45.0,D012559,1.0
84,PRJEB11419,ERR1073394,Amplicon,Illumina MiSeq,43579,27,Female,22.41,United States of America,-117.2,32.8,D012559,1.0
91,PRJEB11419,ERR1073395,Amplicon,Illumina MiSeq,45092,27,Female,22.41,United States of America,-117.2,32.8,D012559,1.0


In [13]:
df_runs = df_phen_depr['run ID']
df_runs.describe()

count            354
unique           354
top       ERR1072629
freq               1
Name: run ID, dtype: object

In [14]:
df_only_runs = pd.DataFrame({'Runs': df_runs.values})

In [15]:
df_only_runs.describe()

Unnamed: 0,Runs
count,354
unique,354
top,ERR1072629
freq,1


In [16]:
all_flattened_runs = []

In [17]:


for index, run_id in df_only_runs["Runs"].items():
    try:
        query = {"run_id":run_id}  
        url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
        data = requests.post(url, data=json.dumps(query)).json()

        ## --get run List
        run = data.get("run")

        ## --get DataFrames
        species = DataFrame(data.get("species"))
        species = species.drop(columns = ['taxon_rank_level', 'relative_abundance']) #drop useless info

        #species = species[species['ncbi_taxon_id'] != -1]   #drop missing taxons
        flat = species.set_index('ncbi_taxon_id')['scientific_name'].T
        flat = flat.rename_axis(None).to_frame().T
        flat.insert(0, 'Run', run_id)
        all_flattened_runs.append(species)
    except Exception as e:
        print(f"Error processing run {run}: {e}")



Error processing run {'project_id': 'PRJEB11419', 'original_sample_description': 'American Gut Project Stool Sample which has Attention Deficit Disorder with Hyperactivity,Depression,Bipolar Disorder,Schizophrenia', 'run_id': 'ERR1089866', 'experiment_type': 'Amplicon', 'instrument_model': 'Illumina MiSeq', 'nr_reads_sequenced': 25756, 'host_age': 0, 'sex': 'Male', 'BMI': 0, 'country': 'United States of America', 'longitude': 0, 'latitude': 0, 'loaded_uid': 38181, 'QCStatus': 1, 'QCMessage': '', 'Original_Project_description': 'The American Gut project is the largest crowdsourced citizen science project to date. Fecal, oral, skin, and other body site samples collected from thousands of participants represent the largest human microbiome cohort in existence. Detailed health and lifestyle and diet data associated with each sample is enabling us to deeply examine associations between the human microbiome and factors such as diet (from vegan to near carnivore and everything in between), se

In [18]:
final_df = pd.concat(all_flattened_runs, ignore_index=True)


In [19]:
final_df.head()

Unnamed: 0,ncbi_taxon_id,scientific_name
0,821,Bacteroides vulgatus
1,328813,Alistipes onderdonkii
2,823,Parabacteroides distasonis
3,817,Bacteroides fragilis
4,818,Bacteroides thetaiotaomicron


In [20]:
df = final_df.drop_duplicates(subset='ncbi_taxon_id', keep='first')

In [21]:
df.head()

Unnamed: 0,ncbi_taxon_id,scientific_name
0,821,Bacteroides vulgatus
1,328813,Alistipes onderdonkii
2,823,Parabacteroides distasonis
3,817,Bacteroides fragilis
4,818,Bacteroides thetaiotaomicron


In [22]:
from collections import Counter

def features_in_k_arrays(arrays, k):
    if not arrays:
        return []

    counter = Counter()
    
    for array in arrays:
        unique_strings = set(array)  
        counter.update(unique_strings)
    
    return [string for string, count in counter.items() if count >= k]

In [23]:
five_folds = ['376805', '392736', '1586267', '66219', '339862', '1495', '78344'] 


four_folds = ['40091', '376805', '632', '232270', '28133', '68274', '48466', '47246', '392736', '1586267', '66219', '28108', '36854', '339862', '1495', '78344', '1692', '183', '47880', '46867', '1453594'] 


three_folds = ['40091', '863', '376805', '658457', '632', '47884', '1580', '745368', '232270', '28133', '156974', '68274', '48466', '736', '47246', '392736', '1586267', '47877', '1536', '467976', '66219', '202611', '28108', '36854', '314319', '339862', '544645', '255723', '831', '29346', '1670', '947013', '1495', '999468', '78344', '1692', '183', '47880', '587', '40214', '46867', '1453594', '1613', '40324', '28038', '1304158', '2702', '36842', '930124', '384638', '106648'] 


In [24]:
five_folds_2 = ['183', '45076', '1692', '78344', '255723', '36854'] 


four_folds_2 = ['183', '53462', '1586267', '853', '45076', '68274', '392736', '831', '36842', '1670', '204038', '1692', '78344', '255723', '1536', '216940', '36854', '1495', '863', '376805', '339862'] 


three_folds_2 = ['183', '53462', '40324', '1586267', '584', '1304158', '853', '45076', '46206', '232270', '68274', '28038', '392736', '582', '47877', '831', '1453594', '36842', '1670', '1309', '52769', '204038', '196024', '1692', '78344', '255723', '1536', '216940', '1646377', '36854', '587', '1495', '863', '376805', '339862', '40091', '395922', '66219', '106648', '1343', '147802', '85831', '43997'] 


In [28]:
data = pd.read_csv('Data/Final_data/final_schizo_one_scaled_2')

data = data.iloc[:, 1:]
data.head()

Unnamed: 0,821,328813,823,817,818,239935,28116,351091,28118,1161942,...,78344,1692,68274,40091,1393,28133,183,29346,349096,Schizophrenia
0,0.224972,0.002162,0.001842,0.000801,0.002563,8e-05,0.038603,0.002723,0.002403,0.0,...,0.0,0.0,0.0,8e-05,0.0,0.0,0.0,0.0,8e-05,0
1,0.009963,8.2e-05,0.018865,8.2e-05,0.003022,0.005063,0.00637,0.018375,0.000408,0.034953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.13169,0.018601,9.3e-05,0.005562,0.013163,0.009362,0.215486,0.009795,0.002101,0.000185,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.1e-05,0.0,0
3,0.106448,0.003627,0.002877,0.000375,0.021202,6.3e-05,0.039402,0.060479,0.000751,0.000125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.175204,0.003135,0.010005,0.003919,0.018581,0.020932,0.045184,0.012679,0.004611,0.002674,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [29]:
filtered_df = df[df['ncbi_taxon_id'].isin(three_folds_2)]

print(filtered_df)

Empty DataFrame
Columns: [ncbi_taxon_id, scientific_name]
Index: []


In [30]:
arrays = [five_folds, five_folds_2]
test = features_in_k_arrays(arrays, 2)
print(test)

['78344']


In [31]:
arrays = [four_folds, four_folds_2]
test_2= features_in_k_arrays(arrays, 2)
print(test_2)

['1495', '78344', '339862', '183', '392736', '68274', '36854', '376805', '1586267', '1692']
