In [1]:
import os
import json
import pandas as pd
import requests
from pandas.core.frame import DataFrame

import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
df_pr1 = pd.read_csv('Data/all_runs_in_project_PRJEB11419.tsv', sep="	" , index_col=False)
df_pr1.describe()

Unnamed: 0,nr. reads sequenced,host age,BMI,longitude,lattitude,QC status
count,14461.0,14461.0,14461.0,14461.0,14461.0,14459.0
mean,32905.3,45.106839,43.835698,-56.134106,37.716693,0.540978
std,39187.68,18.456431,566.117004,64.918977,18.576356,0.498335
min,0.0,0.0,0.0,-166.5,-45.9,0.0
25%,16246.0,34.0,20.78,-110.7,34.1,0.0
50%,24507.0,47.0,23.4,-76.9,40.7,1.0
75%,35872.0,60.0,26.5,-1.0,50.9,1.0
max,1545243.0,96.0,42500.0,175.7,64.9,1.0


In [3]:
df_QC1 = df_pr1.loc[df_pr1['QC status'] == 1.0]
df_phen_depr = df_QC1.loc[df_QC1['associated phenotype'] == 'D001714'] #bipolar
df_phen_depr.head()

Unnamed: 0,project ID,run ID,experiment type,instrument model,nr. reads sequenced,host age,sex,BMI,country,longitude,lattitude,associated phenotype,QC status
0,PRJEB11419,ERR1072629,Amplicon,Illumina MiSeq,23830,64,Male,17.67,United States of America,-70.3,41.6,D001714,1.0
40,PRJEB11419,ERR1072937,Amplicon,Illumina MiSeq,27504,53,Male,26.51,United States of America,-116.2,43.6,D001714,1.0
49,PRJEB11419,ERR1073023,Amplicon,Illumina MiSeq,24706,35,Female,34.72,United States of America,-93.3,45.0,D001714,1.0
80,PRJEB11419,ERR1073394,Amplicon,Illumina MiSeq,43579,27,Female,22.41,United States of America,-117.2,32.8,D001714,1.0
87,PRJEB11419,ERR1073395,Amplicon,Illumina MiSeq,45092,27,Female,22.41,United States of America,-117.2,32.8,D001714,1.0


In [4]:
df_runs = df_phen_depr['run ID']
df_runs.describe()

count            354
unique           354
top       ERR1072629
freq               1
Name: run ID, dtype: object

In [5]:
df_only_runs = pd.DataFrame({'Runs': df_runs.values})

In [6]:
df_only_runs.describe()

Unnamed: 0,Runs
count,354
unique,354
top,ERR1072629
freq,1


In [7]:
all_flattened_runs = []

In [8]:


for index, run_id in df_only_runs["Runs"].items():
    try:
        query = {"run_id":run_id}  
        url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
        data = requests.post(url, data=json.dumps(query)).json()

        ## --get run List
        run = data.get("run")

        ## --get DataFrames
        species = DataFrame(data.get("species"))
        species = species.drop(columns = ['taxon_rank_level', 'relative_abundance']) #drop useless info

        #species = species[species['ncbi_taxon_id'] != -1]   #drop missing taxons
        flat = species.set_index('ncbi_taxon_id')['scientific_name'].T
        flat = flat.rename_axis(None).to_frame().T
        flat.insert(0, 'Run', run_id)
        all_flattened_runs.append(species)
    except Exception as e:
        print(f"Error processing run {run}: {e}")



In [9]:
df_pr1 = pd.read_csv('Data/all_runs_in_project_PRJNA485797.tsv', sep="	" , index_col=False)
df_pr1.describe()

Unnamed: 0,nr. reads sequenced,host age,BMI,longitude,lattitude,QC status
count,179.0,179.0,179.0,0.0,0.0,179.0
mean,27708.558659,49.653631,28.125922,,,0.765363
std,7670.763119,14.251853,6.561102,,,0.42496
min,5333.0,23.0,15.95,,,0.0
25%,23613.0,36.5,23.115,,,1.0
50%,28568.0,53.0,26.58,,,1.0
75%,32235.0,60.5,31.695,,,1.0
max,47652.0,87.0,54.08,,,1.0


In [10]:
df_QC1 = df_pr1.loc[df_pr1['QC status'] == 1.0]
df_phen_depr = df_QC1.loc[df_QC1['associated phenotype'] == 'D001714'] #bipolar
df_phen_depr.head()

Unnamed: 0,project ID,run ID,experiment type,instrument model,nr. reads sequenced,host age,sex,BMI,country,longitude,lattitude,associated phenotype,QC status
1,PRJNA485797,SRR7690036,Amplicon,Illumina MiSeq,33248,53,Female,24.96,United States of America,,,D001714,1
4,PRJNA485797,SRR7690039,Amplicon,Illumina MiSeq,29605,41,Female,29.83,United States of America,,,D001714,1
5,PRJNA485797,SRR7690040,Amplicon,Illumina MiSeq,22968,58,Female,38.89,United States of America,,,D001714,1
8,PRJNA485797,SRR7690043,Amplicon,Illumina MiSeq,28728,61,Female,20.67,United States of America,,,D001714,1
9,PRJNA485797,SRR7690044,Amplicon,Illumina MiSeq,36228,24,Female,15.95,United States of America,,,D001714,1


In [11]:
df_runs = df_phen_depr['run ID']
df_runs.describe()

count             91
unique            91
top       SRR7690036
freq               1
Name: run ID, dtype: object

In [12]:
df_only_runs = pd.DataFrame({'Runs': df_runs.values})

In [13]:

for index, run_id in df_only_runs["Runs"].items():
    try:
        query = {"run_id":run_id}  
        url = 'https://gmrepo.humangut.info/api/getFullTaxonomicProfileByRunID'
        data = requests.post(url, data=json.dumps(query)).json()

        ## --get run List
        run = data.get("run")

        ## --get DataFrames
        species = DataFrame(data.get("species"))
        species = species.drop(columns = ['taxon_rank_level', 'relative_abundance']) #drop useless info

        #species = species[species['ncbi_taxon_id'] != -1]   #drop missing taxons
        flat = species.set_index('ncbi_taxon_id')['scientific_name'].T
        flat = flat.rename_axis(None).to_frame().T
        flat.insert(0, 'Run', run_id)
        all_flattened_runs.append(species)
    except Exception as e:
        print(f"Error processing run {run}: {e}")


In [14]:
final_df = pd.concat(all_flattened_runs, ignore_index=True)


In [15]:
final_df.head()

Unnamed: 0,ncbi_taxon_id,scientific_name
0,821,Bacteroides vulgatus
1,328813,Alistipes onderdonkii
2,823,Parabacteroides distasonis
3,817,Bacteroides fragilis
4,818,Bacteroides thetaiotaomicron


In [16]:
df = final_df.drop_duplicates(subset='ncbi_taxon_id', keep='first')

In [17]:
df.head()

Unnamed: 0,ncbi_taxon_id,scientific_name
0,821,Bacteroides vulgatus
1,328813,Alistipes onderdonkii
2,823,Parabacteroides distasonis
3,817,Bacteroides fragilis
4,818,Bacteroides thetaiotaomicron


In [18]:
from collections import Counter

def features_in_k_arrays(arrays, k):
    if not arrays:
        return []

    counter = Counter()
    
    for array in arrays:
        unique_strings = set(array)  
        counter.update(unique_strings)
    
    return [string for string, count in counter.items() if count >= k]

In [19]:
five_folds = ['29346', '232270', '634771', '255723', '392736', '1495', '736', '99807', '638619', '395922', '118967', '66219', '415956', '53972', '46867'] 


four_folds = ['29346', '626947', '232270', '634771', '255723', '1350067', '1655', '392736', '592978', '376805', '1495', '736', '58180', '589437', '339862', '99807', '638619', '36850', '395922', '40091', '118967', '66219', '415956', '53972', '46867', '1244', '156974', '78346', '1501', '831', '1692', '199', '1393'] 


three_folds = ['29346', '626947', '232270', '634771', '255723', '1350067', '1655', '392736', '592978', '305719', '376805', '1495', '33033', '736', '52769', '58180', '589437', '339862', '148814', '99807', '183', '332163', '638619', '36850', '395922', '40091', '43997', '118967', '66219', '415956', '53972', '1071880', '46867', '1536', '1244', '36849', '745368', '156974', '89014', '78346', '1501', '831', '1254', '1692', '199', '1580', '1393', '97084'] 


In [20]:
five_folds_2 = ['183', '415956', '395922', '58180', '736', '29346', '634771', '339862', '314319', '745368', '392736', '36850', '53972', '831', '1350067', '255723', '86958', '1495', '40091', '66219', '99807', '232270', '156974', '1692', '36854', '376805'] 


four_folds_2 = ['183', '415956', '349096', '305719', '395922', '58180', '736', '29346', '1531', '634771', '339862', '314319', '745368', '392736', '36850', '53972', '831', '1350067', '255723', '86958', '1495', '40091', '66219', '99807', '232270', '156974', '1692', '36854', '376805', '1530', '85831', '89014', '1501', '118967'] 


three_folds_2 = ['183', '415956', '349096', '97084', '305719', '395922', '58180', '736', '29346', '89152', '1531', '634771', '339862', '314319', '1536', '745368', '392736', '36850', '53972', '831', '39488', '1350067', '255723', '86958', '1495', '40091', '66219', '99807', '232270', '371601', '156974', '1692', '36854', '376805', '1655', '1530', '85831', '199', '78346', '1596', '89014', '1393', '1501', '853', '118967', '216940', '332163'] 


In [24]:

data = pd.read_csv('Data/Final_data/final_bipolar_one_scaled_2')

data = data.iloc[:, 1:]
data.head()

Unnamed: 0,821,328813,823,817,818,239935,28116,351091,28118,1161942,...,103621,1596,638619,1244,78346,592978,86958,626947,bipolar,Bipolar
0,0.148812,0.001194,0.003473,0.006187,0.015304,0.005753,0.04244,0.054054,0.009226,0.000217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.097845,0.008707,0.013113,0.0,0.004619,0.01152,0.012423,0.056169,0.001593,0.002495,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.013404,0.009929,0.006619,0.000414,0.020602,0.001158,0.00331,0.012328,0.003392,0.000331,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.082357,0.00327,0.003331,0.0,0.017273,0.005367,0.010117,0.005614,0.003763,0.000123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.009274,8.4e-05,0.002951,0.011635,0.01037,0.272152,0.063148,0.001349,0.000506,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:
filtered_df = df[df['ncbi_taxon_id'].isin(three_folds_2)]

print(filtered_df)

Empty DataFrame
Columns: [ncbi_taxon_id, scientific_name]
Index: []


In [26]:
arrays = [five_folds, five_folds_2]
test = features_in_k_arrays(arrays, 2)
print(test)

['634771', '415956', '1495', '255723', '392736', '29346', '99807', '395922', '66219', '736', '53972', '232270']


In [27]:
arrays = [four_folds, four_folds_2]
test_2= features_in_k_arrays(arrays, 2)
print(test_2)

['1350067', '395922', '118967', '1692', '634771', '376805', '99807', '53972', '392736', '415956', '36850', '255723', '1501', '29346', '40091', '66219', '232270', '156974', '831', '58180', '736', '1495', '339862']
