# BioMedGraphica Microbiota

## 1. Data Access  
### Direct Download Links  
**NCBI Taxonomy**: Can be downloaded directly via the link without the need for registration. [Link](https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip)  
**SILVA**: Can be downloaded directly via the link without the need for registration. [Link1](https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/ncbi/taxmap_embl-ebi_ena_lsu_ref_138.2.txt.gz); [Link2](https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/ncbi/taxmap_embl-ebi_ena_ssu_ref_138.2.txt.gz)  
**greengenes**: Can be downloaded directly via the link without the need for registration. [Link](https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/database_mappings/greengenes.tsv)  
**RDP**: Can be downloaded directly via the link without the need for registration. [Link](https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/database_mappings/rdp.tsv)  
**GTDB**: Can be downloaded directly via the link without the need for registration. [Link1](https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz); [Link2](https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz)

## 2. Load Data

### 2.1 NCBI Taxonomy Database

In [4]:
import pandas as pd

ncbi_nodes = pd.read_csv('ncbi_taxonomy/nodes.dmp', sep=r'\t\|\t', header=None, engine='python', dtype=str)

# Replace any instances of '\t|' with an empty string
ncbi_nodes = ncbi_nodes.replace(r'\t\|', '', regex=True)

# Strip any leading or trailing whitespace from string columns
ncbi_nodes = ncbi_nodes.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Assign column names based on the structure of nodes.dmp
ncbi_nodes.columns = [
    'tax_id', 'parent_tax_id', 'rank', 'embl_code', 'division_id',
    'inherited_div_flag', 'genetic_code_id', 'inherited_gc_flag',
    'mitochondrial_genetic_code_id', 'inherited_mgc_flag', 'genbank_hidden_flag',
    'hidden_subtree_root_flag', 'comments'
]

# Display the resulting DataFrame
ncbi_nodes

Unnamed: 0,tax_id,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_gc_flag,mitochondrial_genetic_code_id,inherited_mgc_flag,genbank_hidden_flag,hidden_subtree_root_flag,comments
0,1,1,no rank,,8,0,1,0,0,0,0,0,
1,2,131567,superkingdom,,0,0,11,0,0,0,0,0,
2,6,335928,genus,,0,1,11,1,0,1,0,0,code compliant
3,7,6,species,AC,0,1,11,1,0,1,1,0,code compliant; specified
4,9,32199,species,BA,0,1,11,1,0,1,1,0,code compliant; specified
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631454,3395349,177862,species,HF,1,1,1,1,5,1,0,0,code compliant; specified
2631455,3395364,1762,no rank,,0,1,11,1,0,1,0,0,
2631456,3395429,2627606,species,MS,4,1,1,1,4,1,0,0,
2631457,3395432,2636820,species,PS,4,1,1,1,4,1,0,0,


Filter Bacteria

In [3]:
ncbi_nodes['division_id'] = ncbi_nodes['division_id'].astype(str)
ncbi_nodes_filter = ncbi_nodes[(ncbi_nodes['division_id'] == '0')]
ncbi_nodes_filter

Unnamed: 0,tax_id,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_gc_flag,mitochondrial_genetic_code_id,inherited_mgc_flag,genbank_hidden_flag,hidden_subtree_root_flag,comments
1,2,131567,superkingdom,,0,0,11,0,0,0,0,0,
2,6,335928,genus,,0,1,11,1,0,1,0,0,code compliant
3,7,6,species,AC,0,1,11,1,0,1,1,0,code compliant; specified
4,9,32199,species,BA,0,1,11,1,0,1,1,0,code compliant; specified
5,10,1706371,genus,,0,1,11,1,0,1,0,0,code compliant
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631366,3390202,338,species group,,0,1,11,1,0,1,0,0,
2631367,3390273,570,species group,,0,1,11,1,0,1,0,0,
2631421,3393498,670516,species,MR,0,1,11,1,0,1,0,0,specified
2631424,3393728,216572,genus,,0,1,11,1,0,1,0,0,code compliant


In [4]:
ncbi_taxonomy = pd.read_csv('ncbi_taxonomy/names.dmp', sep=r'\t\|\t', header=None, dtype=str)
ncbi_taxonomy = ncbi_taxonomy.replace(r'\t\|', '', regex=True)
ncbi_taxonomy = ncbi_taxonomy.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
ncbi_taxonomy.columns = ['tax_id', 'name_txt', 'unique_name', 'name_class']
ncbi_taxonomy

  ncbi_taxonomy = pd.read_csv('ncbi_taxonomy/names.dmp', sep=r'\t\|\t', header=None, dtype=str)


Unnamed: 0,tax_id,name_txt,unique_name,name_class
0,1,all,,synonym
1,1,root,,scientific name
2,2,Bacteria,Bacteria <bacteria>,scientific name
3,2,bacteria,,blast name
4,2,"""Bacteria"" Cavalier-Smith 1987",,authority
...,...,...,...,...
4296830,3395432,Pholiotina sp. TAC1682a,,synonym
4296831,3395572,"Pholcus ceylonicus Pickard-Cambridge, 1869",,authority
4296832,3395572,Pholcus ceylonicus,,scientific name
4296833,3395572,"Sihala ceylonica (Pickard-Cambridge, 1869)",,authority


In [5]:
ncbi_taxonomy_filter = ncbi_taxonomy[ncbi_taxonomy['name_class'] == 'scientific name']

ncbi_taxonomy_final = ncbi_taxonomy_filter[['tax_id', 'name_txt']]
ncbi_taxonomy_final = ncbi_taxonomy_final[ncbi_taxonomy_final['tax_id'].isin(ncbi_nodes_filter['tax_id'])]
ncbi_taxonomy_final.drop_duplicates(inplace=True)
ncbi_taxonomy_final.reset_index(drop=True, inplace=True)
ncbi_taxonomy_final

Unnamed: 0,tax_id,name_txt
0,2,Bacteria
1,6,Azorhizobium
2,7,Azorhizobium caulinodans
3,9,Buchnera aphidicola
4,10,Cellvibrio
...,...,...
538189,3390202,Xanthomonas translucens group
538190,3390273,Klebsiella pneumoniae complex
538191,3393498,[Mycobacterium] runyonii
538192,3393728,Owariibacterium


### 2.2 SILVA

lsu id

In [6]:
silva_lsu = pd.read_csv('SILVA/taxmap_embl-ebi_ena_lsu_ref_138.2.txt', sep='\t')
silva_lsu_final = silva_lsu[['primaryAccession', 'ncbi_taxonid']]
silva_lsu_final.drop_duplicates(inplace=True)
silva_lsu_final.reset_index(drop=True, inplace=True)
silva_lsu_final.rename(columns={'primaryAccession': 'silva_id'}, inplace=True)
silva_lsu_final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  silva_lsu_final.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  silva_lsu_final.rename(columns={'primaryAccession': 'silva_id'}, inplace=True)


Unnamed: 0,silva_id,ncbi_taxonid
0,AY835431,160070
1,MF351708,199
2,AB003380,833
3,AC016828,3702
4,AB000109,44689
...,...,...
157868,DQ682621,622
157869,DQ682622,623
157870,DQ682623,624
157871,DQ813266,28110


ssu id

In [7]:
silva_ssu = pd.read_csv('SILVA/taxmap_embl-ebi_ena_ssu_ref_138.2.txt', sep='\t')
silva_ssu_final = silva_ssu[['primaryAccession', 'ncbi_taxonid']]
silva_ssu_final.rename(columns={'primaryAccession': 'silva_id'}, inplace=True)
silva_ssu_final.drop_duplicates(inplace=True)
silva_ssu_final.reset_index(drop=True, inplace=True)
silva_ssu_final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  silva_ssu_final.rename(columns={'primaryAccession': 'silva_id'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  silva_ssu_final.drop_duplicates(inplace=True)


Unnamed: 0,silva_id,ncbi_taxonid
0,AX664486,7227
1,BD307583,37089
2,BD359735,5858
3,BD359736,5858
4,CS214259,10116
...,...,...
2152597,CRSC01000143,1313
2152598,CRVD01000010,1313
2152599,CRLF01000008,1313
2152600,LT558790,1130089


SILVA - Final Version

In [8]:
silva_final = pd.concat([silva_lsu_final, silva_ssu_final], ignore_index=True)
silva_final['ncbi_taxonid'] = silva_final['ncbi_taxonid'].astype(str).str.replace(r'\D', '', regex=True)
silva_final.drop_duplicates(inplace=True)
silva_final.reset_index(drop=True, inplace=True)
silva_final

Unnamed: 0,silva_id,ncbi_taxonid
0,AY835431,160070
1,MF351708,199
2,AB003380,833
3,AC016828,3702
4,AB000109,44689
...,...,...
2214222,CRDG01000003,1313
2214223,CRBB01000003,1313
2214224,CRSC01000007,1313
2214225,LT558790,1130089


In [9]:
silva_unique = silva_final.groupby('ncbi_taxonid').agg({'silva_id': lambda x: ';'.join(x.unique())}).reset_index()
silva_unique

Unnamed: 0,ncbi_taxonid,silva_id
0,100,FJ572205;FJ572206;FJ572207;FJ572208;FJ572209;A...
1,100000,AF164051
2,100001,AF164052
3,100002,AF164053
4,1000028,HQ391902
...,...,...
272414,999944,HQ324121
272415,999947,HM593015;KC205973
272416,99998,AF164049
272417,999981,HQ330529


### 2.3 Greengenes

In [10]:
greengenes = pd.read_csv('greengenes.tsv', sep='\t', header=None)
greengenes.columns = ['RNAcentral_id', 'database', 'greengenes_id', 'ncbi_taxonid', 'rna type', 'gene name']
greengenes = greengenes[['RNAcentral_id', 'greengenes_id', 'ncbi_taxonid']]
greengenes['greengenes_id'] = greengenes['greengenes_id'].astype(str).str.replace(r'\D', '', regex=True)
greengenes['ncbi_taxonid'] = greengenes['ncbi_taxonid'].astype(str).str.replace(r'\D', '', regex=True)
greengenes.drop_duplicates(inplace=True)
greengenes.reset_index(drop=True, inplace=True)
greengenes

Unnamed: 0,RNAcentral_id,greengenes_id,ncbi_taxonid
0,URS0000000010,2264288,77133
1,URS0000000018,486950,77133
2,URS000000001A,3431066,77133
3,URS000000001E,4439119,155900
4,URS000000001F,253572,77133
...,...,...,...
1144861,URS00008B1FA0,133117,266892
1144862,URS00008B1FA1,192292,77133
1144863,URS00008B1FA2,1108040,77133
1144864,URS00008B1FA3,881490,77133


In [11]:
greengenes_unique = greengenes.groupby('ncbi_taxonid').agg({'greengenes_id': lambda x: ';'.join(x), 'RNAcentral_id': lambda x: ';'.join(x)}).reset_index()
greengenes_unique

Unnamed: 0,ncbi_taxonid,greengenes_id,RNAcentral_id
0,100,4419442;758357;699569;4650;636058;631559;541133,URS000001D3A2;URS0000056995;URS0000071FD2;URS0...
1,100000,7308,URS00004A623C
2,100001,7309,URS00008A7B66
3,100002,7310,URS00002EBCF8
4,1000028,1113734,URS0000887D51
...,...,...,...
92679,999931,1119472;1106878;1114736,URS00000A590F;URS0000572C1F;URS00005890BB
92680,999944,1105486,URS00008A2FE1
92681,99998,7306,URS00000BA22F
92682,999981,1109361,URS000086E3B5


### 2.4 RDP

In [12]:
rdp = pd.read_csv('rdp.tsv', sep='\t', header=None)
rdp.columns = ['RNAcentral_id', 'database', 'rdp_id', 'ncbi_taxonid', 'rna type', 'gene name']
rdp = rdp[['RNAcentral_id', 'rdp_id', 'ncbi_taxonid']]
rdp['ncbi_taxonid'] = rdp['ncbi_taxonid'].astype(str).str.replace(r'\D', '', regex=True)
rdp.drop_duplicates(inplace=True)
rdp.reset_index(drop=True, inplace=True)
rdp

Unnamed: 0,RNAcentral_id,rdp_id,ncbi_taxonid
0,URS00000048FD,S001044215,449447
1,URS000000725B,S002033003,661367
2,URS000000725B,S002033005,661367
3,URS0000018D70,S001587739,573235
4,URS000001A33E,S001548747,484021
...,...,...,...
10297,URS0000789A84,S004064917,1407462
10298,URS0000789A84,S004064918,1407462
10299,URS0000789A84,S004066376,1407463
10300,URS0000789A84,S004066377,1407463


In [13]:
rdp_unique = rdp.groupby('ncbi_taxonid').agg({'rdp_id': lambda x: ';'.join(x), 'RNAcentral_id': lambda x: ';'.join(x)}).reset_index()
rdp_unique

Unnamed: 0,ncbi_taxonid,rdp_id,RNAcentral_id
0,1001534,S004066353;S004066354;S004066355,URS00006E34BD;URS00006E34BD;URS00006E34BD
1,1001542,S004068056;S004068057;S004068058,URS00006E34BD;URS00006E34BD;URS00006E34BD
2,1001582,S004068875;S004068878;S004068877;S004068874;S0...,URS000066A179;URS000066A179;URS000066F0ED;URS0...
3,1001583,S004063727;S004063729;S004063726;S004063725;S0...,URS000067EE59;URS0000789585;URS00007896BA;URS0...
4,1001585,S004064943;S004064944;S004064945;S004064946,URS00006D19DF;URS000070D2C2;URS000070D2C2;URS0...
...,...,...,...
2482,998820,S004066770;S004066767;S004066768;S004066769;S0...,URS00006A3688;URS00006EAC43;URS00006EAC43;URS0...
2483,999378,S004063915;S004063912;S004063913;S004063914;S0...,URS00006A3688;URS00006EAC43;URS00006EAC43;URS0...
2484,999541,S004063592;S004063593;S004064289;S004064290;S0...,URS0000642C4C;URS0000642C4C;URS0000642C4C;URS0...
2485,999552,S004063160;S004063161;S004063162;S004063163;S0...,URS00007895EC;URS00007895EC;URS00007895EC;URS0...


### 2.5 GTDB

Archaea

In [14]:
gtdb_ar = pd.read_csv('GTDB/ar53_metadata_r220.tsv', sep='\t')
gtdb_ar = gtdb_ar[['accession', 'ncbi_taxid']]
gtdb_ar.drop_duplicates(inplace=True)
gtdb_ar.reset_index(drop=True, inplace=True)
gtdb_ar.rename(columns={'accession': 'gtdb_id'}, inplace=True)
gtdb_ar

Unnamed: 0,gtdb_id,ncbi_taxid
0,RS_GCF_000485535.1,795797
1,GB_GCA_030638685.1,2026747
2,GB_GCA_003163595.1,2026739
3,GB_GCA_002782805.1,1974380
4,GB_GCA_939800415.1,2563819
...,...,...
12472,GB_GCA_009889605.1,2026714
12473,GB_GCA_938003605.1,437136
12474,GB_GCA_030699205.1,2026773
12475,RS_GCF_001729385.1,1860099


Bacteria

In [15]:
gtdb_bar = pd.read_csv('GTDB/bac120_metadata_r220.tsv', sep='\t')
gtdb_bar = gtdb_bar[['accession', 'ncbi_taxid']]
gtdb_bar.drop_duplicates(inplace=True)
gtdb_bar.reset_index(drop=True, inplace=True)
gtdb_bar.rename(columns={'accession': 'gtdb_id'}, inplace=True)
gtdb_bar

Unnamed: 0,gtdb_id,ncbi_taxid
0,RS_GCF_000657795.2,1331258
1,RS_GCF_001072555.1,1282
2,RS_GCF_003050715.1,2135698
3,RS_GCF_016772635.1,90371
4,GB_GCA_000615405.1,1236944
...,...,...
584377,GB_GCA_949039885.1,297314
584378,GB_GCA_905234525.1,297314
584379,GB_GCA_948663365.1,2301481
584380,GB_GCA_948940555.1,2301481


GTDB - Final Version

In [16]:
grdb_final = pd.concat([gtdb_ar, gtdb_bar], ignore_index=True)
grdb_final['ncbi_taxid'] = grdb_final['ncbi_taxid'].astype(str).str.replace(r'\D', '', regex=True)
grdb_final.drop_duplicates(inplace=True)
grdb_final.reset_index(drop=True, inplace=True)

gtdb_unique = grdb_final.groupby('ncbi_taxid').agg({'gtdb_id': lambda x: ';'.join(x)}).reset_index()
gtdb_unique

Unnamed: 0,ncbi_taxid,gtdb_id
0,100,GB_GCA_003963445.1;RS_GCF_004339465.1
1,100053,RS_GCF_002009845.1;RS_GCF_002009775.1;RS_GCF_0...
2,1000561,RS_GCF_000220025.3
3,1000562,RS_GCF_000772915.1
4,1000565,RS_GCF_000214035.2
...,...,...
92439,999891,RS_GCF_000195515.1
92440,999892,RS_GCF_000204235.1
92441,999894,GB_GCA_011322625.1;RS_GCF_001652585.1
92442,999898,GB_GCA_001029295.1


## 3. Merge Data

In [None]:
# check duplicates inside the dataframe
def merge_column(df, column1, column2, new_column):
    df[column1] = df[column1].fillna('')
    df[column2] = df[column2].fillna('')
    df[new_column] = df.apply(lambda row: f"{row[column1]} {row[column2]}".strip(), axis=1)

    expanded_rows = df[new_column].str.split(expand=True).stack().reset_index(level=1, drop=True)
    expanded_rows.name = new_column

    df = df.drop(columns=[new_column]).join(expanded_rows)
    df.drop(columns=[column1, column2], inplace=True)
    df.drop_duplicates(inplace=True)
    
    return df

def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df
    

### 3.1 NCBI + SILVA

In [18]:
ncbi_taxonomy_final = ncbi_taxonomy_final.rename(columns={'tax_id': 'ncbi_taxonid', 'name_txt': 'name'})

ncbi_silva = pd.merge(ncbi_taxonomy_final, silva_unique, on='ncbi_taxonid', how='outer')
ncbi_silva

Unnamed: 0,ncbi_taxonid,name,silva_id
0,10,Cellvibrio,
1,100,Ancylobacter aquaticus,FJ572205;FJ572206;FJ572207;FJ572208;FJ572209;A...
2,100000,Herbaspirillum sp. BA12,AF164051
3,1000000,Microbacterium sp. 6.11-VPa,
4,1000001,Mycobacterium sp. 1.1-VEs,
...,...,...,...
620523,999995,Arthrobacter sp. 8.25ST-VSa,
620524,999996,Arthrobacter sp. 9.22-BP,
620525,999997,Arthrobacter sp. 9.29ST-BP,
620526,999998,Clavibacter sp. 10.27ST-Bb,


### 3.2 Add Greengene

In [19]:
ncbi_silva_gg = pd.merge(ncbi_silva, greengenes_unique, on='ncbi_taxonid', how='outer')
ncbi_silva_gg

Unnamed: 0,ncbi_taxonid,name,silva_id,greengenes_id,RNAcentral_id
0,10,Cellvibrio,,,
1,100,Ancylobacter aquaticus,FJ572205;FJ572206;FJ572207;FJ572208;FJ572209;A...,4419442;758357;699569;4650;636058;631559;541133,URS000001D3A2;URS0000056995;URS0000071FD2;URS0...
2,100000,Herbaspirillum sp. BA12,AF164051,7308,URS00004A623C
3,1000000,Microbacterium sp. 6.11-VPa,,,
4,1000001,Mycobacterium sp. 1.1-VEs,,,
...,...,...,...,...,...
621080,999995,Arthrobacter sp. 8.25ST-VSa,,,
621081,999996,Arthrobacter sp. 9.22-BP,,,
621082,999997,Arthrobacter sp. 9.29ST-BP,,,
621083,999998,Clavibacter sp. 10.27ST-Bb,,,


### 3.3 Add RDP

In [20]:
ncbi_silva_gg_rdp = pd.merge(ncbi_silva_gg, rdp_unique, on='ncbi_taxonid', how='outer')
ncbi_silva_gg_rdp = merge_string_columns(ncbi_silva_gg_rdp, ['RNAcentral_id_x', 'RNAcentral_id_y'], 'RNAcentral_id')
ncbi_silva_gg_rdp = ncbi_silva_gg_rdp.replace('', pd.NA)
ncbi_silva_gg_rdp

Unnamed: 0,ncbi_taxonid,name,silva_id,greengenes_id,rdp_id,RNAcentral_id
0,10,Cellvibrio,,,,
1,100,Ancylobacter aquaticus,FJ572205;FJ572206;FJ572207;FJ572208;FJ572209;A...,4419442;758357;699569;4650;636058;631559;541133,,URS000001D3A2;URS0000056995;URS0000071FD2;URS0...
2,100000,Herbaspirillum sp. BA12,AF164051,7308,,URS00004A623C
3,1000000,Microbacterium sp. 6.11-VPa,,,,
4,1000001,Mycobacterium sp. 1.1-VEs,,,,
...,...,...,...,...,...,...
621086,999995,Arthrobacter sp. 8.25ST-VSa,,,,
621087,999996,Arthrobacter sp. 9.22-BP,,,,
621088,999997,Arthrobacter sp. 9.29ST-BP,,,,
621089,999998,Clavibacter sp. 10.27ST-Bb,,,,


### 3.4 Add GTDB

In [21]:
gtdb_unique = gtdb_unique.rename(columns={'ncbi_taxid': 'ncbi_taxonid'})
ncbi_silva_gg_rdp_grdb = pd.merge(ncbi_silva_gg_rdp, gtdb_unique, on='ncbi_taxonid', how='outer')
ncbi_silva_gg_rdp_grdb

Unnamed: 0,ncbi_taxonid,name,silva_id,greengenes_id,rdp_id,RNAcentral_id,gtdb_id
0,10,Cellvibrio,,,,,
1,100,Ancylobacter aquaticus,FJ572205;FJ572206;FJ572207;FJ572208;FJ572209;A...,4419442;758357;699569;4650;636058;631559;541133,,URS000001D3A2;URS0000056995;URS0000071FD2;URS0...,GB_GCA_003963445.1;RS_GCF_004339465.1
2,100000,Herbaspirillum sp. BA12,AF164051,7308,,URS00004A623C,
3,1000000,Microbacterium sp. 6.11-VPa,,,,,
4,1000001,Mycobacterium sp. 1.1-VEs,,,,,
...,...,...,...,...,...,...,...
621877,999995,Arthrobacter sp. 8.25ST-VSa,,,,,
621878,999996,Arthrobacter sp. 9.22-BP,,,,,
621879,999997,Arthrobacter sp. 9.29ST-BP,,,,,
621880,999998,Clavibacter sp. 10.27ST-Bb,,,,,


### 3.5 Fill the Name

In [22]:
ncbi_taxonomy_name = ncbi_taxonomy_filter[['tax_id', 'name_txt']]
ncbi_silva_gg_rdp_grdb_final = pd.merge(ncbi_silva_gg_rdp_grdb, ncbi_taxonomy_name, left_on='ncbi_taxonid', right_on='tax_id', how='left')
ncbi_silva_gg_rdp_grdb_final = merge_string_columns(ncbi_silva_gg_rdp_grdb_final, ['name_txt', 'name'], 'NCBI_Taxonomy_Name')
ncbi_silva_gg_rdp_grdb_final = merge_column(ncbi_silva_gg_rdp_grdb_final, 'ncbi_taxonid', 'tax_id', 'NCBI_Taxonomy_ID')
ncbi_silva_gg_rdp_grdb_final

Unnamed: 0,silva_id,greengenes_id,rdp_id,RNAcentral_id,gtdb_id,NCBI_Taxonomy_Name,NCBI_Taxonomy_ID
0,,,,,,Cellvibrio,10
1,FJ572205;FJ572206;FJ572207;FJ572208;FJ572209;A...,4419442;758357;699569;4650;636058;631559;541133,,URS000001D3A2;URS0000056995;URS0000071FD2;URS0...,GB_GCA_003963445.1;RS_GCF_004339465.1,Ancylobacter aquaticus,100
2,AF164051,7308,,URS00004A623C,,Herbaspirillum sp. BA12,100000
3,,,,,,Microbacterium sp. 6.11-VPa,1000000
4,,,,,,Mycobacterium sp. 1.1-VEs,1000001
...,...,...,...,...,...,...,...
621877,,,,,,Arthrobacter sp. 8.25ST-VSa,999995
621878,,,,,,Arthrobacter sp. 9.22-BP,999996
621879,,,,,,Arthrobacter sp. 9.29ST-BP,999997
621880,,,,,,Clavibacter sp. 10.27ST-Bb,999998


In [23]:
ncbi_silva_gg_rdp_grdb_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 621882 entries, 0 to 621881
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   silva_id            272419 non-null  object
 1   greengenes_id       92684 non-null   object
 2   rdp_id              2487 non-null    object
 3   RNAcentral_id       93118 non-null   object
 4   gtdb_id             92444 non-null   object
 5   NCBI_Taxonomy_Name  621882 non-null  object
 6   NCBI_Taxonomy_ID    621882 non-null  object
dtypes: object(7)
memory usage: 38.0+ MB


## 4. BioMedgraphica ID

In [26]:
biomedgraphica_microbiota = ncbi_silva_gg_rdp_grdb_final.copy()
biomedgraphica_microbiota = biomedgraphica_microbiota.sort_values(by=['NCBI_Taxonomy_ID'], na_position='last').reset_index(drop=True)

max_length = len(str(len(biomedgraphica_microbiota)))
biomedgraphica_microbiota['BioMedGraphica_ID'] = ['BMG_MC' + str(i).zfill(max_length) for i in range(1, len(biomedgraphica_microbiota) + 1)]
biomedgraphica_microbiota.rename(columns={'RNAcentral_id': 'RNAcentral_ID', 'silva_id': 'SILVA_ID',
                                       'greengenes_id': 'Greengenes_ID', 'rdp_id': 'RDP_ID', 'gtdb_id': 'GTDB_ID'}, inplace=True)
columns = ['BioMedGraphica_ID'] + [col for col in biomedgraphica_microbiota.columns if col != 'BioMedGraphica_ID']  # re-order columns
biomedgraphica_microbiota = biomedgraphica_microbiota[columns]
biomedgraphica_microbiota.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 621882 entries, 0 to 621881
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   BioMedGraphica_ID   621882 non-null  object
 1   SILVA_ID            272419 non-null  object
 2   Greengenes_ID       92684 non-null   object
 3   RDP_ID              2487 non-null    object
 4   RNAcentral_ID       93118 non-null   object
 5   GTDB_ID             92444 non-null   object
 6   NCBI_Taxonomy_Name  621882 non-null  object
 7   NCBI_Taxonomy_ID    621882 non-null  object
dtypes: object(8)
memory usage: 38.0+ MB


In [27]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Microbiota'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Microbiota.csv'
biomedgraphica_microbiota.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Microbiota\BioMedGraphica_Microbiota.csv


## 5. Description(NULL)

## 6. File Generation

In [1]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Microbiota' / 'BioMedGraphica_Microbiota.csv'
biomedgraphica_microbiota = pd.read_csv(target_dir, dtype=str)

### 6.1 Name and ID

GUI Name

In [2]:
gui_name = biomedgraphica_microbiota.copy()
gui_name.rename(columns={'NCBI_Taxonomy_Name': 'Microbiota_Name_List'}, inplace=True)
gui_name = gui_name[['BioMedGraphica_ID', 'Microbiota_Name_List']]
gui_name

Unnamed: 0,BioMedGraphica_ID,Microbiota_Name_List
0,BMG_MC000001,Cellvibrio
1,BMG_MC000002,Ancylobacter aquaticus
2,BMG_MC000003,Herbaspirillum sp. BA12
3,BMG_MC000004,Microbacterium sp. 6.11-VPa
4,BMG_MC000005,Mycobacterium sp. 1.1-VEs
...,...,...
621877,BMG_MC621878,Arthrobacter sp. 8.25ST-VSa
621878,BMG_MC621879,Arthrobacter sp. 9.22-BP
621879,BMG_MC621880,Arthrobacter sp. 9.29ST-BP
621880,BMG_MC621881,Clavibacter sp. 10.27ST-Bb


In [3]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Microbiota'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Microbiota_GUI_Name.csv'
gui_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Microbiota\BioMedGraphica_Microbiota_GUI_Name.csv


LLM Name and ID

In [4]:
llm_name_id = biomedgraphica_microbiota.copy()

llm_name_id['NCBI_Taxonomy_ID'] = llm_name_id['NCBI_Taxonomy_ID'].apply(
    lambda x: ' | '.join(f"NCBI Taxonomy ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['SILVA_ID'] = llm_name_id['SILVA_ID'].apply(
    lambda x: ' | '.join(f"SILVA ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['Greengenes_ID'] = llm_name_id['Greengenes_ID'].apply(
    lambda x: ' | '.join(f"Greengenes ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['RDP_ID'] = llm_name_id['RDP_ID'].apply(
    lambda x: ' | '.join(f"RDP ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['RNAcentral_ID'] = llm_name_id['RNAcentral_ID'].apply(
    lambda x: ' | '.join(f"RNAcentral ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['GTDB_ID'] = llm_name_id['GTDB_ID'].apply(
    lambda x: ' | '.join(f"GTDB ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

columns = ['BioMedGraphica_ID', 'NCBI_Taxonomy_Name', 'NCBI_Taxonomy_ID', 'SILVA_ID', 'Greengenes_ID', 'RNAcentral_ID', 'GTDB_ID', 'RDP_ID']
llm_name_id = llm_name_id[columns]
llm_name_id

Unnamed: 0,BioMedGraphica_ID,NCBI_Taxonomy_Name,NCBI_Taxonomy_ID,SILVA_ID,Greengenes_ID,RNAcentral_ID,GTDB_ID,RDP_ID
0,BMG_MC000001,Cellvibrio,NCBI Taxonomy ID:10,,,,,
1,BMG_MC000002,Ancylobacter aquaticus,NCBI Taxonomy ID:100,SILVA ID:FJ572205 | SILVA ID:FJ572206 | SILVA ...,Greengenes ID:4419442 | Greengenes ID:758357 |...,RNAcentral ID:URS000001D3A2 | RNAcentral ID:UR...,GTDB ID:GB_GCA_003963445.1 | GTDB ID:RS_GCF_00...,
2,BMG_MC000003,Herbaspirillum sp. BA12,NCBI Taxonomy ID:100000,SILVA ID:AF164051,Greengenes ID:7308,RNAcentral ID:URS00004A623C,,
3,BMG_MC000004,Microbacterium sp. 6.11-VPa,NCBI Taxonomy ID:1000000,,,,,
4,BMG_MC000005,Mycobacterium sp. 1.1-VEs,NCBI Taxonomy ID:1000001,,,,,
...,...,...,...,...,...,...,...,...
621877,BMG_MC621878,Arthrobacter sp. 8.25ST-VSa,NCBI Taxonomy ID:999995,,,,,
621878,BMG_MC621879,Arthrobacter sp. 9.22-BP,NCBI Taxonomy ID:999996,,,,,
621879,BMG_MC621880,Arthrobacter sp. 9.29ST-BP,NCBI Taxonomy ID:999997,,,,,
621880,BMG_MC621881,Clavibacter sp. 10.27ST-Bb,NCBI Taxonomy ID:999998,,,,,


In [5]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Microbiota'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Microbiota_LLM_Name_ID.csv'
llm_name_id.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Microbiota\BioMedGraphica_Microbiota_LLM_Name_ID.csv


LLM Name and ID Combined

In [6]:
llm_combined = llm_name_id.copy()

def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

llm_combined = merge_string_columns(llm_combined, llm_combined.columns[llm_combined.columns != 'BioMedGraphica_ID'], 'Names_and_IDs')
llm_combined

Unnamed: 0,BioMedGraphica_ID,Names_and_IDs
0,BMG_MC000001,Cellvibrio | NCBI Taxonomy ID:10
1,BMG_MC000002,Ancylobacter aquaticus | SILVA ID:FJ572207 | S...
2,BMG_MC000003,RNAcentral ID:URS00004A623C | SILVA ID:AF16405...
3,BMG_MC000004,NCBI Taxonomy ID:1000000 | Microbacterium sp. ...
4,BMG_MC000005,NCBI Taxonomy ID:1000001 | Mycobacterium sp. 1...
...,...,...
621877,BMG_MC621878,NCBI Taxonomy ID:999995 | Arthrobacter sp. 8.2...
621878,BMG_MC621879,NCBI Taxonomy ID:999996 | Arthrobacter sp. 9.2...
621879,BMG_MC621880,Arthrobacter sp. 9.29ST-BP | NCBI Taxonomy ID:...
621880,BMG_MC621881,NCBI Taxonomy ID:999998 | Clavibacter sp. 10.2...


In [7]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Microbiota'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Microbiota_LLM_Name_ID_Combined.csv'
llm_combined.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Microbiota\BioMedGraphica_Microbiota_LLM_Name_ID_Combined.csv


Display Name

In [8]:
display_name = biomedgraphica_microbiota.copy()

display_name.rename(columns={'NCBI_Taxonomy_Name': 'BMG_Microbiota_Name'}, inplace=True)
display_name = display_name[['BioMedGraphica_ID', 'BMG_Microbiota_Name']]
display_name

Unnamed: 0,BioMedGraphica_ID,BMG_Microbiota_Name
0,BMG_MC000001,Cellvibrio
1,BMG_MC000002,Ancylobacter aquaticus
2,BMG_MC000003,Herbaspirillum sp. BA12
3,BMG_MC000004,Microbacterium sp. 6.11-VPa
4,BMG_MC000005,Mycobacterium sp. 1.1-VEs
...,...,...
621877,BMG_MC621878,Arthrobacter sp. 8.25ST-VSa
621878,BMG_MC621879,Arthrobacter sp. 9.22-BP
621879,BMG_MC621880,Arthrobacter sp. 9.29ST-BP
621880,BMG_MC621881,Clavibacter sp. 10.27ST-Bb


In [9]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Microbiota'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Microbiota_Display_Name.csv'
display_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Microbiota\BioMedGraphica_Microbiota_Display_Name.csv
