### BioMedGraphica ID

In [2]:
import pandas as pd
import os
from pathlib import Path

# Locate the BioMedGraphica entity tables relative to the notebook's working
# directory (three levels up) and read both CSV files with dtype=str.
current_working_dir = Path.cwd().resolve()
grandparent_dir = current_working_dir.parent.parent.parent
entity_root = grandparent_dir / 'BioMedGraphica' / 'Entity'
target_dir_microbiota = entity_root / 'Microbiota' / 'BioMedGraphica_Microbiota.csv'
target_dir_phenotype = entity_root / 'Phenotype' / 'BioMedGraphica_Phenotype.csv'
biomedgraphica_microbiota, biomedgraphica_phenotype = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (target_dir_microbiota, target_dir_phenotype)
)

### gutMDisorder

#### Literature

In [3]:
# Download Link: http://bio-annotation.cn/gutMDisorder/public/res/2.0-literature-associations.xlsx
# Download Date: 2025-03-21
# Download Version: unknown
import pandas as pd

# 'Literature' sheet of the gutMDisorder workbook, read with dtype=str.
literature = pd.read_excel(
    '2.0-literature-associations.xlsx',
    sheet_name='Literature',
    dtype=str,
)
literature.head()

Unnamed: 0,Index,PMID,Title,Year,Journal,Authors,Research Type,Condition 1,Condition 2,Intervention Type,Human/Mouse
0,1,17897884,Imbalance in intestinal microflora constitutio...,2008,international journal of medical microbiology,"Hiromasa Takaishi, Takahiro Matsuki, Atsushi N...",Gut microbiota associated with phenotype,"Colitis, Ulcerative",Health,,Human
1,2,17897884,Imbalance in intestinal microflora constitutio...,2008,international journal of medical microbiology,"Hiromasa Takaishi, Takahiro Matsuki, Atsushi N...",Gut microbiota associated with phenotype,Crohn Disease,Health,,Human
2,3,17897884,Imbalance in intestinal microflora constitutio...,2008,international journal of medical microbiology,"Hiromasa Takaishi, Takahiro Matsuki, Atsushi N...",Gut microbiota associated with phenotype,"Clinical Deterioration;Colitis,Ulcerative","Remission, Spontaneous;Colitis,Ulcerative",,Human
3,4,17430346,Differences in the composition of intestinal B...,2007,clinical and experimental allergy,"S Suzuki, N Shimojo, Y Tajiri, M Kumemura, Y K...",Gut microbiota associated with phenotype,Hypersensitivity,Health,,Human
4,5,17265126,High proportions of proinflammatory bacteria o...,2007,Digestive Diseases and Sciences,"Mei Wang, Göran Molin, Siv Ahrné, Diya Adawi, ...",Gut microbiota associated with phenotype,"Colitis,Ulcerative",Health,,Human


In [4]:
# Download Link: http://bio-annotation.cn/gutMDisorder/public/res/2.0-literature-associations.xlsx
# Download Date: 2025-03-21
# Download Version: unknown

# 'Metadata' sheet of the gutMDisorder workbook, read with dtype=str.
metadata = pd.read_excel(
    '2.0-literature-associations.xlsx',
    sheet_name='Metadata',
    dtype=str,
)
metadata.head()

Unnamed: 0,Index,Sample Group,Sample Number,Sample Source,"Sex (male,female)",Age,BMI,Nation/Race,Condition,Sequencing Technology,Sequencing Platform
0,1,1,73,stool,4231.0,48 (16-87) years,,Japanese,"Colitis, Ulcerative",16s rRNA gene sequencing,
1,1,2,65,stool,3431.0,37 (25-59) years,,Japanese,Health,16s rRNA gene sequencing,
2,2,1,23,stool,1310.0,37 (26-55) years,,Japanese,Crohn Disease,16s rRNA gene sequencing,
3,2,2,65,stool,3431.0,37 (25-59) years,,Japanese,Health,16s rRNA gene sequencing,
4,3,1,44,stool,,48 (16-87) years,,Japanese,"Clinical Deterioration;Colitis, Ulcerative",16s rRNA gene sequencing,


In [5]:
# Download Link: http://bio-annotation.cn/gutMDisorder/public/res/2.0-literature-associations.xlsx
# Download Date: 2025-03-21
# Download Version: unknown

# 'Association' sheet reduced to the distinct (Index, NCBI taxonomy ID) pairs.
# The column name's spelling ('Microbiata') is the one used in the workbook.
associations = (
    pd.read_excel(
        '2.0-literature-associations.xlsx',
        sheet_name='Association',
        dtype=str,
    )
    .loc[:, ['Index', 'Gut Microbiata NCBI ID']]
    .drop_duplicates()
)
associations

Unnamed: 0,Index,Gut Microbiata NCBI ID
0,1,1380
1,1,815
2,1,817
3,1,28116
4,1,818
...,...,...
6410,999,816
6411,999,310297
6412,1000,116085
6413,1001,116085


##### Name Mapping

In [None]:
from rapidfuzz import fuzz, process
import re

def normalize(text):
    """Return a lower-cased, punctuation-free form of ``text`` for fuzzy matching.

    Apostrophes are dropped so possessives stay one token ("Crohn's" ->
    "crohns"). Every other punctuation character is replaced by a space rather
    than deleted, so terms joined only by punctuation remain separate tokens:
    "Clinical Deterioration;Colitis, Ulcerative" becomes
    "clinical deterioration colitis ulcerative" instead of
    "clinical deteriorationcolitis ulcerative". Runs of whitespace are then
    collapsed to a single space and the ends are trimmed.

    Args:
        text: The string to normalize. Must be a ``str``; missing values such
            as NaN have no ``.lower()`` and raise ``AttributeError``, so fill
            or drop them before calling.

    Returns:
        The normalized string (possibly empty).
    """
    text = text.lower()
    # Remove straight and typographic apostrophes without splitting the word.
    text = re.sub("['\u2019]", '', text)
    # Any remaining punctuation acts as a token separator.
    text = re.sub(r'[^\w\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Work on a copy of the phenotype table that carries its normalized name, and
# attach the normalized condition to the gutMDisorder metadata table.
pheno = biomedgraphica_phenotype.assign(
    col1_clean=lambda frame: frame['Phenotype_Name'].apply(normalize)
)
metadata['col2_clean'] = metadata['Condition'].apply(normalize)

In [7]:
def substring_match(text1, text2):
    """Return True when one non-empty string is contained in the other.

    An empty string is technically a substring of every string; it is
    rejected here so that blank names never earn the substring boost.
    """
    if not text1 or not text2:
        return False
    return text1 in text2 or text2 in text1

def custom_match(text1, choices, threshold=85, scorer=None):
    """Find the entry of ``choices`` that best matches ``text1``.

    Each candidate is scored with ``scorer`` plus a 15-point bonus when
    ``substring_match`` holds, so scores can exceed 100.

    Args:
        text1: The (already normalized) query string.
        choices: Iterable of (already normalized) candidate strings.
        threshold: Not applied here. The best candidate is returned whatever
            its score, and the caller filters on the returned score. Kept for
            interface compatibility.
        scorer: Callable ``(str, str) -> number``. Defaults to
            ``fuzz.token_sort_ratio``.

    Returns:
        A tuple ``(matched_text, score, position)`` for the best candidate,
        where ``position`` is its 0-based position in ``choices``. Ties go to
        the earliest candidate. Returns ``None`` only when ``choices`` is empty.
    """
    if scorer is None:
        scorer = fuzz.token_sort_ratio
    best_match = None
    # Start below any attainable score so that a candidate is always returned
    # for non-empty ``choices``, even when every score is 0. Previously that
    # case returned None and broke tuple unpacking in the caller.
    best_score = -1
    for idx, text2 in enumerate(choices):
        substr_boost = 15 if substring_match(text1, text2) else 0
        score = scorer(text1, text2) + substr_boost
        if score > best_score:
            best_score = score
            best_match = (text2, score, idx)
    return best_match

# Fuzzy-match each BioMedGraphica phenotype name against the gutMDisorder
# metadata conditions, then keep only the confident matches.
SIMILARITY_THRESHOLD = 85  # minimum boosted score for a match to be kept

matches = []
choices = metadata['col2_clean'].tolist()

# Score each distinct cleaned condition only once. Keeping the first
# occurrence preserves the tie-breaking of a scan over all rows (the earliest
# row with the best score wins) while avoiding repeated scoring of duplicates.
unique_conditions = metadata.drop_duplicates(subset='col2_clean', keep='first')
unique_choices = unique_conditions['col2_clean'].tolist()
unique_condition_names = unique_conditions['Condition'].tolist()

# zip() pairs the columns by position, so the loop does not depend on `pheno`
# having a default 0..n-1 index (label lookups such as series[i] would).
for phenotype_name, text1 in zip(pheno['Phenotype_Name'], pheno['col1_clean']):
    best = custom_match(text1, unique_choices)
    if best is None:
        # No candidate was returned: record an explicit non-match so that row
        # i of match_df still corresponds to row i of pheno.
        matched_condition, score = None, 0
    else:
        _, score, idx = best
        matched_condition = unique_condition_names[idx]
    matches.append({
        'col1': phenotype_name,
        'matched_col2': matched_condition,
        'similarity_score': score
    })

# Explicit columns keep the filter below valid even when `matches` is empty.
match_df = pd.DataFrame(matches, columns=['col1', 'matched_col2', 'similarity_score'])
match_df_filtered = match_df[match_df['similarity_score'] >= SIMILARITY_THRESHOLD]

In [8]:
# Show the phenotype/condition pairs whose similarity score is at least 85.
match_df_filtered

Unnamed: 0,col1,matched_col2,similarity_score
20,Prostatitis,Prostatitis,115.000000
76,Renal insufficiency,"Renal Insufficiency, Chronic",97.608696
564,Depression,Depression,115.000000
620,Nephrolithiasis,calcium nephrolithiasis,93.947368
645,Diabetes mellitus,Diabetes Mellitus,115.000000
...,...,...,...
17039,Chronic hepatitis,"Hepatitis B, Chronic",94.444444
17897,Intestinal inflammation,Intestinal inflammation,115.000000
18140,Sensory hypersensitivity,Hypersensitivity,95.000000
18142,Auditory hypersensitivity,Hypersensitivity,93.048780


#### Raw

In [None]:
# Download Link: http://bio-annotation.cn/gutMDisorder/public/res/2.0-raw%20data-associations.xlsx
# Download Date: 2025-03-21
# Download Version: unknown