## Install

!pip install  fuzzywuzzy

## Import

In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from tqdm import tqdm
import re

In [2]:
# Load the first CSV file into a Pandas dataframe
df1 = pd.read_csv("./data/output/sample_data_Scrape_Result.csv")
# Load the second CSV file into a second Pandas dataframe
df2 = pd.read_csv("./data/input/Globe-institution_data_result_version_2_Abb.csv")

In [3]:
df1.head(5)

Unnamed: 0.1,Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission
0,0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",
1,1,SRR11788653,10848896,CLARK UNIVERSITY,Clark University,"Clark University, Nathan Ahlgren",Clark University
2,2,SRR9158324,7959983,CALIFORNIA STATE UNIVERSITY FULLERTON,California State University Fullerton,"California State University Fullerton, Maria R...",California State University Fullerton
3,3,SRR11881309,10964535,"CALIFORNIA STATE UNIVERSITY, FULLERTON","California State University, Fullerton","California State University, Fullerton, Joshua...","California State University, Fullerton"
4,4,SRR10293930,9209919,"CALIFORNIA STATE UNIVERSITY, LOS ANGELES","California State University, Los Angeles","California State University, Los Angeles, Kirs...","California State University, Los Angeles"


In [4]:
df2.head(5)


Unnamed: 0,Institution_with_abb,country,Institution_withOut_abb,Institution_Abbreviation
0,Australian Institute of Tropical Health and Me...,Australia,Australian Institute of Tropical Health and Me...,AITHM
1,Center for Food Safety and Applied Nutrition (...,United States of America (USA),Center for Food Safety and Applied Nutrition U...,CFSAN
2,Broad Institute of MIT and Harvard,United States of America (USA),Broad Institute of MIT and Harvard,BI
3,Polytechnical University of Kabul,Afghanistan,Polytechnical University of Kabul,
4,Agricultural University of Tirana (UBT),Albania,Agricultural University of Tirana,UBT


## compare_center_names_to_Institution_fullName

In [5]:
def compare_center_names_to_Institution_withOut_abb(df1, df2):
    """
    Compare the 'Center Names' column in df1 to the 'Institution_fullName' column in df2.
    Returns a dataframe with the matched rows from df1 and df2.
    """
    matches = []
    for _, row1 in df1.iterrows():
        # calculate the name score for each row in df1
        df2_with_score = df2.copy()
        df2_with_score['Score'] = df2_with_score['Institution_withOut_abb'].apply(lambda x: fuzz.token_sort_ratio(str(x), str(row1['Center_Names'])))
        # find the row in df2 with the highest name score
        best_match = df2_with_score.loc[df2_with_score['Score']==max(df2_with_score['Score']), :]
        # merge the matched rows from df1 and df2 into a dictionary
        match = row1.to_dict()
        match.update(best_match.to_dict('records')[0])
        matches.append(match)
    # create a new dataframe with the matched rows
    matched_df = pd.DataFrame(matches)
    return matched_df


In [6]:
result = compare_center_names_to_Institution_withOut_abb(df1,df2)
# result.drop_duplicates(['Acc'], inplace=True)
result.head(1)

Unnamed: 0.1,Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_with_abb,country,Institution_withOut_abb,Institution_Abbreviation,Score
0,0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science (AIMS),Australia,Australian Institute of Marine Science,AIMS,97


## compare_center_names_to_abbreviation

In [7]:
def compare_center_names_to_Institution_Abbreviation(df1, df2):
    """
    Compare the 'Center Names' column in df1 to the 'Institution_fullName' column in df2.
    Returns a dataframe with the matched rows from df1 and df2.
    """
    matches = []
    for _, row1 in df1.iterrows():
        # calculate the name score for each row in df1
        df2_with_score = df2.copy()
        df2_with_score['Score'] = df2_with_score['Institution_Abbreviation'].apply(lambda x: fuzz.token_sort_ratio(str(x), str(row1['Center_Names'])))
        # find the row in df2 with the highest name score
        best_match = df2_with_score.loc[df2_with_score['Score']==max(df2_with_score['Score']), :]
        # merge the matched rows from df1 and df2 into a dictionary
        match = row1.to_dict()
        match.update(best_match.to_dict('records')[0])
        matches.append(match)
    # create a new dataframe with the matched rows
    matched_df = pd.DataFrame(matches)
    return matched_df

In [8]:
result2 = compare_center_names_to_Institution_Abbreviation(df1,df2)
# result2.drop_duplicates(['Acc'], inplace=True)
result2.head(1)

Unnamed: 0.1,Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_with_abb,country,Institution_withOut_abb,Institution_Abbreviation,Score
0,0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Search for Extraterrestrial Intelligence Insti...,United States of America (USA),Search for Extraterrestrial Intelligence Insti...,SETI Institute,44


## compare_submitted_by_to_Institution_withOut_abb


In [9]:
def compare_submitted_by_to_Institution_withOut_abb(df1, df2):
    """
    Compare the 'Submitted_by' column in df1 to the 'Institution_fullName' column in df2.
    Returns a dataframe with the matched rows from df1 and df2.
    """
    matches = []
    for _, row1 in df1.iterrows():
        # calculate the name score for each row in df1
        df2_with_score = df2.copy()
        df2_with_score['Score'] = df2_with_score['Institution_withOut_abb'].apply(lambda x: fuzz.token_sort_ratio(str(x), str(row1['Submitted_by'])))
        # find the row in df2 with the highest name score
        best_match = df2_with_score.loc[df2_with_score['Score']==max(df2_with_score['Score']), :]
        # merge the matched rows from df1 and df2 into a dictionary
        match = row1.to_dict()
        match.update(best_match.to_dict('records')[0])
        matches.append(match)
    # create a new dataframe with the matched rows
    matched_df = pd.DataFrame(matches)
    return matched_df


In [10]:
result3 = compare_submitted_by_to_Institution_withOut_abb(df1,df2)
# result3.drop_duplicates(['Acc'], inplace=True)
result3.head(1)

Unnamed: 0.1,Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_with_abb,country,Institution_withOut_abb,Institution_Abbreviation,Score
0,0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science (AIMS),Australia,Australian Institute of Marine Science,AIMS,97


## compare_submitted_by_to_Institution_Abbreviation

In [11]:
def compare_submitted_by_to_Institution_Abbreviation(df1, df2):
    """
    Compare the 'Submitted_by' column in df1 to the 'Institution_Abbreviation' column in df2.
    Returns a dataframe with the matched rows from df1 and df2.
    """
    matches = []
    for _, row1 in df1.iterrows():
        # calculate the name score for each row in df1
        df2_with_score = df2.copy()
        df2_with_score['Score'] = df2_with_score['Institution_Abbreviation'].apply(lambda x: fuzz.token_sort_ratio(str(x), str(row1['Submitted_by'])))
        # find the row in df2 with the highest name score
        best_match = df2_with_score.loc[df2_with_score['Score']==max(df2_with_score['Score']), :]
        # merge the matched rows from df1 and df2 into a dictionary
        match = row1.to_dict()
        match.update(best_match.to_dict('records')[0])
        matches.append(match)
    # create a new dataframe with the matched rows
    matched_df = pd.DataFrame(matches)
    return matched_df


In [12]:
result4 = compare_submitted_by_to_Institution_Abbreviation(df1,df2)
# result4.drop_duplicates(['Acc'], inplace=True)
result4.head(1)

Unnamed: 0.1,Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_with_abb,country,Institution_withOut_abb,Institution_Abbreviation,Score
0,0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Search for Extraterrestrial Intelligence Insti...,United States of America (USA),Search for Extraterrestrial Intelligence Insti...,SETI Institute,44


## compare_biosample_submission_to_Institution_withOut_abb


In [13]:
def compare_biosample_submission_to_Institution_withOut_abb(df1, df2):
    """
    Compare the 'Biosample_Submission' column in df1 to the 'Institution_fullName' column in df2.
    Returns a dataframe with the matched rows from df1 and df2.
    """
    matches = []
    for _, row1 in df1.iterrows():
        # calculate the name score for each row in df1
        df2_with_score = df2.copy()
        df2_with_score['Score'] = df2_with_score['Institution_withOut_abb'].apply(lambda x: fuzz.token_sort_ratio(str(x), str(row1['Biosample_Submission'])))
        # find the row in df2 with the highest name score
        best_match = df2_with_score.loc[df2_with_score['Score']==max(df2_with_score['Score']), :]
        # merge the matched rows from df1 and df2 into a dictionary
        match = row1.to_dict()
        match.update(best_match.to_dict('records')[0])
        matches.append(match)
    # create a new dataframe with the matched rows
    matched_df = pd.DataFrame(matches)
    return matched_df


In [14]:
result5 = compare_biosample_submission_to_Institution_withOut_abb(df1,df2)
# result5.drop_duplicates(['Acc'], inplace=True)
result5.head(1)

Unnamed: 0.1,Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_with_abb,country,Institution_withOut_abb,Institution_Abbreviation,Score
0,0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science (AIMS),Australia,Australian Institute of Marine Science,AIMS,83


## compare_biosample_submission_to_Institution_Abbreviation

In [15]:
def compare_biosample_submission_to_Institution_Abbreviation(df1, df2):
    """
    Compare the 'Biosample_Submission' column in df1 to the 'Institution_Abbreviation' column in df2.
    Returns a dataframe with the matched rows from df1 and df2.
    """
    matches = []
    for _, row1 in df1.iterrows():
        # calculate the name score for each row in df1
        df2_with_score = df2.copy()
        df2_with_score['Score'] = df2_with_score['Institution_Abbreviation'].apply(lambda x: fuzz.token_sort_ratio(str(x), str(row1['Biosample_Submission'])))
        # find the row in df2 with the highest name score
        best_match = df2_with_score.loc[df2_with_score['Score']==max(df2_with_score['Score']), :]
        # merge the matched rows from df1 and df2 into a dictionary
        match = row1.to_dict()
        match.update(best_match.to_dict('records')[0])
        matches.append(match)
    # create a new dataframe with the matched rows
    matched_df = pd.DataFrame(matches)
    return matched_df


In [16]:
result6 = compare_biosample_submission_to_Institution_withOut_abb(df1,df2)
result6.drop_duplicates(['Acc'], inplace=True)
result6.head(1)

Unnamed: 0.1,Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_with_abb,country,Institution_withOut_abb,Institution_Abbreviation,Score
0,0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science (AIMS),Australia,Australian Institute of Marine Science,AIMS,83


## compare Bioproject_Submission to Institution_withOut_abb

In [17]:
def compare_Bioproject_Submission_to_Institution_withOut_abb(df1, df2):
    """
    Compare the 'Bioproject_Submission' column in df1 to the 'Institution_fullName' column in df2.
    Returns a dataframe with the matched rows from df1 and df2.
    """
    matches = []
    for _, row1 in df1.iterrows():
        # calculate the name score for each row in df1
        df2_with_score = df2.copy()
        df2_with_score['Score'] = df2_with_score['Institution_withOut_abb'].apply(lambda x: fuzz.token_sort_ratio(str(x), str(row1['Biosample_Submission'])))
        # find the row in df2 with the highest name score
        best_match = df2_with_score.loc[df2_with_score['Score']==max(df2_with_score['Score']), :]
        # merge the matched rows from df1 and df2 into a dictionary
        match = row1.to_dict()
        match.update(best_match.to_dict('records')[0])
        matches.append(match)
    # create a new dataframe with the matched rows
    matched_df = pd.DataFrame(matches)
    return matched_df

In [18]:
result7 = compare_Bioproject_Submission_to_Institution_withOut_abb(df1,df2)
# result7.drop_duplicates(['Acc'], inplace=True)
result7.head(1)

Unnamed: 0.1,Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_with_abb,country,Institution_withOut_abb,Institution_Abbreviation,Score
0,0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science (AIMS),Australia,Australian Institute of Marine Science,AIMS,83


## compare Bioproject_Submission to Institution_Abbreviation

In [19]:
def compare_Bioproject_Submission_to_Institution_Abbreviation(df1, df2):
    """
    Compare the 'Bioproject_Submission' column in df1 to the 'Institution_Abbreviation' column in df2.
    Returns a dataframe with the matched rows from df1 and df2.
    """
    matches = []
    for _, row1 in df1.iterrows():
        # calculate the name score for each row in df1
        df2_with_score = df2.copy()
        df2_with_score['Score'] = df2_with_score['Institution_Abbreviation'].apply(lambda x: fuzz.token_sort_ratio(str(x), str(row1['Bioproject_Submission'])))
        # find the row in df2 with the highest name score
        best_match = df2_with_score.loc[df2_with_score['Score']==max(df2_with_score['Score']), :]
        # merge the matched rows from df1 and df2 into a dictionary
        match = row1.to_dict()
        match.update(best_match.to_dict('records')[0])
        matches.append(match)
    # create a new dataframe with the matched rows
    matched_df = pd.DataFrame(matches)
    return matched_df

In [20]:
result8 = compare_Bioproject_Submission_to_Institution_Abbreviation(df1,df2)
# result8.drop_duplicates(['Acc'], inplace=True)
result8.head(1)

Unnamed: 0.1,Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_with_abb,country,Institution_withOut_abb,Institution_Abbreviation,Score
0,0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Polytechnical University of Kabul,Afghanistan,Polytechnical University of Kabul,,100


## Checking Center Name With And WithOut Abbreviation

In [21]:
# match centers based on 'Center Names' and 'Institution_fullName'
matched_centers_Institution_withOut_abb = compare_center_names_to_Institution_withOut_abb(df1, df2)
# match centers based on 'Center Names' and 'Institution_Abbreviation'
matched_centers_Institution_Abbreviation = compare_center_names_to_Institution_Abbreviation(df1, df2)
# match 'Submitted_by' and 'Institution_withOut_abb'
matched_submitted_by_Institution_withOut_abb = compare_submitted_by_to_Institution_withOut_abb(df1, df2)
# match 'Submitted_by' and 'Institution_Abbreviation'
matched_submitted_by_Institution_Abbreviation = compare_submitted_by_to_Institution_Abbreviation(df1, df2)
# match 'Biosample_Submission' and 'Institution_withOut_abb'
matched_biosample_Institution_withOut_abb = compare_biosample_submission_to_Institution_withOut_abb(df1, df2)
# match 'Biosample_Submission' and 'Institution_Abbreviation'
matched_biosample_Institution_Abbreviation = compare_biosample_submission_to_Institution_Abbreviation(df1, df2)

# match 'Bioproject_Submission' and 'Institution_withOut_abb'
matched_Bioproject_Institution_withOut_abb = compare_Bioproject_Submission_to_Institution_withOut_abb(df1, df2)
# match 'Bioproject_Submission' and 'Institution_Abbreviation'
matched_Bioproject_Institution_Abbreviation = compare_Bioproject_Submission_to_Institution_Abbreviation(df1, df2)



## Comparing centers_Institution_withOut_abb and centers_Institution_Abbreviation

In [43]:
def impute_by_score_centers_Name(df1, matched_centers_Institution_withOut_abb, matched_centers_Institution_Abbreviation):
    """
    Impute missing values in df1 using matched_centers_fullName and matched_submitted_by_abb based on the highest Score.
    """
    imputed_data = []
    for _, row1 in df1.iterrows():
        # get the Acc for the current row
        current_acc = row1['Acc']
        # get the rows from matched_centers_fullName and matched_submitted_by_abb that match the current Acc
        matched_centers_Institution_withOut_abb_acc = matched_centers_Institution_withOut_abb[matched_centers_Institution_withOut_abb['Acc'] == current_acc]
        matched_centers_Institution_Abbreviation_acc = matched_centers_Institution_Abbreviation[matched_centers_Institution_Abbreviation['Acc'] == current_acc]
        # check if both matched dataframes are empty, and skip the current Acc if they are
        if matched_centers_Institution_withOut_abb_acc.empty and matched_centers_Institution_Abbreviation_acc.empty:
            continue
        # get the rows with the highest Score in either of the matched dataframes
        highest_score_rows = pd.concat([matched_centers_Institution_withOut_abb_acc, matched_centers_Institution_Abbreviation_acc]).sort_values('Score', ascending=False)
        # check if there are multiple matches with the same score
        num_matches_with_same_score = sum(highest_score_rows['Score'] == highest_score_rows.iloc[0]['Score'])
        flag_multiple_matcher = num_matches_with_same_score + 1
        # add a matches_with_same_score column if applicable
        matches_with_same_score = []
        if flag_multiple_matcher:
            matches_with_same_score = highest_score_rows['Institution_withOut_abb'].tolist()
        # get the row with the highest Score and add it to the imputed data
        highest_score_row = highest_score_rows.iloc[0].copy()
        if highest_score_row['Acc'] in matched_centers_Institution_withOut_abb_acc['Acc'].tolist():
            highest_score_row['Code_pick'] = 'Institution_withOut_abb'
        else:
            highest_score_row['Code_pick'] = 'Institution_Abbreviation'
        imputed_data.append(highest_score_row.to_dict())
        # add the flag_multiple_matcher and matches_with_same_score columns
        imputed_data[-1]['flag_multiple_matcher'] = flag_multiple_matcher
        imputed_data[-1]['matches_with_same_score'] = matches_with_same_score
    # create a new dataframe with the imputed data
    imputed_df = pd.DataFrame(imputed_data, columns=['Acc','ID', 'Center_Names', 'Submitted_by', 'Biosample_Submission', 'Bioproject_Submission', 'Institution_withOut_abb', 'Institution_Abbreviation', 'Score', 'country', 'Code_pick', 'flag_multiple_matcher', 'matches_with_same_score'])
    return imputed_df
    

In [44]:
# filter matched dataframes to only include rows with Score >= 75
matched_centers_Institution_withOut_abb_75 = matched_centers_Institution_withOut_abb[matched_centers_Institution_withOut_abb['Score'] >= 75]
matched_centers_Institution_Abbreviation_75 = matched_centers_Institution_Abbreviation[matched_centers_Institution_Abbreviation['Score'] >= 75]
# impute missing values in df1 based on the highest Score
imputed_df_centers = impute_by_score_centers_Name(df1, matched_centers_Institution_withOut_abb_75, matched_centers_Institution_Abbreviation_75)# print the imputed data
imputed_df_centers.head(1)

Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_withOut_abb,Institution_Abbreviation,Score,country,Code_pick,flag_multiple_matcher,matches_with_same_score
0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science,AIMS,97,Australia,Institution_withOut_abb,3,"[Australian Institute of Marine Science, Austr..."


## Comparing submitted_by_Institution_withOut_abb and submitted_by_Institution_Abbreviation

In [45]:
def impute_by_score_submitted_by(df1, matched_submitted_by_Institution_withOut_abb, matched_submitted_by_Institution_Abbreviation):
    """
    Impute missing values in df1 using matched_centers_fullName and matched_submitted_by_abb based on the highest Score.
    """
    imputed_data = []
    for _, row1 in df1.iterrows():
        # get the Acc for the current row
        current_acc = row1['Acc']
        # get the rows from matched_centers_fullName and matched_submitted_by_abb that match the current Acc
        matched_submitted_by_Institution_withOut_abb_acc = matched_submitted_by_Institution_withOut_abb[matched_submitted_by_Institution_withOut_abb['Acc'] == current_acc]
        matched_submitted_by_Institution_Abbreviation_acc = matched_submitted_by_Institution_Abbreviation[matched_submitted_by_Institution_Abbreviation['Acc'] == current_acc]
        # check if both matched dataframes are empty, and skip the current Acc if they are
        if matched_submitted_by_Institution_withOut_abb_acc.empty and matched_submitted_by_Institution_Abbreviation_acc.empty:
            continue
        # get the rows with the highest Score in either of the matched dataframes
        highest_score_rows = pd.concat([matched_submitted_by_Institution_withOut_abb_acc, matched_submitted_by_Institution_Abbreviation_acc]).sort_values('Score', ascending=False)
        # check if there are multiple matches with the same score
        num_matches_with_same_score = sum(highest_score_rows['Score'] == highest_score_rows.iloc[0]['Score'])
        flag_multiple_matcher = num_matches_with_same_score + 1
        # add a matches_with_same_score column if applicable
        matches_with_same_score = []
        if flag_multiple_matcher:
            matches_with_same_score = highest_score_rows['Institution_withOut_abb'].tolist()
        # get the row with the highest Score and add it to the imputed data
        highest_score_row = highest_score_rows.iloc[0].copy()
        if highest_score_row['Acc'] in matched_submitted_by_Institution_withOut_abb_acc['Acc'].tolist():
            highest_score_row['Code_pick'] = 'Institution_withOut_abb'
        else:
            highest_score_row['Code_pick'] = 'Institution_Abbreviation'
        imputed_data.append(highest_score_row.to_dict())
        # add the flag_multiple_matcher and matches_with_same_score columns
        imputed_data[-1]['flag_multiple_matcher'] = flag_multiple_matcher
        imputed_data[-1]['matches_with_same_score'] = matches_with_same_score
    # create a new dataframe with the imputed data
    imputed_df = pd.DataFrame(imputed_data, columns=['Acc', 'ID','Center_Names', 'Submitted_by', 'Biosample_Submission', 'Bioproject_Submission', 'Institution_withOut_abb', 'Institution_Abbreviation', 'Score', 'country', 'Code_pick', 'flag_multiple_matcher', 'matches_with_same_score'])
    return imputed_df
        

In [46]:
matched_submitted_by_Institution_withOut_abb_75 = matched_submitted_by_Institution_withOut_abb[matched_submitted_by_Institution_withOut_abb['Score'] >= 75]
matched_submitted_by_Institution_Abbreviation_75 = matched_submitted_by_Institution_Abbreviation[matched_submitted_by_Institution_Abbreviation['Score'] >= 75]
# impute missing values in df1 based on the highest Score
imputed_df_submitted_by = impute_by_score_submitted_by(df1, matched_submitted_by_Institution_withOut_abb_75, matched_submitted_by_Institution_Abbreviation_75)
# print the imputed data
imputed_df_submitted_by.head(1)

Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_withOut_abb,Institution_Abbreviation,Score,country,Code_pick,flag_multiple_matcher,matches_with_same_score
0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science,AIMS,97,Australia,Institution_withOut_abb,3,"[Australian Institute of Marine Science, Austr..."


## Comparing biosample_Institution_withOut_abb and biosample_Institution_Abbreviation

In [47]:
def impute_by_score_biosample(df1, matched_biosample_Institution_withOut_abb, matched_biosample_Institution_Abbreviation):
    imputed_data = []
    for _, row1 in df1.iterrows():
        # get the Acc for the current row
        current_acc = row1['Acc']
        # get the rows from matched_centers_fullName and matched_submitted_by_abb that match the current Acc
        matched_biosample_Institution_withOut_abb_acc = matched_biosample_Institution_withOut_abb[matched_biosample_Institution_withOut_abb['Acc'] == current_acc]
        matched_biosample_Institution_Abbreviation_acc = matched_biosample_Institution_Abbreviation[matched_biosample_Institution_Abbreviation['Acc'] == current_acc]
        # check if both matched dataframes are empty, and skip the current Acc if they are
        if matched_biosample_Institution_withOut_abb_acc.empty and matched_biosample_Institution_Abbreviation_acc.empty:
            continue
        # get the rows with the highest Score in either of the matched dataframes
        highest_score_rows = pd.concat([matched_biosample_Institution_withOut_abb_acc, matched_biosample_Institution_Abbreviation_acc]).sort_values('Score', ascending=False)
        # check if there are multiple matches with the same score
        num_matches_with_same_score = sum(highest_score_rows['Score'] == highest_score_rows.iloc[0]['Score'])
        flag_multiple_matcher = num_matches_with_same_score + 1
        # add a matches_with_same_score column if applicable
        matches_with_same_score = []
        if flag_multiple_matcher:
            matches_with_same_score = highest_score_rows['Institution_withOut_abb'].tolist()
        # get the row with the highest Score and add it to the imputed data
        highest_score_row = highest_score_rows.iloc[0].copy()
        if highest_score_row['Acc'] in matched_biosample_Institution_withOut_abb_acc['Acc'].tolist():
            highest_score_row['Code_pick'] = 'Institution_withOut_abb'
        else:
            highest_score_row['Code_pick'] = 'Institution_Abbreviation'
        imputed_data.append(highest_score_row.to_dict())
        # add the flag_multiple_matcher and matches_with_same_score columns
        imputed_data[-1]['flag_multiple_matcher'] = flag_multiple_matcher
        imputed_data[-1]['matches_with_same_score'] = matches_with_same_score
    # create a new dataframe with the imputed data
    imputed_df = pd.DataFrame(imputed_data, columns=['Acc','ID', 'Center_Names', 'Submitted_by', 'Biosample_Submission', 'Bioproject_Submission', 'Institution_withOut_abb', 'Institution_Abbreviation', 'Score', 'country', 'Code_pick', 'flag_multiple_matcher', 'matches_with_same_score'])
    return imputed_df

In [48]:
matched_biosample_Institution_withOut_abb_75 = matched_biosample_Institution_withOut_abb[matched_biosample_Institution_withOut_abb['Score'] >= 75]
matched_biosample_Institution_Abbreviation_75 = matched_biosample_Institution_Abbreviation[matched_biosample_Institution_Abbreviation['Score'] >= 75]
# impute missing values in df1 based on the highest Score
imputed_df_biosample = impute_by_score_biosample(df1, matched_biosample_Institution_withOut_abb_75, matched_biosample_Institution_withOut_abb_75)
# print the imputed data
imputed_df_biosample.head(1)

Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_withOut_abb,Institution_Abbreviation,Score,country,Code_pick,flag_multiple_matcher,matches_with_same_score
0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science,AIMS,83,Australia,Institution_withOut_abb,5,"[Australian Institute of Marine Science, Austr..."


## Comparing Bioproject_Institution_withOut_abb and Bioproject_Institution_Abbreviation

In [49]:
def impute_by_score_Bioproject(df1, matched_Bioproject_Institution_withOut_abb, matched_Bioproject_Institution_Abbreviation):
    imputed_data = []
    for _, row1 in df1.iterrows():
        # get the Acc for the current row
        current_acc = row1['Acc']
        # get the rows from matched_centers_fullName and matched_submitted_by_abb that match the current Acc
        matched_Bioproject_Institution_withOut_abb_acc = matched_Bioproject_Institution_withOut_abb[matched_Bioproject_Institution_withOut_abb['Acc'] == current_acc]
        matched_Bioproject_Institution_Abbreviation_acc = matched_Bioproject_Institution_Abbreviation[matched_Bioproject_Institution_Abbreviation['Acc'] == current_acc]
        # check if both matched dataframes are empty, and skip the current Acc if they are
        if matched_Bioproject_Institution_withOut_abb_acc.empty and matched_Bioproject_Institution_Abbreviation_acc.empty:
            continue
        # get the rows with the highest Score in either of the matched dataframes
        highest_score_rows = pd.concat([matched_Bioproject_Institution_withOut_abb_acc, matched_Bioproject_Institution_Abbreviation_acc]).sort_values('Score', ascending=False)
        # check if there are multiple matches with the same score
        num_matches_with_same_score = sum(highest_score_rows['Score'] == highest_score_rows.iloc[0]['Score'])
        flag_multiple_matcher = num_matches_with_same_score + 1
        # add a matches_with_same_score column if applicable
        matches_with_same_score = []
        if flag_multiple_matcher:
            matches_with_same_score = highest_score_rows['Institution_withOut_abb'].tolist()
        # get the row with the highest Score and add it to the imputed data
        highest_score_row = highest_score_rows.iloc[0].copy()
        if highest_score_row['Acc'] in matched_Bioproject_Institution_withOut_abb_acc['Acc'].tolist():
            highest_score_row['Code_pick'] = 'Institution_withOut_abb'
        else:
            highest_score_row['Code_pick'] = 'Institution_Abbreviation'
        imputed_data.append(highest_score_row.to_dict())
        # add the flag_multiple_matcher and matches_with_same_score columns
        imputed_data[-1]['flag_multiple_matcher'] = flag_multiple_matcher
        imputed_data[-1]['matches_with_same_score'] = matches_with_same_score
    # create a new dataframe with the imputed data
    imputed_df = pd.DataFrame(imputed_data, columns=['Acc', 'Center_Names', 'Submitted_by', 'Biosample_Submission', 'Bioproject_Submission', 'Institution_withOut_abb', 'Institution_Abbreviation', 'Score', 'country', 'Code_pick', 'flag_multiple_matcher', 'matches_with_same_score'])
    return imputed_df

In [50]:

matched_Bioproject_Institution_withOut_abb_75 = matched_Bioproject_Institution_withOut_abb[matched_Bioproject_Institution_withOut_abb['Score'] >= 75]
matched_Bioproject_Institution_Abbreviation_75 = matched_Bioproject_Institution_Abbreviation[matched_Bioproject_Institution_Abbreviation['Score'] >= 75]
# impute missing values in df1 based on the highest Score
imputed_df_Bioproject = impute_by_score_Bioproject(df1, matched_Bioproject_Institution_withOut_abb_75, matched_Bioproject_Institution_Abbreviation_75)
# print the imputed data
imputed_df_Bioproject.head(1)

Unnamed: 0,Acc,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_withOut_abb,Institution_Abbreviation,Score,country,Code_pick,flag_multiple_matcher,matches_with_same_score
0,SRR8606903,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Polytechnical University of Kabul,,100,Afghanistan,Institution_withOut_abb,3,"[Polytechnical University of Kabul, Polytechni..."


## Merge all imputed Comparing dataframes into one

In [51]:
merged_matches = pd.concat([imputed_df_centers,imputed_df_submitted_by, imputed_df_biosample,imputed_df_Bioproject], axis=1)


## drop duplicate columns 


In [52]:
merged_matches = merged_matches.loc[:,~merged_matches.columns.duplicated()]


# filter out rows where Score is less than 75

In [53]:
merged_matches = merged_matches[merged_matches['Score'] >= 75]


# drop duplicates based on Acc

In [54]:
# merged_matches = merged_matches.drop_duplicates(subset=['Acc'])


In [55]:

merged_matches = merged_matches.reset_index(drop=True)

In [56]:
merged_matches = merged_matches[['Acc','ID','Center_Names', 'Submitted_by', 'Biosample_Submission','Bioproject_Submission','Institution_withOut_abb','Institution_Abbreviation','Score','country','Code_pick','flag_multiple_matcher','matches_with_same_score']]
merged_matches = merged_matches.fillna('')

merged_matches.head(50)

Unnamed: 0,Acc,ID,Center_Names,Submitted_by,Biosample_Submission,Bioproject_Submission,Institution_withOut_abb,Institution_Abbreviation,Score,country,Code_pick,flag_multiple_matcher,matches_with_same_score
0,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science,AIMS,97,Australia,Institution_withOut_abb,3,"[Australian Institute of Marine Science, Austr..."
1,SRR11788653,10848896,CLARK UNIVERSITY,Clark University,"Clark University, Nathan Ahlgren",Clark University,Clark University,,100,United States of America (USA),Institution_withOut_abb,2,"[Clark University, Royal Melbourne Institute o..."
2,SRR9158324,7959983,CALIFORNIA STATE UNIVERSITY FULLERTON,California State University Fullerton,"California State University Fullerton, Maria R...",California State University Fullerton,"California State University, Fullerton",CSUF,100,United States of America (USA),Institution_withOut_abb,2,"[California State University, Fullerton]"
3,SRR11881309,10964535,"CALIFORNIA STATE UNIVERSITY, FULLERTON","California State University, Fullerton","California State University, Fullerton, Joshua...","California State University, Fullerton","California State University, Fullerton",CSUF,100,United States of America (USA),Institution_withOut_abb,2,"[California State University, Fullerton]"
4,SRR10293930,9209919,"CALIFORNIA STATE UNIVERSITY, LOS ANGELES","California State University, Los Angeles","California State University, Los Angeles, Kirs...","California State University, Los Angeles","California State University, Los Angeles",CSULA,100,United States of America (USA),Institution_withOut_abb,2,"[California State University, Los Angeles]"
5,SRR8665997,7378698,"CALIFORNIA STATE UNIVERSITY, NORTHRIDGE","California State University, Northridge","California State University, Northridge, Kadir...","California State University, Northridge","California State University, Northridge",CSUN,100,United States of America (USA),Institution_withOut_abb,2,"[California State University, Northridge]"
6,SRR10018586,8900739,CFSAN,FDA Center for Food Safety and Applied Nutrit...,CFSAN,FDA/CFSAN,Center for Food Safety and Applied Nutrition U...,CFSAN,100,United States of America (USA),Institution_Abbreviation,2,[Center for Food Safety and Applied Nutrition ...
7,SRR8606903,7314038,AUSTRALIAN INSTITUTE OF MARINCE SCIENCES,Australian Institute of Marince sciences,"Australian Institute of Marince sciences, Patr...",,Australian Institute of Marine Science,AIMS,97,Australia,Institution_withOut_abb,3,"[Australian Institute of Marine Science, Austr..."
8,SRR10406092,9345478,AUSTRALIAN INSTITUTE OF MARINE SCIENCE,Australian Institute of Marine Science,"Australian Institute of Marine Science, Heidi ...",Australian Institute of Marine Science,Australian Institute of Marine Science,AIMS,100,Australia,Institution_withOut_abb,2,[Australian Institute of Marine Science]
9,SRR10566897,9518689,AUSTRALIAN INSTITUTE OF TROPICAL HEALTH AND ME...,Australian Institute of Tropical Health and M...,Australian Institute of Tropical Health and Me...,Los Alamos National Laboratory,Australian Institute of Tropical Health and Me...,AITHM,100,Australia,Institution_withOut_abb,2,[Australian Institute of Tropical Health and M...


# Split the results into separate dataframes based on the Scores

In [36]:
above_90 = merged_matches[merged_matches['Score'] >= 90]
above_80 = merged_matches[(merged_matches['Score'] >= 80) & (merged_matches['Score'] < 90)]
above_70 = merged_matches[(merged_matches['Score'] >= 70) & (merged_matches['Score'] < 80)]
below_70 = merged_matches[merged_matches['Score'] < 70]
below_69 = below_70[below_70['Score'] < 69]

In [37]:
above_90.count()


Acc                         25
ID                          25
Center_Names                25
Submitted_by                25
Biosample_Submission        25
Bioproject_Submission       25
Institution_withOut_abb     25
Institution_Abbreviation    25
Score                       25
country                     25
Code_pick                   25
flag_multiple_matcher       25
matches_with_same_score     25
dtype: int64

In [38]:
above_80.count()

Acc                         4
ID                          4
Center_Names                4
Submitted_by                4
Biosample_Submission        4
Bioproject_Submission       4
Institution_withOut_abb     4
Institution_Abbreviation    4
Score                       4
country                     4
Code_pick                   4
flag_multiple_matcher       4
matches_with_same_score     4
dtype: int64

In [39]:
above_70.count()

Acc                         1
ID                          1
Center_Names                1
Submitted_by                1
Biosample_Submission        1
Bioproject_Submission       1
Institution_withOut_abb     1
Institution_Abbreviation    1
Score                       1
country                     1
Code_pick                   1
flag_multiple_matcher       1
matches_with_same_score     1
dtype: int64

In [40]:
below_69.count()

Acc                         0
ID                          0
Center_Names                0
Submitted_by                0
Biosample_Submission        0
Bioproject_Submission       0
Institution_withOut_abb     0
Institution_Abbreviation    0
Score                       0
country                     0
Code_pick                   0
flag_multiple_matcher       0
matches_with_same_score     0
dtype: int64

# Save each dataframe to a separate CSV file

In [41]:
above_90.to_csv('./data/output/sample_data_above_90.csv', index=False, encoding='utf-8')
above_80.to_csv('./data/output/sample_data_above_80.csv', index=False, encoding='utf-8')
above_70.to_csv('./data/output/sample_data_above_70.csv', index=False, encoding='utf-8')
below_70.to_csv('./data/output/sample_data_below_69.csv', index=False, encoding='utf-8')

# Save the final result

In [42]:
merged_matches.to_csv('./data/input/sample_data_fuzzy_Scrape_Result.csv', encoding='utf-8')