In [7]:
from pathlib import Path

import pandas as pd

def _build_gold_table(fractions_df, glycans_df, proteins_df):
    """Join the three raw tables and return the four-column output frame."""
    # Left joins keep every measurement row; rows whose glycan or protein has
    # no match end up with NaN and are removed by the final dropna().
    # NOTE(review): if 'Name' or 'ProteinGroup' is not unique in its lookup
    # table, these joins will duplicate measurement rows -- confirm the keys
    # are unique in the source files.
    with_smiles = fractions_df.merge(
        glycans_df[['Name', 'SMILES']],
        left_on='GlycanID',
        right_on='Name',
        how='left',
    )
    with_sequences = with_smiles.merge(
        proteins_df[['ProteinGroup', 'Amino Acid Sequence']],
        on='ProteinGroup',
        how='left',
    )

    renamed = with_sequences.rename(columns={
        'SMILES': 'Glycan SMILE',
        'Amino Acid Sequence': 'Protein Sequence',
        'Concentration': 'concentration',
        'f': 'fraction_bound',
    })

    # dropna() already returns a new frame, so no explicit copy is needed.
    return renamed[[
        'Glycan SMILE',
        'Protein Sequence',
        'concentration',
        'fraction_bound',
    ]].dropna()


def process_data_files(data_dir='../data', output_path='gold_data.csv'):
    """Build the glycan/protein binding table and optionally save it as CSV.

    Reads three tab-separated files from ``data_dir``:

    * ``Fractions-Bound-Table.txt`` -- must provide ``GlycanID`` and
      ``ProteinGroup``; ``Concentration`` and ``f`` are renamed to
      ``concentration`` and ``fraction_bound``.
    * ``Glycan-Structures-CFG611.txt`` -- ``Name`` and ``SMILES`` are used.
    * ``Protein-Sequence-Table.txt`` -- ``ProteinGroup`` and
      ``Amino Acid Sequence`` are used.

    Each measurement row is left-joined to its glycan SMILES (``GlycanID`` ==
    ``Name``) and to its protein sequence (on ``ProteinGroup``). Rows with a
    missing value in any of the four output columns are dropped.

    Parameters
    ----------
    data_dir : str or path-like, default ``'../data'``
        Directory containing the three input files.
    output_path : str, path-like or None, default ``'gold_data.csv'``
        Where to write the result as CSV (without the index). Pass ``None``
        to skip writing and only return the frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``Glycan SMILE``, ``Protein Sequence``, ``concentration`` and
        ``fraction_bound``. The index is whatever survives ``dropna`` and is
        not reset.

    Raises
    ------
    FileNotFoundError
        If one of the input files does not exist under ``data_dir``.
    KeyError
        If a required column is missing from an input table.
    """
    data_dir = Path(data_dir)

    fractions_df = pd.read_csv(data_dir / 'Fractions-Bound-Table.txt', sep='\t')
    glycans_df = pd.read_csv(data_dir / 'Glycan-Structures-CFG611.txt', sep='\t')
    proteins_df = pd.read_csv(data_dir / 'Protein-Sequence-Table.txt', sep='\t')

    output_df = _build_gold_table(fractions_df, glycans_df, proteins_df)

    if output_path is not None:
        output_df.to_csv(output_path, index=False)

    return output_df


# Run the pipeline (this also writes gold_data.csv). As the cell's last
# expression, the returned DataFrame is rendered as the cell output.
process_data_files()

Unnamed: 0,Glycan SMILE,Protein Sequence,concentration,fraction_bound
0,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](O)[C@H]1-OC...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000000
1,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H](O)[C@H]1-O...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000154
2,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@H](O)[C@H]1-OC...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000082
3,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](NC(=O)C)[C@...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000290
4,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](NC(=O)C)[C@...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000000
...,...,...,...,...
332847,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.047294
332848,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.013784
332849,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.003314
332850,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H](NC(=O)C)[C...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.001464
