In [12]:
import pandas as pd

In [11]:
def process_data_files(iupac = False, data_dir = '../data', output_path = None):
    """Join the fractions, glycan and protein tables into one CSV dataset.

    Reads three tab-separated files from ``data_dir``
    (``Fractions-Bound-Table.txt``, ``Glycan-Structures-CFG611.txt`` and
    ``Protein-Sequence-Table.txt``), attaches the glycan ``SMILES`` (and
    optionally ``IUPAC``) string and the protein ``Amino Acid Sequence`` to
    every fractions row, renames the columns, drops rows containing any
    missing value and writes the result to ``output_path``.

    Parameters
    ----------
    iupac : bool, default False
        When True, also carry the glycan ``IUPAC`` column through to the
        output as ``'Glycan IUPAC'`` (placed right after ``'Glycan SMILE'``).
    data_dir : str or path-like, default '../data'
        Directory that holds the three input tables.
    output_path : str or path-like, optional
        Destination CSV. When None, ``'gold_data.csv'`` is used, or
        ``'gold_data_iupac.csv'`` if ``iupac`` is True (both relative to the
        current working directory).

    Returns
    -------
    pandas.DataFrame
        The table that was written, with columns ``'Glycan SMILE'``,
        [``'Glycan IUPAC'``,] ``'Protein Sequence'``, ``'concentration'`` and
        ``'fraction_bound'``. The index is whatever survives ``dropna`` (it is
        not reset).
    """
    import os  # local import so the function does not depend on another cell

    fractions_df = pd.read_csv(os.path.join(data_dir, 'Fractions-Bound-Table.txt'), sep='\t')
    glycans_df = pd.read_csv(os.path.join(data_dir, 'Glycan-Structures-CFG611.txt'), sep='\t')
    proteins_df = pd.read_csv(os.path.join(data_dir, 'Protein-Sequence-Table.txt'), sep='\t')

    glycan_cols = ['Name', 'SMILES']
    if iupac:
        glycan_cols.append('IUPAC')

    # Left joins keep every fractions row; rows whose glycan or protein has no
    # match end up with NaN in the looked-up columns and are removed by the
    # dropna() further down.
    merged_df = fractions_df.merge(
        glycans_df[glycan_cols],
        left_on='GlycanID',
        right_on='Name',
        how='left'
    )

    final_df = merged_df.merge(
        proteins_df[['ProteinGroup', 'Amino Acid Sequence']],
        on='ProteinGroup',
        how='left'
    )

    final_cols = {
        'SMILES': 'Glycan SMILE',
        'Amino Acid Sequence': 'Protein Sequence',
        'Concentration': 'concentration',
        'f': 'fraction_bound'
    }
    if iupac:
        final_cols['IUPAC'] = 'Glycan IUPAC'

    final_df = final_df.rename(columns=final_cols)

    output_cols = [
        'Glycan SMILE',
        'Protein Sequence',
        'concentration',
        'fraction_bound'
    ]
    if iupac:
        output_cols.insert(1, 'Glycan IUPAC')

    # Only the selected output columns are considered when dropping rows with
    # missing values.
    output_df = final_df[output_cols].dropna()

    if output_path is None:
        output_path = 'gold_data_iupac.csv' if iupac else 'gold_data.csv'
    output_df.to_csv(output_path, index=False)

    return output_df

In [8]:
# Build the SMILES-only table. Side effect: writes 'gold_data.csv' to the
# current working directory; the returned DataFrame is shown as the cell output.
process_data_files(iupac=False)

Unnamed: 0,Glycan SMILE,Protein Sequence,concentration,fraction_bound
0,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](O)[C@H]1-OC...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000000
1,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H](O)[C@H]1-O...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000154
2,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@H](O)[C@H]1-OC...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000082
3,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](NC(=O)C)[C@...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000290
4,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](NC(=O)C)[C@...,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000000
...,...,...,...,...
332847,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.047294
332848,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.013784
332849,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.003314
332850,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H](NC(=O)C)[C...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.001464


In [9]:
# Build the table with the extra 'Glycan IUPAC' column. Side effect: writes
# 'gold_data_iupac.csv' to the current working directory; the returned
# DataFrame is shown as the cell output.
process_data_files(iupac=True)

Unnamed: 0,Glycan SMILE,Glycan IUPAC,Protein Sequence,concentration,fraction_bound
0,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](O)[C@H]1-OC...,Gal(α-Sp8,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000000
1,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H](O)[C@H]1-O...,Glc(α-Sp8,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000154
2,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@H](O)[C@H]1-OC...,Man(α-Sp8,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000082
3,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](NC(=O)C)[C@...,GalNAc(α-Sp15,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000290
4,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](NC(=O)C)[C@...,GalNAc(α-Sp8,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,0.001,0.000000
...,...,...,...,...,...
332847,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,Neu5Ac(α2-6)Gal(β1-4)GlcNAc(β1-3)Gal(β1-4)GlcN...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.047294
332848,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,Neu5Ac(α2-3)Gal(β1-4)GlcNAc(β1-3)Gal(β1-4)GlcN...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.013784
332849,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,Neu5Ac(α2-6)Gal(β1-4)GlcNAc(β1-3)Gal(β1-4)GlcN...,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.003314
332850,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H](NC(=O)C)[C...,GlcNAc(β1-3)Fuc(α-Sp21,ADTIVAVELDSYPNTDIGDPNYPHIGIDIKSIRSKSTARWNMQTGK...,200.000,0.001464
