In [2]:
import pandas as pd

def create_token_dict_from_dataframe(df):
    """Build a nested ``{AC_ID: {pos: token}}`` lookup from a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``'AC_ID'``, ``'pos'`` and ``'token'``.
        Any other columns are ignored.

    Returns
    -------
    dict
        Maps each distinct ``AC_ID`` value to a dict that maps ``pos`` to
        ``token``. When the same ``(AC_ID, pos)`` pair appears in several
        rows, the last row wins. A frame with no rows yields ``{}``.

    Raises
    ------
    KeyError
        If the frame has rows but lacks one of the three required columns.
    """
    result_dict = {}

    # A frame without rows contributes nothing, whatever its columns are.
    if len(df) == 0:
        return result_dict

    # Walk the three columns in lockstep rather than calling ``iterrows``:
    # ``iterrows`` materialises a Series for every row, which is needlessly
    # slow on large tables.
    for ac_id, pos, token in zip(df['AC_ID'], df['pos'], df['token']):
        result_dict.setdefault(ac_id, {})[pos] = token

    return result_dict


def replace_label(label):
    """Collapse variants of four modification labels onto a canonical name.

    The match is a case-insensitive substring test, tried in this order:
    Phosphotyrosine, Phosphoserine, Phosphothreonine, N6-acetyllysine.
    The first canonical name found inside ``label`` is returned; a label
    containing none of them is returned unchanged.
    """
    lowered = label.lower()
    # Order matters: the first matching name wins.
    canonical_names = (
        'Phosphotyrosine',
        'Phosphoserine',
        'Phosphothreonine',
        'N6-acetyllysine',
    )
    for canonical in canonical_names:
        if canonical.lower() in lowered:
            return canonical
    return label


In [3]:
# Load the label table and normalise it in a single chain, without in-place
# mutation, so re-running the cell always starts from the file on disk.
df = (
    pd.read_csv('sprot_labels.csv', index_col=[0])
    # Discard every row that has a missing value in any column.
    .dropna()
    # Remove any leftover column whose name contains "Unnamed".
    .pipe(lambda frame: frame.drop(columns=frame.filter(regex="Unnamed").columns))
    .assign(
        # Fold label variants onto their canonical spelling.
        label=lambda frame: frame['label'].map(replace_label),
        # Shift positions down by one (presumably 1-based -> 0-based; confirm
        # against the source data).
        pos=lambda frame: frame['pos'].astype(int) - 1,
        ori_seq=lambda frame: frame['ori_seq'].str.upper(),
        # Uses the already-normalised label assigned just above.
        token=lambda frame: '<' + frame['label'] + '>',
    )
)

In [4]:

# Labels retained for the rest of the analysis.
# "Phosphotyrosine; by autocatalysis" is deliberately not listed: replace_label
# has already collapsed every label containing "phosphotyrosine" to
# "Phosphotyrosine" before this filter runs, so that entry could never match.
labels_to_keep = [
    "N-linked (GlcNAc...) asparagine",
    "Pyrrolidone carboxylic acid",
    "Phosphoserine",
    "Phosphothreonine",
    "N-acetylalanine",
    "N-acetylmethionine",
    "N6-acetyllysine",
    "Phosphotyrosine",
    "S-diacylglycerol cysteine",
    "N6-(pyridoxal phosphate)lysine",
    "N-acetylserine",
    "N6-carboxylysine",
    "N6-succinyllysine",
    "S-palmitoyl cysteine",
    "O-(pantetheine 4'-phosphoryl)serine",
    "Sulfotyrosine",
    "O-linked (GalNAc...) threonine",
    "Omega-N-methylarginine",
    "N-myristoyl glycine",
    "4-hydroxyproline",
    "Asymmetric dimethylarginine",
    "N5-methylglutamine",
    "4-aspartylphosphate",
    "S-geranylgeranyl cysteine",
    "4-carboxyglutamate",
]

# Keep only the rows whose (normalised) label is in the list above.
df = df[df['label'].isin(labels_to_keep)]
df

Unnamed: 0,AC_ID,pos,label,ori_seq,token
7,Q8VBW9,4,N-linked (GlcNAc...) asparagine,MPGQNYSTISEFILFGFSAFPHQMLPALFLLYLLMYLFTLLGNLVI...,<N-linked (GlcNAc...) asparagine>
8,Q60885,4,N-linked (GlcNAc...) asparagine,MGDDNDTDITEFILLGFSGYGFLQGHLFWGVLCIYVVTLLGNSLIV...,<N-linked (GlcNAc...) asparagine>
75,P13744,21,Pyrrolidone carboxylic acid,MARSSLFTFLCLAVFINGCLSQIEQQSPWEFQGSEVWQQHRYQSPR...,<Pyrrolidone carboxylic acid>
89,P48347,64,Phosphoserine,MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...,<Phosphoserine>
90,P48347,187,Phosphoserine,MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...,<Phosphoserine>
...,...,...,...,...,...
791254,Q6KAQ7,137,Phosphoserine,MVGTCHSMAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEI...,<Phosphoserine>
791255,Q6KAQ7,141,Phosphoserine,MVGTCHSMAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEI...,<Phosphoserine>
791256,Q6KAQ7,400,N6-acetyllysine,MVGTCHSMAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEI...,<N6-acetyllysine>
791257,Q6KAQ7,612,Phosphoserine,MVGTCHSMAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEI...,<Phosphoserine>


In [6]:
# Distinct token strings, in order of first appearance.
df['token'].drop_duplicates().tolist()


['<N-linked (GlcNAc...) asparagine>',
 '<Pyrrolidone carboxylic acid>',
 '<Phosphoserine>',
 '<Phosphothreonine>',
 '<N-acetylalanine>',
 '<N-acetylmethionine>',
 '<N6-acetyllysine>',
 '<Phosphotyrosine>',
 '<S-diacylglycerol cysteine>',
 '<N6-(pyridoxal phosphate)lysine>',
 '<N-acetylserine>',
 '<N6-carboxylysine>',
 '<N6-succinyllysine>',
 '<S-palmitoyl cysteine>',
 "<O-(pantetheine 4'-phosphoryl)serine>",
 '<Sulfotyrosine>',
 '<O-linked (GalNAc...) threonine>',
 '<Omega-N-methylarginine>',
 '<N-myristoyl glycine>',
 '<4-hydroxyproline>',
 '<Asymmetric dimethylarginine>',
 '<N5-methylglutamine>',
 '<4-aspartylphosphate>',
 '<S-geranylgeranyl cysteine>',
 '<4-carboxyglutamate>']

In [65]:
# Row count per label, most frequent first.
df['label'].value_counts(sort=True, ascending=False)


label
Phosphoserine                          113822
N-linked (GlcNAc...) asparagine         99830
Phosphothreonine                        24487
N6-acetyllysine                         17761
Phosphotyrosine                          9480
N6-(pyridoxal phosphate)lysine           6652
4-hydroxyproline                         4202
N-acetylalanine                          3903
N6-succinyllysine                        3257
N-acetylmethionine                       3142
S-palmitoyl cysteine                     2880
Omega-N-methylarginine                   2436
S-diacylglycerol cysteine                2216
N6-carboxylysine                         2131
O-(pantetheine 4'-phosphoryl)serine      1940
N-acetylserine                           1915
O-linked (GalNAc...) threonine           1768
Pyrrolidone carboxylic acid              1603
N5-methylglutamine                       1442
4-carboxyglutamate                       1149
S-geranylgeranyl cysteine                1117
Asymmetric dimethylarginine 

In [34]:
# Display only the first row seen for each distinct sequence; `df` itself is
# left unchanged because the result is not assigned.
df.loc[~df['ori_seq'].duplicated(keep='first')]

Unnamed: 0,AC_ID,pos,label,ori_seq,token
7,Q8VBW9,4,N-linked (GlcNAc...) asparagine,MPGQNYSTISEFILFGFSAFPHQMLPALFLLYLLMYLFTLLGNLVI...,<N-linked (GlcNAc...) asparagine>
8,Q60885,4,N-linked (GlcNAc...) asparagine,MGDDNDTDITEFILLGFSGYGFLQGHLFWGVLCIYVVTLLGNSLIV...,<N-linked (GlcNAc...) asparagine>
75,P13744,21,Pyrrolidone carboxylic acid,MARSSLFTFLCLAVFINGCLSQIEQQSPWEFQGSEVWQQHRYQSPR...,<Pyrrolidone carboxylic acid>
89,P48347,64,Phosphoserine,MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...,<Phosphoserine>
92,Q9S9Z8,64,Phosphoserine,MENERAKQVYLAKLNEQAERYDEMVEAMKKVAALDVELTIEERNLL...,<Phosphoserine>
...,...,...,...,...,...
791173,Q62523,1,N-acetylalanine,MAAPRPPPAISVSVSAPAFYAPQKKFAPVVAPKPKVNPFRPGDSEP...,<N-acetylalanine>
791190,O43149,239,Phosphoserine,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,<Phosphoserine>
791219,Q5SSH7,239,Phosphoserine,MGNAPSNSSEDEAAAAGGEGWSPHQDWAADSGTTPGPGPAAAVLPS...,<Phosphoserine>
791234,Q8IYH5,81,Phosphoserine,MAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEISSNSQVR...,<Phosphoserine>


In [8]:
# Number of distinct AC_ID values (missing values, if any, count as one).
print("num of sequences: ", df['AC_ID'].nunique(dropna=False))

num of sequences:  208520


In [66]:
# Summary statistics of how many labelled rows each AC_ID has.
df.groupby('AC_ID')['label'].count().describe()

count    86431.000000
mean         3.602295
std          5.313525
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max        263.000000
Name: label, dtype: float64