### Analysis for Linear model

# 1. Loading the data

In [20]:
import pandas as pd
import utils
import re
import numpy as np

In [23]:
# Path of the merged tab-separated dataset, relative to the working directory
DATA_PATH = './data/merged_data.tsv'
# read_table uses a tab as its default field separator
data = pd.read_table(DATA_PATH)

In [24]:
# Show the loaded frame as this cell's output (pandas elides the middle rows)
data

Unnamed: 0.1,Unnamed: 0,EPSD ID,AA,Position,phos_site_cnt,seq,seq_len
0,0,EP0000001,"['T', 'S', 'S', 'S']","[138, 139, 206, 207]",4,[12 10 2 15 19 9 11 1 15 0 10 4 1 13 11 13 16 ...,511
1,1,EP0000002,"['S', 'S', 'S', 'S']","[163, 485, 498, 53]",4,[12 15 15 0 2 10 15 9 11 6 2 7 0 12 19 19 13 7...,500
2,2,EP0000004,"['S', 'S']","[17, 18]",2,[12 3 15 3 12 5 2 5 2 14 8 16 2 15 11 2 15 15 ...,176
3,3,EP0000005,"['S', 'Y', 'T', 'T', 'S', 'S', 'S']","[117, 29, 36, 49, 54, 69, 86]",7,[12 14 1 0 13 10 19 1 15 1 1 14 5 14 14 2 17 7...,214
4,4,EP0000006,"['Y', 'S', 'Y', 'S', 'S']","[20, 355, 358, 41, 70]",5,[12 9 14 10 10 10 0 0 10 10 4 19 14 0 7 0 10 1...,360
...,...,...,...,...,...,...,...
103686,103686,EP0209260,"['T', 'S']","[19, 21]",2,[12 7 3 1 15 19 3 6 14 0 19 7 4 15 3 3 0 6 16 ...,801
103687,103687,EP0209262,['S'],[166],1,[12 5 16 7 1 14 14 14 13 13 11 5 12 8 18 14 8 ...,861
103688,103688,EP0209265,"['T', 'S', 'S', 'S', 'S', 'S', 'T', 'T', 'S', ...","[1069, 1076, 1091, 1094, 1598, 1599, 1603, 160...",11,[12 14 3 15 9 16 2 7 7 1 14 14 0 14 14 15 15 1...,1850
103689,103689,EP0209266,"['T', 'S', 'S', 'S', 'S', 'T', 'S', 'S', 'T', ...","[1028, 1029, 1030, 1031, 1037, 1039, 1065, 106...",19,[12 6 3 11 16 15 11 6 11 6 11 15 0 3 11 1 9 3 ...,2318


### Determining the positive weight for the BCE loss

In [14]:
# Determining the positive weight for the BCE loss

def count_STY(seq):
    '''
    Count the residues of one protein that could be phosphorylated (S, T or Y).

    Parameters
    ----------
    seq : str or array-like of int
        Ordinal-encoded amino-acid sequence. Either its text form as held in
        the `seq` column (e.g. '[12 10 2 15]'; every run of digits is read as
        one ordinal) or an already-parsed sequence of ordinals such as a
        list, np.ndarray or pd.Series.

    Returns
    -------
    int
        NumPy integer: number of positions whose ordinal equals
        utils.aa_to_ord('S'), utils.aa_to_ord('T') or utils.aa_to_ord('Y').
    '''
    if isinstance(seq, str):
        # NOTE(review): if this text came from str() on a long NumPy array it
        # may contain an elided '...' section, which would silently
        # undercount -- verify how the column was written.
        ordinals = np.array([int(tok) for tok in re.findall(r'\d+', seq)])
    else:
        # Already-parsed input: use the ordinals as they are
        ordinals = np.asarray(seq)

    # Look up the three target ordinals once and test membership in one pass
    sty_ordinals = [utils.aa_to_ord(aa) for aa in 'STY']
    return np.isin(ordinals, sty_ordinals).sum()

# Denominator: every S/T/Y residue, summed over all proteins, is a candidate site
total_count = data['seq'].apply(count_STY).sum()
print('Total number of [S T Y] AAs : %d' % total_count)

# Numerator: annotated phosphorylation sites, summed over all proteins
ph_count = data['phos_site_cnt'].sum()
print('Total number of phosphorelations : %d' % ph_count)

# Fraction of candidate residues that are annotated as phosphorylated.
# NOTE(review): this is positives / candidates; PyTorch's `pos_weight` is
# usually negatives / positives -- confirm how `ratio` becomes the loss weight.
ratio = ph_count / total_count
print('Ratio : %.5lf' % ratio)

Total number of [S T Y] AAs : 10002126
Total number of phosphorelations : 1146437
Ratio : 0.11462
