# Motif presence analysis

In [2]:
import pandas as pd

In [3]:
import ast

In [4]:
import numpy as np

## Load the data

In [11]:
# Load the TPS database table from its CSV file (path is relative to the working directory).
tps_db_csv = '../data/tps_db.csv'
df = pd.read_csv(tps_db_csv)

In [14]:
# Columns kept for the motif analysis.
unique_columns = [
    'id', 'name', 'sequence', 'species', 'kingdom',
    'pfam_architecture', 'supfam_architecture',
    'PF06330.14', 'PF01397.24', 'PF03936.19', 'PF00494.22',
    'PF13249.9', 'PF19086.3', 'PF13243.9',
    '0041184', '0053354', '0053355', '0048261', '0048806', '0046340', '0047573',
    'motifs', 'DDXXD', 'NSE/DTE', 'DXDD', 'length',
]
# Keep only those columns and drop rows that are identical across all of them.
df_unique = df.loc[:, unique_columns].drop_duplicates()

## Motif presence

**Motif definitions**

**Class I**
- **motif DDXXD**:
    - DDXXD
    - DDXX[DE]
- **motif NSE/DTE**
    - [ND]DXX[ST]XXXE
    - [ND]D[LIV]X[ST]XXXE
    - [ND][DE]XX[ST]XX[NKR][DE]
    
**Class II**    
- **motif DXDD**
    - DXDD



In [17]:
def _parse_motifs(value):
    """Return the motifs stored in one `motifs` cell as a set.

    - A string is parsed with `ast.literal_eval` (the CSV stores the Python
      literal) and converted to a set.
    - A missing value (None, or the float NaN pandas uses for an empty CSV
      field) becomes an empty set, so every entry has the same type.
    - An already-parsed collection is copied into a set, which makes this cell
      safe to re-run without restarting the kernel.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    elif value is None or isinstance(value, float):
        return set()
    return set(value)


df_unique['motifs'] = df_unique['motifs'].apply(_parse_motifs)

In [18]:
# Number of rows per distinct motif combination.
df_unique.loc[:, 'motifs'].value_counts()

motifs
{DDXXD, NSE/DTE}          527
{DDXXD}                   219
{}                        136
{DXDD, DDXXD, NSE/DTE}     72
{NSE/DTE}                  69
{DXDD}                     42
{DXDD, DDXXD}              40
{DXDD, NSE/DTE}            21
Name: count, dtype: int64

### Class I motifs

864 sequences have the class I DDXXD motif

In [58]:
# (rows, columns) of the rows flagged with the DDXXD motif.
has_ddxxd = df_unique['DDXXD']
df_unique.loc[has_ddxxd].shape

(864, 16)

693 sequences have the class I NSE/DTE motif

In [59]:
# (rows, columns) of the rows flagged with the NSE/DTE motif.
has_nse_dte = df_unique['NSE/DTE']
df_unique.loc[has_nse_dte].shape

(693, 16)

603 sequences have both the DDXXD and NSE/DTE motifs

In [60]:
# (rows, columns) of the rows flagged with both NSE/DTE and DDXXD.
has_both_class1 = df_unique['NSE/DTE'] & df_unique['DDXXD']
df_unique.loc[has_both_class1].shape

(603, 16)

### Class II motif

175 sequences have the Class II DXDD motif

In [63]:
# (rows, columns) of the rows flagged with the DXDD motif.
has_dxdd = df_unique['DXDD']
df_unique.loc[has_dxdd].shape

(175, 16)

### TPS without motif

There are 137 sequences without any of the three motifs (12%)

This may be due to slight variations in the motifs that the patterns above do not match

In [76]:
# Percentage of sequences in which none of the three motifs was detected.
# The count is derived from the table rather than typed in, so the figure
# cannot go stale when the underlying data changes.
no_motif = df_unique[['NSE/DTE', 'DDXXD', 'DXDD']].eq(False).all(axis=1)
len(df_unique.loc[no_motif]) / df_unique.shape[0] * 100

12.09179170344219

In [77]:
# Number of rows where all three motif flags are False.
motif_flags = df_unique[['NSE/DTE', 'DDXXD', 'DXDD']]
len(df_unique.loc[motif_flags.eq(False).all(axis=1)])

137

In [79]:
# NOTE(review): 'Type (mono, sesq, di, …)' is not among the columns selected into
# `df_unique` in the "Load the data" cell, so this lookup presumably raises a
# KeyError on a fresh Restart & Run All — confirm which frame/column is intended
# (the per-type sections below filter on df['type'] instead).
df_unique['Type (mono, sesq, di, …)']

0       ['sesq' 'mono']
1         ['di' 'sesq']
2                ['di']
3            ['di-int']
4                ['di']
             ...       
1128         ['di-int']
1129           ['sesq']
1130           ['mono']
1131           ['sesq']
1132            ['tri']
Name: Type (mono, sesq, di, …), Length: 1133, dtype: object

In [85]:
# Columns kept for the per-type summaries below: id, kingdom, type and the three motif flags.
cols = ['id','kingdom','type','DDXXD','NSE/DTE','DXDD']

### MonoTPS

In [131]:
# Rows of the full table whose type is 'mono'.
is_mono = df['type'].eq('mono')
sele = df.loc[is_mono]

In [132]:
# Keep the summary columns and the first row for each id.
sele = sele.loc[:, cols].drop_duplicates(subset=['id'])

In [133]:
# Sanity check: the number of rows equals the number of distinct ids.
assert len(sele) == sele['id'].nunique()

In [None]:
# (rows, columns) of the current `sele` selection.
sele.shape

(243, 6)

In [140]:
# Share of rows in `sele` carrying at least one of the three motifs.
any_motif = sele[['NSE/DTE', 'DDXXD', 'DXDD']].eq(True).any(axis=1)
len(sele.loc[any_motif]) / len(sele)

0.9958847736625515

In [101]:
# Proportion of rows in `sele` with (True) and without (False) the DDXXD motif.
sele.loc[:, 'DDXXD'].value_counts(normalize=True)

DDXXD
True     0.967078
False    0.032922
Name: proportion, dtype: float64

In [103]:
# Proportion of rows in `sele` with (True) and without (False) the NSE/DTE motif.
sele.loc[:, 'NSE/DTE'].value_counts(normalize=True)

NSE/DTE
True     0.63786
False    0.36214
Name: proportion, dtype: float64

In [104]:
# Proportion of rows in `sele` with (True) and without (False) the DXDD motif.
sele.loc[:, 'DXDD'].value_counts(normalize=True)

DXDD
False    0.930041
True     0.069959
Name: proportion, dtype: float64

### SesquiTPS

In [141]:
# Select the rows of type 'sesq', keep the summary columns and the first row per id.
is_sesq = df['type'].eq('sesq')
sele = df.loc[is_sesq, cols].drop_duplicates(subset=['id'])
# Sanity check: the number of rows equals the number of distinct ids.
assert len(sele) == sele['id'].nunique()

In [None]:
# Share of rows in `sele` carrying at least one of the three motifs.
any_motif = sele[['NSE/DTE', 'DDXXD', 'DXDD']].eq(True).any(axis=1)
len(sele.loc[any_motif]) / len(sele)

In [106]:
# Proportion of rows in `sele` with (True) and without (False) the DDXXD motif.
sele.loc[:, 'DDXXD'].value_counts(normalize=True)

DDXXD
True     0.858491
False    0.141509
Name: proportion, dtype: float64

In [107]:
# Proportion of rows in `sele` with (True) and without (False) the NSE/DTE motif.
sele.loc[:, 'NSE/DTE'].value_counts(normalize=True)

NSE/DTE
True     0.758491
False    0.241509
Name: proportion, dtype: float64

In [108]:
# Proportion of rows in `sele` with (True) and without (False) the DXDD motif.
sele.loc[:, 'DXDD'].value_counts(normalize=True)

DXDD
False    0.879245
True     0.120755
Name: proportion, dtype: float64

### DiTPS

In [145]:
# Select the rows of type 'di', keep the summary columns and the first row per id.
is_di = df['type'].eq('di')
sele = df.loc[is_di, cols].drop_duplicates(subset=['id'])
# Sanity check: the number of rows equals the number of distinct ids.
assert len(sele) == sele['id'].nunique()

In [146]:
# Share of rows in `sele` carrying at least one of the three motifs.
any_motif = sele[['NSE/DTE', 'DDXXD', 'DXDD']].eq(True).any(axis=1)
len(sele.loc[any_motif]) / len(sele)

0.9951690821256038

In [None]:
# Proportion of rows in `sele` with (True) and without (False) the DDXXD motif.
sele.loc[:, 'DDXXD'].value_counts(normalize=True)

DDXXD
True     0.78744
False    0.21256
Name: proportion, dtype: float64

In [111]:
# Proportion of rows in `sele` with (True) and without (False) the NSE/DTE motif.
sele.loc[:, 'NSE/DTE'].value_counts(normalize=True)

NSE/DTE
True     0.628019
False    0.371981
Name: proportion, dtype: float64

In [112]:
# Proportion of rows in `sele` with (True) and without (False) the DXDD motif.
sele.loc[:, 'DXDD'].value_counts(normalize=True)

DXDD
False    0.613527
True     0.386473
Name: proportion, dtype: float64

### SesterTPS

In [147]:
# Select the rows of type 'sester', keep the summary columns and the first row per id.
is_sester = df['type'].eq('sester')
sele = df.loc[is_sester, cols].drop_duplicates(subset=['id'])
# Sanity check: the number of rows equals the number of distinct ids.
assert len(sele) == sele['id'].nunique()

In [148]:
# Share of rows in `sele` carrying at least one of the three motifs.
any_motif = sele[['NSE/DTE', 'DDXXD', 'DXDD']].eq(True).any(axis=1)
len(sele.loc[any_motif]) / len(sele)

0.9795918367346939

In [114]:
# Proportion of rows in `sele` with (True) and without (False) the DDXXD motif.
sele.loc[:, 'DDXXD'].value_counts(normalize=True)

DDXXD
True     0.959184
False    0.040816
Name: proportion, dtype: float64

In [115]:
# Proportion of rows in `sele` with (True) and without (False) the NSE/DTE motif.
sele.loc[:, 'NSE/DTE'].value_counts(normalize=True)

NSE/DTE
True     0.938776
False    0.061224
Name: proportion, dtype: float64

In [116]:
# Proportion of rows in `sele` with (True) and without (False) the DXDD motif.
sele.loc[:, 'DXDD'].value_counts(normalize=True)

DXDD
False    0.877551
True     0.122449
Name: proportion, dtype: float64

### TriTPS

In [150]:
# Select the rows of type 'tri', keep the summary columns and the first row per id.
is_tri = df['type'].eq('tri')
sele = df.loc[is_tri, cols].drop_duplicates(subset=['id'])
# Sanity check: the number of rows equals the number of distinct ids.
assert len(sele) == sele['id'].nunique()

In [151]:
# Share of rows in `sele` carrying at least one of the three motifs.
any_motif = sele[['NSE/DTE', 'DDXXD', 'DXDD']].eq(True).any(axis=1)
len(sele.loc[any_motif]) / len(sele)

0.18120805369127516

In [118]:
# Proportion of rows in `sele` with (True) and without (False) the DDXXD motif.
sele.loc[:, 'DDXXD'].value_counts(normalize=True)

DDXXD
False    0.879195
True     0.120805
Name: proportion, dtype: float64

In [119]:
# Proportion of rows in `sele` with (True) and without (False) the NSE/DTE motif.
sele.loc[:, 'NSE/DTE'].value_counts(normalize=True)

NSE/DTE
False    0.979866
True     0.020134
Name: proportion, dtype: float64

In [120]:
# Proportion of rows in `sele` with (True) and without (False) the DXDD motif.
sele.loc[:, 'DXDD'].value_counts(normalize=True)

DXDD
False    0.90604
True     0.09396
Name: proportion, dtype: float64

### SesquarTPS

In [152]:
# Select the rows of type 'sesquar', keep the summary columns and the first row per id.
is_sesquar = df['type'].eq('sesquar')
sele = df.loc[is_sesquar, cols].drop_duplicates(subset=['id'])
# Sanity check: the number of rows equals the number of distinct ids.
assert len(sele) == sele['id'].nunique()

In [153]:
# Share of rows in `sele` carrying at least one of the three motifs.
any_motif = sele[['NSE/DTE', 'DDXXD', 'DXDD']].eq(True).any(axis=1)
len(sele.loc[any_motif]) / len(sele)

0.5

In [None]:
# Proportion of rows in `sele` with (True) and without (False) the DDXXD motif.
sele.loc[:, 'DDXXD'].value_counts(normalize=True)

In [123]:
# Proportion of rows in `sele` with (True) and without (False) the NSE/DTE motif.
sele.loc[:, 'NSE/DTE'].value_counts(normalize=True)

NSE/DTE
False    1.0
Name: proportion, dtype: float64

In [124]:
# Proportion of rows in `sele` with (True) and without (False) the DXDD motif.
sele.loc[:, 'DXDD'].value_counts(normalize=True)

DXDD
True     0.5
False    0.5
Name: proportion, dtype: float64

### TetraTPS

In [154]:
# Select the rows of type 'tetra', keep the summary columns and the first row per id.
is_tetra = df['type'].eq('tetra')
sele = df.loc[is_tetra, cols].drop_duplicates(subset=['id'])
# Sanity check: the number of rows equals the number of distinct ids.
assert len(sele) == sele['id'].nunique()

In [155]:
# Share of rows in `sele` carrying at least one of the three motifs.
any_motif = sele[['NSE/DTE', 'DDXXD', 'DXDD']].eq(True).any(axis=1)
len(sele.loc[any_motif]) / len(sele)

0.3888888888888889

In [128]:
# Proportion of rows in `sele` with (True) and without (False) the DDXXD motif.
sele.loc[:, 'DDXXD'].value_counts(normalize=True)

DDXXD
False    0.611111
True     0.388889
Name: proportion, dtype: float64

In [129]:
# Proportion of rows in `sele` with (True) and without (False) the NSE/DTE motif.
sele.loc[:, 'NSE/DTE'].value_counts(normalize=True)

NSE/DTE
False    1.0
Name: proportion, dtype: float64

In [130]:
# Proportion of rows in `sele` with (True) and without (False) the DXDD motif.
sele.loc[:, 'DXDD'].value_counts(normalize=True)

DXDD
False    0.944444
True     0.055556
Name: proportion, dtype: float64