In [1]:
import pandas as pd
import numpy as np
import pgmpy

In [2]:
df_raw = pd.read_csv('Data_high_dosage.csv')
print(df_raw.head())
print(df_raw.shape)

                       spacer_seq transcript  dosage (ng)  \
0  CCAUGUUAUCCUCCUCGCCCUUGCUCACCA    mCherry           20   
1  GGCCAUGUUAUCCUCCUCGCCCUUGCUCAC    mCherry           20   
2  GGAGCCCUCCAUGUGCACCUUGAAGCGCAU    mCherry           20   
3  CACGGAGCCCUCCAUGUGCACCUUGAAGCG    mCherry           20   
4  GGCGGUCUGGGUGCCCUCGUAGGGGCGGCC    mCherry           20   

   mean_targeting_efficiency  
0                   0.560839  
1                   0.994959  
2                   0.837032  
3                   0.409467  
4                   0.994368  
(245, 4)


In [3]:
df_spacer_split = df_raw['spacer_seq'].apply(lambda x: pd.Series(list(x)))
df_spacer_split.columns = [f'SP{str(i+1).zfill(2)}' for i in range(df_spacer_split.shape[1])]
df = pd.concat([df_spacer_split, df_raw.drop(['spacer_seq', 'dosage (ng)'], axis=1)], axis=1)
df['mean_targeting_efficiency'] = df['mean_targeting_efficiency'].apply(lambda x: ((x+0.05) // 0.1) / 10)
print(df.head())

  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    C    C    A    U    G    U    U    A    U    C  ...    G    C    U    C   
1    G    G    C    C    A    U    G    U    U    A  ...    U    U    G    C   
2    G    G    A    G    C    C    C    U    C    C  ...    A    A    G    C   
3    C    A    C    G    G    A    G    C    C    C  ...    U    U    G    A   
4    G    G    C    G    G    U    C    U    G    G  ...    G    G    G    C   

  SP27 SP28 SP29 SP30 transcript mean_targeting_efficiency  
0    A    C    C    A    mCherry                       0.6  
1    U    C    A    C    mCherry                       1.0  
2    G    C    A    U    mCherry                       0.8  
3    A    G    C    G    mCherry                       0.4  
4    G    G    C    C    mCherry                       1.0  

[5 rows x 32 columns]


In [4]:
transcripts = df['transcript'].unique() 
print(transcripts)
df_trans = []
for trans in transcripts:
    df_trans.append(df[df['transcript'] == trans])
    print(df_trans[-1].shape)

print(df_trans[0].head())
print(df_trans[-1].head())

['mCherry' 'BCR-ABL' 'SFPQ-ABL' 'SNX2-ABL' 'Spike' 'eGFP' 'TagBFP']
(86, 32)
(41, 32)
(9, 32)
(9, 32)
(61, 32)
(20, 32)
(19, 32)
  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    C    C    A    U    G    U    U    A    U    C  ...    G    C    U    C   
1    G    G    C    C    A    U    G    U    U    A  ...    U    U    G    C   
2    G    G    A    G    C    C    C    U    C    C  ...    A    A    G    C   
3    C    A    C    G    G    A    G    C    C    C  ...    U    U    G    A   
4    G    G    C    G    G    U    C    U    G    G  ...    G    G    G    C   

  SP27 SP28 SP29 SP30 transcript mean_targeting_efficiency  
0    A    C    C    A    mCherry                       0.6  
1    U    C    A    C    mCherry                       1.0  
2    G    C    A    U    mCherry                       0.8  
3    A    G    C    G    mCherry                       0.4  
4    G    G    C    C    mCherry                       1.0  

[5 rows x 32 columns]
 

In [5]:
shuffled_df_trans = []
for df_t in df_trans:
    shuffled_df_trans.append(df_t.sample(frac=1).reset_index(drop=True))
    print(shuffled_df_trans[-1].head())

# print(shuffled_df_trans)

  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    C    C    U    C    U    G    C    U    U    G  ...    A    G    G    G   
1    G    G    G    U    A    C    A    U    C    C  ...    C    C    U    C   
2    C    G    G    A    G    G    A    G    G    C  ...    C    G    U    C   
3    C    U    C    G    U    C    C    A    U    G  ...    U    G    G    C   
4    G    C    C    U    C    U    G    C    U    U  ...    C    A    G    G   

  SP27 SP28 SP29 SP30 transcript mean_targeting_efficiency  
0    C    G    C    C    mCherry                       0.2  
1    C    C    A    G    mCherry                       1.0  
2    U    U    C    U    mCherry                       0.8  
3    G    G    C    C    mCherry                       0.0  
4    G    C    G    C    mCherry                       0.6  

[5 rows x 32 columns]
  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    U    G    C    G    U    C    U    C    C    

In [6]:
spacer_len = df_trans[0].shape[1] - 3
print(f"Space length is: {spacer_len}")

Space length is: 29


In [7]:
train_lengths = [int(df_t.shape[0]*0.8) for df_t in shuffled_df_trans]
print(train_lengths)
train_data = pd.concat([shuffled_df_trans[i].iloc[:train_lengths[i]] for i in range(len(train_lengths))])
test_data = pd.concat([shuffled_df_trans[i].iloc[train_lengths[i]:] for i in range(len(train_lengths))])

print("Train Data")
print(train_data.head())
print(train_data.shape)
print()

print("Test Data:")
print(test_data.head())
print(test_data.shape)

[68, 32, 7, 7, 48, 16, 15]
Train Data
  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    C    C    U    C    U    G    C    U    U    G  ...    A    G    G    G   
1    G    G    G    U    A    C    A    U    C    C  ...    C    C    U    C   
2    C    G    G    A    G    G    A    G    G    C  ...    C    G    U    C   
3    C    U    C    G    U    C    C    A    U    G  ...    U    G    G    C   
4    G    C    C    U    C    U    G    C    U    U  ...    C    A    G    G   

  SP27 SP28 SP29 SP30 transcript mean_targeting_efficiency  
0    C    G    C    C    mCherry                       0.2  
1    C    C    A    G    mCherry                       1.0  
2    U    U    C    U    mCherry                       0.8  
3    G    G    C    C    mCherry                       0.0  
4    G    C    G    C    mCherry                       0.6  

[5 rows x 32 columns]
(193, 32)

Test Data:
   SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP

In [8]:
from pgmpy.models import BayesianNetwork

model = BayesianNetwork()
model.add_edges_from([(df.columns[-1], s) for s in df.columns[:-1]])
model.to_graphviz().draw("naive-bayes.png", prog="dot")
# model_daft = model.to_daft()
# model_daft.render()
# model_daft.savefig('naive-bayes2.png')

from pgmpy.estimators import MaximumLikelihoodEstimator
model.fit(train_data, estimator=MaximumLikelihoodEstimator)
cpds = model.get_cpds()
for cpd in cpds:
    print(cpd)

# DO NOT UNCOMMENT - LOADING MODELS KILLED MY PC TWICE!!!
# model.save('naive-model.bif', filetype='bif')

  from .autonotebook import tqdm as notebook_tqdm


+---------------------------------+------------+
| mean_targeting_efficiency(-0.1) | 0.00518135 |
+---------------------------------+------------+
| mean_targeting_efficiency(0.0)  | 0.0207254  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.1)  | 0.0207254  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.2)  | 0.0259067  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.3)  | 0.0362694  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.4)  | 0.0466321  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.5)  | 0.0621762  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.6)  | 0.0414508  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.7)  | 0.0932642  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.8)  | 0.170984   |
+-------------------

In [9]:
test_pred = model.predict(test_data.iloc[:, :-1])
print(test_pred)

100%|██████████| 52/52 [00:06<00:00,  8.55it/s]


    mean_targeting_efficiency
0                         1.0
1                         0.7
2                         0.6
3                         1.0
4                         1.0
5                         1.0
6                         1.0
7                         1.0
8                         1.0
9                         0.9
10                        1.0
11                        0.5
12                        1.0
13                        1.0
14                        1.0
15                        0.5
16                        1.0
17                        1.0
18                        0.9
19                        0.9
20                        0.7
21                        0.7
22                        0.5
23                        0.9
24                        0.9
25                        0.9
26                        0.9
27                        1.0
28                        0.9
29                        0.8
30                        0.9
31                        0.8
32        

In [25]:
errors = pd.Series([test_data['mean_targeting_efficiency'].iloc[i] - test_pred['mean_targeting_efficiency'].iloc[i] for i in range(test_pred.shape[0])])
print("Errors:")
print(errors)
print("Loss: ", np.sum(np.square(errors)))

Errors:
0    -0.2
1     0.3
2     0.3
3     0.0
4    -0.4
5     0.0
6    -0.7
7     0.0
8    -0.4
9     0.1
10   -0.7
11   -0.5
12   -0.2
13   -0.2
14    0.0
15    0.4
16   -0.1
17   -0.5
18    0.1
19   -0.3
20   -0.1
21    0.2
22    0.4
23    0.0
24    0.0
25   -0.1
26   -0.2
27    0.0
28    0.0
29    0.0
30   -0.1
31    0.1
32    0.2
33   -0.1
34   -0.2
35   -0.2
36   -0.2
37    0.1
38    0.0
39   -0.2
40    0.2
41    0.1
42   -0.1
43    0.2
44   -0.5
45    0.2
46    0.1
47   -0.5
48   -0.3
49    0.0
50   -0.4
51   -0.6
dtype: float64
Loss:  4.140000000000001


In [26]:
dumb_est = np.mean(train_data['mean_targeting_efficiency'])
print("Always guess efficiency of:", dumb_est)
errors2 = test_data['mean_targeting_efficiency'] - dumb_est
print("Errors:")
print(errors2)
print("Loss: ", np.sum(np.square(errors2)))

Always guess efficiency of: 0.7430051813471503
Errors:
68    0.056995
69    0.256995
70    0.156995
71    0.256995
72   -0.143005
73    0.256995
74   -0.443005
75    0.256995
76   -0.143005
77    0.256995
78   -0.443005
79   -0.743005
80    0.056995
81    0.056995
82    0.256995
83    0.156995
84    0.156995
85   -0.243005
32    0.256995
33   -0.143005
34   -0.143005
35    0.156995
36    0.156995
37    0.156995
38    0.156995
39    0.056995
40   -0.043005
7     0.256995
8     0.156995
7     0.056995
8     0.056995
48    0.156995
49    0.156995
50    0.056995
51   -0.043005
52   -0.043005
53   -0.043005
54    0.056995
55    0.056995
56    0.056995
57    0.156995
58    0.156995
59    0.056995
60    0.156995
16   -0.343005
17    0.256995
18    0.156995
19   -0.443005
15   -0.243005
16    0.056995
17   -0.343005
18   -0.443005
Name: mean_targeting_efficiency, dtype: float64
Loss:  2.7581400842975654
