# Naive Bayes PGM

In [1]:
import pandas as pd
import numpy as np
import pgmpy

## Load Data

In [None]:
df_raw = pd.read_csv('crispr13d_lfc.csv')
print(df_raw.head())
print(df_raw.shape)

   CTGCAGGACAGGTGGGCGTGGCCCGAGCNNNNN  -0.0160791495437025
0  GCTGCAGGACAGGTGGGCGTGGCCCGAGCNNNN             0.169027
1  TGCTGCAGGACAGGTGGGCGTGGCCCGAGCNNN             0.068603
2  AGTGCTGCAGGACAGGTGGGCGTGGCCCGAGCN             0.218402
3  CAGTGCTGCAGGACAGGTGGGCGTGGCCCGAGC             0.028038
4  CCAGTGCTGCAGGACAGGTGGGCGTGGCCCGAG            -0.058854
(5725, 2)


### Format Data

In [14]:
df_spacer_split = df_raw['spacer_seq'].apply(lambda x: pd.Series(list(x)))
df_spacer_split.columns = [f'SP{str(i+1).zfill(2)}' for i in range(df_spacer_split.shape[1])]
df = pd.concat([df_spacer_split, df_raw.drop(['spacer_seq', 'dosage (ng)'], axis=1)], axis=1)
df['mean_targeting_efficiency'] = df['mean_targeting_efficiency'].apply(lambda x: ((x+0.05) // 0.1) / 10)
print(df.head())

  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    C    C    A    U    G    U    U    A    U    C  ...    G    C    U    C   
1    G    G    C    C    A    U    G    U    U    A  ...    U    U    G    C   
2    G    G    A    G    C    C    C    U    C    C  ...    A    A    G    C   
3    C    A    C    G    G    A    G    C    C    C  ...    U    U    G    A   
4    G    G    C    G    G    U    C    U    G    G  ...    G    G    G    C   

  SP27 SP28 SP29 SP30 transcript mean_targeting_efficiency  
0    A    C    C    A    mCherry                       0.6  
1    U    C    A    C    mCherry                       1.0  
2    G    C    A    U    mCherry                       0.8  
3    A    G    C    G    mCherry                       0.4  
4    G    G    C    C    mCherry                       1.0  

[5 rows x 32 columns]


### Split Data by Transcript

In [15]:
transcripts = df['transcript'].unique() 
print(transcripts)
df_trans = []
for trans in transcripts:
    df_trans.append(df[df['transcript'] == trans])
    print(df_trans[-1].shape)

print(df_trans[0].head())
print(df_trans[-1].head())

['mCherry' 'BCR-ABL' 'SFPQ-ABL' 'SNX2-ABL' 'Spike' 'eGFP' 'TagBFP']
(86, 32)
(41, 32)
(9, 32)
(9, 32)
(61, 32)
(20, 32)
(19, 32)
  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    C    C    A    U    G    U    U    A    U    C  ...    G    C    U    C   
1    G    G    C    C    A    U    G    U    U    A  ...    U    U    G    C   
2    G    G    A    G    C    C    C    U    C    C  ...    A    A    G    C   
3    C    A    C    G    G    A    G    C    C    C  ...    U    U    G    A   
4    G    G    C    G    G    U    C    U    G    G  ...    G    G    G    C   

  SP27 SP28 SP29 SP30 transcript mean_targeting_efficiency  
0    A    C    C    A    mCherry                       0.6  
1    U    C    A    C    mCherry                       1.0  
2    G    C    A    U    mCherry                       0.8  
3    A    G    C    G    mCherry                       0.4  
4    G    G    C    C    mCherry                       1.0  

[5 rows x 32 columns]
 

### Shuffle Data

In [16]:
shuffled_df_trans = []
for df_t in df_trans:
    shuffled_df_trans.append(df_t.sample(frac=1).reset_index(drop=True))
    print(shuffled_df_trans[-1].head())

# print(shuffled_df_trans)

  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    G    C    C    G    U    C    C    U    C    G  ...    U    C    G    G   
1    G    C    U    C    G    G    A    G    G    A  ...    C    A    U    C   
2    C    U    C    U    G    C    U    U    G    A  ...    G    G    G    C   
3    C    A    G    G    G    C    G    C    C    G  ...    A    U    C    C   
4    G    G    G    G    A    A    G    G    A    C  ...    U    C    G    G   

  SP27 SP28 SP29 SP30 transcript mean_targeting_efficiency  
0    A    G    G    A    mCherry                       1.0  
1    G    U    C    U    mCherry                       0.7  
2    G    C    C    G    mCherry                       0.4  
3    G    C    U    C    mCherry                       1.0  
4    G    G    A    U    mCherry                       1.0  

[5 rows x 32 columns]
  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    G    G    C    C    G    C    U    G    A    

### Split Data - Train/Test - 80/20

In [17]:
train_lengths = [int(df_t.shape[0]*0.8) for df_t in shuffled_df_trans]
print(train_lengths)
train_data = pd.concat([shuffled_df_trans[i].iloc[:train_lengths[i]] for i in range(len(train_lengths))]).reset_index(drop=1)
test_data = pd.concat([shuffled_df_trans[i].iloc[train_lengths[i]:] for i in range(len(train_lengths))]).reset_index(drop=1)

print("Train Data")
print(train_data.head())
print(train_data.shape)
print()

print("Test Data:")
print(test_data.head())
print(test_data.shape)

[68, 32, 7, 7, 48, 16, 15]
Train Data
  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP25 SP26  \
0    G    C    C    G    U    C    C    U    C    G  ...    U    C    G    G   
1    G    C    U    C    G    G    A    G    G    A  ...    C    A    U    C   
2    C    U    C    U    G    C    U    U    G    A  ...    G    G    G    C   
3    C    A    G    G    G    C    G    C    C    G  ...    A    U    C    C   
4    G    G    G    G    A    A    G    G    A    C  ...    U    C    G    G   

  SP27 SP28 SP29 SP30 transcript mean_targeting_efficiency  
0    A    G    G    A    mCherry                       1.0  
1    G    U    C    U    mCherry                       0.7  
2    G    C    C    G    mCherry                       0.4  
3    G    C    U    C    mCherry                       1.0  
4    G    G    A    U    mCherry                       1.0  

[5 rows x 32 columns]
(193, 32)

Test Data:
  SP01 SP02 SP03 SP04 SP05 SP06 SP07 SP08 SP09 SP10  ... SP23 SP24 SP2

### Test mean estimate accuracy

In [18]:
mean_est = np.mean(train_data['mean_targeting_efficiency'])
print("Always guess efficiency of:", mean_est)
mean_errors = test_data['mean_targeting_efficiency'] - mean_est
print("Errors:")
print(mean_errors)
print("Loss: ", np.sum(np.square(mean_errors)))

Always guess efficiency of: 0.7487046632124353
Errors:
0     0.251295
1    -0.148705
2     0.251295
3    -0.448705
4     0.251295
5    -0.248705
6    -0.448705
7     0.151295
8    -0.748705
9    -0.348705
10    0.251295
11    0.251295
12   -0.248705
13   -0.648705
14   -0.348705
15   -0.248705
16    0.251295
17    0.151295
18   -0.148705
19   -0.248705
20   -0.048705
21   -0.248705
22    0.051295
23    0.151295
24    0.151295
25    0.151295
26   -0.048705
27    0.051295
28    0.251295
29    0.151295
30    0.151295
31    0.051295
32    0.051295
33    0.151295
34    0.051295
35   -0.348705
36    0.151295
37    0.151295
38   -0.048705
39    0.051295
40    0.151295
41    0.151295
42    0.151295
43    0.151295
44    0.251295
45    0.151295
46    0.251295
47    0.051295
48   -0.448705
49   -0.148705
50    0.051295
51    0.051295
Name: mean_targeting_efficiency, dtype: float64
Loss:  3.2682737791618566


## Prepare Bayesian Network - Naive Bayes

In [28]:
from pgmpy.models import BayesianNetwork

model = BayesianNetwork()
model.add_edges_from([(df.columns[-1], s) for s in df.columns[:-1]])
model.to_graphviz().draw("naive-bayes.png", prog="dot")
# model_daft = model.to_daft()
# model_daft.render()
# model_daft.savefig('naive-bayes-daft.png')

from pgmpy.estimators import MaximumLikelihoodEstimator
model.fit(train_data, estimator=MaximumLikelihoodEstimator)
cpds = model.get_cpds()
for cpd in cpds:
    print(cpd)

# DO NOT UNCOMMENT - LOADING MODELS KILLED MY PC TWICE!!!
# model.save('naive-model.bif', filetype='bif')

+---------------------------------+------------+
| mean_targeting_efficiency(-0.1) | 0.00518135 |
+---------------------------------+------------+
| mean_targeting_efficiency(0.0)  | 0.0207254  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.1)  | 0.015544   |
+---------------------------------+------------+
| mean_targeting_efficiency(0.2)  | 0.0259067  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.3)  | 0.0414508  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.4)  | 0.0414508  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.5)  | 0.0466321  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.6)  | 0.0466321  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.7)  | 0.0984456  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.8)  | 0.186528   |
+-------------------

### Train BN on Data

In [29]:
test_pred = model.predict(test_data.iloc[:, :-1])
print(test_pred)

100%|██████████| 52/52 [00:00<00:00, 1600.16it/s]

    mean_targeting_efficiency
0                         1.0
1                         1.0
2                         0.3
3                         1.0
4                         0.9
5                         1.0
6                         1.0
7                         1.0
8                         1.0
9                         1.0
10                        1.0
11                        1.0
12                        1.0
13                        1.0
14                        0.9
15                        0.3
16                        0.8
17                        0.6
18                        0.7
19                        0.9
20                        0.9
21                        0.7
22                        0.9
23                        1.0
24                        0.7
25                        0.8
26                        0.8
27                        0.9
28                        1.0
29                        1.0
30                        0.8
31                        0.8
32        




### Run BN on Data

In [30]:
errors = test_data['mean_targeting_efficiency'] - test_pred['mean_targeting_efficiency']
print("Errors:")
print(errors)
print("Loss: ", np.sum(np.square(errors)))

Errors:
0     0.0
1    -0.4
2     0.7
3    -0.7
4     0.1
5    -0.5
6    -0.7
7    -0.1
8    -1.0
9    -0.6
10    0.0
11    0.0
12   -0.5
13   -0.9
14   -0.5
15    0.2
16    0.2
17    0.3
18   -0.1
19   -0.4
20   -0.2
21   -0.2
22   -0.1
23   -0.1
24    0.2
25    0.1
26   -0.1
27   -0.1
28    0.0
29   -0.1
30    0.1
31    0.0
32   -0.1
33    0.0
34    0.1
35   -0.4
36    0.0
37    0.0
38   -0.1
39    0.0
40    0.1
41    0.0
42    0.1
43    0.0
44    0.0
45   -0.1
46    0.2
47    0.0
48   -0.6
49   -0.3
50   -0.1
51    0.0
Name: mean_targeting_efficiency, dtype: float64
Loss:  5.82


## Another model - Modified NB: Transcript becomes parent of Genes

In [31]:
from pgmpy.models import BayesianNetwork

model2 = BayesianNetwork()
model2.add_edges_from([(df.columns[-1], s) for s in df.columns[:-2]])
model2.add_edges_from([(df.columns[-2], s) for s in df.columns[:-2]])
model2.to_graphviz().draw("naive-bayes2.png", prog="dot")
# model_daft = model.to_daft()
# model_daft.render()
# model_daft.savefig('naive-bayes2.png')

from pgmpy.estimators import MaximumLikelihoodEstimator
model2.fit(train_data, estimator=MaximumLikelihoodEstimator)
cpds = model2.get_cpds()
for cpd in cpds:
    print(cpd)

# DO NOT UNCOMMENT - LOADING MODELS KILLED MY PC TWICE!!!
# model.save('naive-model.bif', filetype='bif')

+---------------------------------+------------+
| mean_targeting_efficiency(-0.1) | 0.00518135 |
+---------------------------------+------------+
| mean_targeting_efficiency(0.0)  | 0.0207254  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.1)  | 0.015544   |
+---------------------------------+------------+
| mean_targeting_efficiency(0.2)  | 0.0259067  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.3)  | 0.0414508  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.4)  | 0.0414508  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.5)  | 0.0466321  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.6)  | 0.0466321  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.7)  | 0.0984456  |
+---------------------------------+------------+
| mean_targeting_efficiency(0.8)  | 0.186528   |
+-------------------

### Train BN on Data

In [32]:
test_pred = model2.predict(test_data.iloc[:, :-1])
print(test_pred)

100%|██████████| 52/52 [00:00<00:00, 2299.95it/s]

    mean_targeting_efficiency
0                         1.0
1                         1.0
2                         1.0
3                         1.0
4                        -0.1
5                         1.0
6                        -0.1
7                         1.0
8                         1.0
9                         1.0
10                        1.0
11                        1.0
12                        1.0
13                        1.0
14                        1.0
15                        1.0
16                       -0.1
17                        1.0
18                        0.3
19                        0.3
20                        0.3
21                        0.3
22                        0.3
23                        0.3
24                        0.3
25                        0.3
26                        0.3
27                        0.7
28                        0.7
29                        0.7
30                        0.7
31                        0.5
32        


  phi.values = phi.values / phi.values.sum()
  phi.values = phi.values / phi.values.sum()
  phi.values = phi.values / phi.values.sum()


### Run BN on Data

In [33]:
errors = test_data['mean_targeting_efficiency'] - test_pred['mean_targeting_efficiency']
print("Errors:")
print(errors)
print("Loss: ", np.sum(np.square(errors)))

Errors:
0     0.0
1    -0.4
2     0.0
3    -0.7
4     1.1
5    -0.5
6     0.4
7    -0.1
8    -1.0
9    -0.6
10    0.0
11    0.0
12   -0.5
13   -0.9
14   -0.6
15   -0.5
16    1.1
17   -0.1
18    0.3
19    0.2
20    0.4
21    0.2
22    0.5
23    0.6
24    0.6
25    0.6
26    0.4
27    0.1
28    0.3
29    0.2
30    0.2
31    0.3
32   -0.1
33    0.4
34    0.3
35   -0.1
36    0.4
37    0.0
38   -0.2
39    0.3
40    0.4
41    0.0
42    0.0
43    0.1
44    0.5
45    0.4
46    0.5
47    0.3
48   -0.7
49   -0.4
50   -0.2
51   -0.2
Name: mean_targeting_efficiency, dtype: float64
Loss:  10.83
