In [1]:
## import basic modules
import os,sys
import numpy as np
import pandas as pd
from scipy.stats import chi2
import pysam
from tqdm import tqdm

# add parent folder of the C_to_U_classifier_plus_min package to path in order to be loaded into the current session
sys.path.append("/lustrehome/afonzino")

from C_to_U_classifier_plus_min.utils import get_rev_compl

In [2]:
# define reference, H0 and H1 models filepaths
# NOTE: all three are hard-coded absolute paths; they must be adapted before
# running this notebook on a different filesystem.
# ref_filepath is the FASTA file that compute_apobec1_signature_pvalue opens
# with pysam.FastaFile to fetch the sequence window around each site.
ref_filepath = "/lustre/bio_running/refs/GRCh38.primary_assembly.genome.filtered.fa"
# H0 model matrix (read with pd.read_table, first column as row index);
# presumably the consensus of non-edited sites, judging by the file name -- TODO confirm.
H0_filepath = "/lustre/bio_running/CtoUclassifier_new_model_training23052022/src_jupyter_notebooks/iForest_cc1_wt_ko_no_indels_mismatches___NEW_TRAINING/illumina_ref_model/No_Editing_Sites_Consensus.txt"
# H1 model matrix (same layout as H0);
# presumably the consensus of edited sites, judging by the file name -- TODO confirm.
H1_filepath = "/lustre/bio_running/CtoUclassifier_new_model_training23052022/src_jupyter_notebooks/iForest_cc1_wt_ko_no_indels_mismatches___NEW_TRAINING/illumina_ref_model/Editing_Sites_Consensus.txt"

In [3]:
# Load the H0 and H1 model matrices into the session, using the first column
# of each tab-separated file as the row index.
h0, h1 = (
    pd.read_table(model_path, index_col=0)
    for model_path in (H0_filepath, H1_filepath)
)

In [5]:
def compute_apobec1_signature_pvalue(ref_filepath, H0_filepath, H1_filepath, region, pos1based, strand, verbosity=0):
    """Score the sequence context of a cytosine against the H0 and H1 models.

    A window of reference sequence centred on ``pos1based`` is fetched
    (and reverse-complemented for minus-strand sites). For every position of
    the window the probability of the observed base is looked up in the H0 and
    H1 matrices, the two sets of probabilities are multiplied together, and the
    statistic ``-2 * log(prod(H0) / prod(H1))`` is converted into a p-value
    with the survival function of a chi-square distribution with 1 degree of
    freedom.

    Parameters
    ----------
    ref_filepath : str
        FASTA file opened with ``pysam.FastaFile``.
    H0_filepath, H1_filepath : str
        Tab-separated matrices read with ``pd.read_table(..., index_col=0)``.
        Rows are indexed by base; columns are named after the 0-based offset
        inside the window ("0", "1", ...). Both matrices must have the same
        number of columns; the window half-width is ``(n_columns - 1) // 2``.
    region : str
        Name of the contig to fetch from.
    pos1based : int
        1-based coordinate of the candidate cytosine.
    strand : str
        Either "+" or "-".
    verbosity : int, optional
        > 0 prints the products, the statistic and the p-value;
        > 1 also prints the per-position probabilities;
        > 2 also prints both model matrices.

    Returns
    -------
    float or None
        The p-value, or ``None`` (after printing a message) when the site
        cannot be scored: invalid strand, window not fully inside the contig,
        bases missing from the models, or central base different from "C".

    Raises
    ------
    ValueError
        If the H0 and H1 matrices have a different number of columns.
    """
    # An unknown strand used to be reported and then silently scored as "+";
    # stop here instead so that no misleading p-value is produced.
    if strand not in ("+", "-"):
        print("ops! Strange strand!")
        return None

    h0 = pd.read_table(H0_filepath, index_col=0)
    h1 = pd.read_table(H1_filepath, index_col=0)

    if verbosity > 2:
        print("H0 model matrix")
        print(h0)
        print()
        print("H1 model matrix")
        print(h1)

    # Both models must describe the same window, otherwise the half-width is
    # undefined (this used to surface later as an unbound-variable error).
    if h0.shape[1] != h1.shape[1]:
        raise ValueError(
            f"H0 and H1 models must have the same number of columns "
            f"(found {h0.shape[1]} and {h1.shape[1]})."
        )
    interval = (h0.shape[1] - 1) // 2
    expected_length = (interval * 2) + 1

    pos0based = pos1based - 1
    start = pos0based - interval
    stop = pos0based + interval + 1

    # A window starting before the first base of the contig can never be
    # complete; report it without handing a negative coordinate to pysam.
    if start < 0:
        print(f"OPS! Given position {region}:{pos1based} ({strand}) generated an interval too short! Something got wrong!")
        return None

    # load reference sequence interval around central base position, making
    # sure the FASTA handle is released on every code path
    ref = pysam.FastaFile(ref_filepath)
    try:
        reference = ref.fetch(region, start, stop)
    finally:
        ref.close()

    # Soft-masked references use lower-case letters, while the checks below
    # and the model lookups use upper-case bases.
    reference = reference.upper()

    if strand == "-":
        reference = get_rev_compl(reference)

    # Check the length first: on a truncated window the "central" index would
    # either be missing or point to the wrong base.
    if len(reference) != expected_length:
        print(f"OPS! Given position {region}:{pos1based} ({strand}) generated an interval too short! Something got wrong!")
        return None

    if reference[interval] != "C":
        print(f"OPS! Wrong position {region}:{pos1based} ({strand}) provided: No C central base! Base found: {reference[interval]}")
        return None

    # Bases absent from either model (e.g. N) cannot be scored.
    unknown_bases = sorted(
        base for base in set(reference)
        if base not in h0.index or base not in h1.index
    )
    if unknown_bases:
        print(f"OPS! Given position {region}:{pos1based} ({strand}) has bases not described by the models: {unknown_bases}")
        return None

    h0_probs = []
    h1_probs = []
    for rel_pos, ref_base in enumerate(reference):
        column = str(rel_pos)
        h0_prob = h0.loc[ref_base, column]
        h1_prob = h1.loc[ref_base, column]
        if verbosity > 1:
            print(rel_pos, ref_base, h0_prob, h1_prob)
        h0_probs.append(h0_prob)
        h1_probs.append(h1_prob)

    h0_probs_prod = np.prod(h0_probs)
    h1_probs_prod = np.prod(h1_probs)
    # Negative values (H0 more likely than H1) give a p-value of 1.0.
    llr = -2*np.log(h0_probs_prod/h1_probs_prod)
    pvalue = chi2.sf(llr, 1)

    if verbosity > 0:
        print("H0 product:", h0_probs_prod)
        print("H1 product:", h1_probs_prod)
        print("llr:", llr)
        print("P-value via Chi-2 test:", pvalue)

    return pvalue
    
# compute an example site, chr8:145055960 (+); verbosity=3 also prints both model matrices
compute_apobec1_signature_pvalue(
    ref_filepath=ref_filepath,
    H0_filepath=H0_filepath,
    H1_filepath=H1_filepath,
    region="chr8",
    pos1based=145055960,
    strand="+",
    verbosity=3,
)

H0 model matrix
          0         1         2         3         4         5         6  \
A  0.252602  0.247975  0.257440  0.252242  0.246170  0.253879  0.252672   
C  0.239713  0.249461  0.235333  0.237904  0.250073  0.239440  0.240814   
G  0.238271  0.240898  0.238495  0.241357  0.244064  0.239258  0.241000   
T  0.269413  0.261666  0.268731  0.268497  0.259693  0.267423  0.265513   

          7         8         9  ...        41        42        43        44  \
A  0.245187  0.255523  0.254729  ...  0.265800  0.262512  0.254205  0.264467   
C  0.251557  0.240535  0.240776  ...  0.233990  0.233780  0.245292  0.232105   
G  0.242731  0.240682  0.244176  ...  0.247719  0.250210  0.251595  0.247013   
T  0.260525  0.263261  0.260319  ...  0.252491  0.253498  0.248909  0.256415   

         45        46        47        48        49        50  
A  0.260008  0.255041  0.265657  0.263352  0.255404  0.266164  
C  0.232976  0.243022  0.232280  0.230275  0.242829  0.232475  
G  0.250812  0.

1.0

In [8]:
# let's try another site, one without editing evidence: chr11:13730199 (+)
# verbosity=2 prints the per-position base and its H0/H1 probabilities
compute_apobec1_signature_pvalue(
    ref_filepath=ref_filepath,
    H0_filepath=H0_filepath,
    H1_filepath=H1_filepath,
    region="chr11",
    pos1based=13730199,
    strand="+",
    verbosity=2,
)

0 T 0.2694134683997705 0.3568681318681319
1 G 0.2408984063466301 0.1835164835164835
2 T 0.2687313735641029 0.3570054945054945
3 T 0.268497012774412 0.3460164835164835
4 T 0.2596927425109485 0.3484890109890109
5 A 0.2538791957577199 0.3307692307692307
6 A 0.2526724125869233 0.3240384615384615
7 A 0.2451868590057505 0.3256868131868132
8 G 0.2406815351681101 0.174587912087912
9 A 0.2547291908606288 0.3322802197802197
10 G 0.2471491933791328 0.1760989010989011
11 A 0.2566040771781562 0.3269230769230769
12 A 0.2564536665220859 0.3075549450549451
13 C 0.2556421485637531 0.1723901098901099
14 A 0.2624421093870241 0.3041208791208791
15 G 0.2462397335982426 0.1791208791208791
16 T 0.2500104937667026 0.3605769230769231
17 T 0.2557645758419499 0.3803571428571428
18 C 0.246264219053882 0.1361263736263736
19 G 0.2402932658001147 0.2037087912087912
20 C 0.2430706160540638 0.1049450549450549
21 A 0.2638797554252773 0.4582417582417582
22 A 0.2434413958108883 0.7695054945054945
23 T 0.2729638594674763 

0.000626317813942372

In [10]:
# try a minus-strand site: chr3:113005616 (-)
compute_apobec1_signature_pvalue(
    ref_filepath=ref_filepath,
    H0_filepath=H0_filepath,
    H1_filepath=H1_filepath,
    region="chr3",
    pos1based=113005616,
    strand="-",
    verbosity=2,
)

0 T 0.2694134683997705 0.3568681318681319
1 G 0.2408984063466301 0.1835164835164835
2 A 0.2574400805921282 0.3173076923076923
3 A 0.2522421681521176 0.3221153846153846
4 A 0.2461697751535587 0.328021978021978
5 C 0.2394397727749716 0.1501373626373626
6 T 0.2655132851086454 0.3508241758241758
7 G 0.2427313175973471 0.176510989010989
8 A 0.2555232192077906 0.3244505494505494
9 T 0.260318870590869 0.3494505494505494
10 T 0.2523960767304221 0.3548076923076923
11 C 0.2408494354353514 0.1501373626373626
12 G 0.2460263603419568 0.1743131868131868
13 T 0.2492199633417749 0.3652472527472527
14 G 0.2394502665416742 0.1653846153846153
15 G 0.2462397335982426 0.1791208791208791
16 A 0.249583747254131 0.2943681318681319
17 C 0.2434134124330148 0.1527472527472527
18 G 0.2403667221670327 0.2263736263736263
19 G 0.2402932658001147 0.2037087912087912
20 T 0.2484399266835499 0.3380494505494505
21 G 0.2508080200360985 0.2431318681318681
22 A 0.2434413958108883 0.7695054945054945
23 T 0.2729638594674763 0

0.00884121288179477

It seems to work well. Let's try some other well-known editing positions.

In [11]:
# chr17:44678043 (-); verbosity=1 prints the two products, the statistic and the p-value
compute_apobec1_signature_pvalue(
    ref_filepath=ref_filepath,
    H0_filepath=H0_filepath,
    H1_filepath=H1_filepath,
    region="chr17",
    pos1based=44678043,
    strand="-",
    verbosity=1,
)

H0 product: 7.68583685493939e-31
H1 product: 9.547624050455146e-29
llr: 9.644166506690038
P-value via Chi-2 test: 0.0018995387136054277


0.0018995387136054277

In [12]:
# chrX:106679225 (+)
compute_apobec1_signature_pvalue(
    ref_filepath=ref_filepath,
    H0_filepath=H0_filepath,
    H1_filepath=H1_filepath,
    region="chrX",
    pos1based=106679225,
    strand="+",
    verbosity=1,
)

H0 product: 7.287674419221149e-31
H1 product: 7.943605224935361e-29
llr: 9.382705862665443
P-value via Chi-2 test: 0.0021904199709283983


0.0021904199709283983

In [13]:
# try a non-editing example: chr10:93496869 (+)
compute_apobec1_signature_pvalue(
    ref_filepath=ref_filepath,
    H0_filepath=H0_filepath,
    H1_filepath=H1_filepath,
    region="chr10",
    pos1based=93496869,
    strand="+",
    verbosity=1,
)

H0 product: 3.556451775004469e-31
H1 product: 8.179183417480786e-36
llr: -21.360193002947454
P-value via Chi-2 test: 1.0


1.0

In [15]:
# another negative example: chr5:82275087 (-)
compute_apobec1_signature_pvalue(
    ref_filepath=ref_filepath,
    H0_filepath=H0_filepath,
    H1_filepath=H1_filepath,
    region="chr5",
    pos1based=82275087,
    strand="-",
    verbosity=1,
)

H0 product: 1.029029414606688e-30
H1 product: 5.312173774110681e-33
llr: -10.532740391820564
P-value via Chi-2 test: 1.0


1.0

In [18]:
# let's try a wrong position, one whose central base is not a C:
# chr2:13081881 (-). The function prints an error message and returns None.
compute_apobec1_signature_pvalue(
    ref_filepath=ref_filepath,
    H0_filepath=H0_filepath,
    H1_filepath=H1_filepath,
    region="chr2",
    pos1based=13081881,
    strand="-",
    verbosity=1,
)

OPS! Wrong position chr2:13081881 (-) provided: No C central base! Base found: A
