## cc1 CC vs CT basecalling features comparison

In [1]:
# import basic modules
import os
from glob import glob
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, chi2_contingency
from tqdm import tqdm

In [2]:
# Glob patterns for the per-chromosome feature tables (one file per chr*).
# CT (i.e. C-to-U) context reads:
df_CT_filepath = "/lustre/bio_running/C_to_U_editing_minimap2_spliced/cc1.basecalling_features/cc1.CTcontext_reads_features_forward_rev/CTcontext_reads_features_forward_rev_chr*"
# CC context reads:
df_CC_filepath = "/lustre/bio_running/C_to_U_editing_minimap2_spliced/cc1.basecalling_features/cc1.CCcontext_reads_features_forward/CCcontext_reads_features_forward_chr*"

In [3]:
# load into dataframes CC and CT (CU) read-basecalling features
# Column layout of every table: 4 identifier columns, the 7 per-position
# values pos-3 .. pos3 around the central base, and a trailing "ins" column.
cols = ["region", "position", "read_name", "strand"] + [f"pos{i}" for i in range(-3,4)] + ["ins"]


def _load_features(pattern):
    """Read every table matching the glob `pattern` and stack them row-wise.

    The files are read as header-less tab-separated tables using `cols` as
    column names. The per-file row index is kept as-is (it is not reset).

    Raises:
        FileNotFoundError: if `pattern` matches no file. Without this check
            pd.concat fails with an uninformative "No objects to concatenate".
    """
    paths = glob(pattern)
    if not paths:
        raise FileNotFoundError(f"no feature table matches pattern: {pattern}")
    return pd.concat([pd.read_table(tsv, names=cols, index_col=False) for tsv in paths], axis=0)


df_CC = _load_features(df_CC_filepath)
df_CT = _load_features(df_CT_filepath)
# drop erroneous minus strand reads
df_CT = df_CT.query("strand == '+'")
print("df_CC")
print(df_CC)
print("\n\ndf_CT")
print(df_CT)

df_CC
        region  position                             read_name strand  pos-3  \
0         chr4        38  98c29af7-ab4e-475b-82b2-3207a8af75f7      +    7.0   
1         chr4        38  f679d1d5-449a-4bce-a239-b7b149c72751      +   11.0   
2         chr4        38  33ca07f3-1752-4fed-8ca8-b0b197c9bb5f      +    9.0   
3         chr4        38  783175e2-5165-4130-adcd-1ab18431a89c      +    3.0   
4         chr4        38  e1c1674f-d4a8-4810-8196-2276cd9c27cb      +   18.0   
...        ...       ...                                   ...    ...    ...   
2819393   chr2      2484  e8991292-3bc5-496b-833a-ef0da7caf0ed      +   18.0   
2819394   chr2      2484  40ef1f45-a8a9-45cb-aae6-845226d68ec6      +   17.0   
2819395   chr2      2484  afd9ce8d-9fb3-4e47-932a-7205cb5e4de9      +   17.0   
2819396   chr2      2484  e1ee7b4d-00fc-43d4-bd91-e1ce4e7c141b      +   15.0   
2819397   chr2      2484  af144a8c-7874-4576-9b33-5e2bdeaf509f      +    3.0   

         pos-2  pos-1  pos0  pos1

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the CC reads: position, pos-3 .. pos3 and ins.
df_CC.describe()

Unnamed: 0,position,pos-3,pos-2,pos-1,pos0,pos1,pos2,pos3,ins
count,11992050.0,11992050.0,11992050.0,11992050.0,11992050.0,11992050.0,11992050.0,11992050.0,11992050.0
mean,1545.599,17.84095,17.58903,17.39471,17.81812,16.75765,17.08225,17.23462,0.1186838
std,728.7768,10.03632,9.873188,9.367261,8.31015,9.364448,10.15135,10.15237,0.3465052
min,37.0,-50.0,-40.0,-42.0,1.0,-41.0,-44.0,-47.0,0.0
25%,979.0,11.0,10.0,10.0,11.0,10.0,9.0,10.0,0.0
50%,1690.0,19.0,19.0,18.0,18.0,18.0,18.0,18.0,0.0
75%,2147.0,25.0,25.0,25.0,24.0,24.0,25.0,25.0,0.0
max,2733.0,90.0,90.0,52.0,49.0,51.0,90.0,90.0,5.0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the CT reads (plus-strand rows only): position, pos-3 .. pos3 and ins.
df_CT.describe()

Unnamed: 0,position,pos-3,pos-2,pos-1,pos0,pos1,pos2,pos3,ins
count,301547.0,301547.0,301547.0,301547.0,301547.0,301547.0,301547.0,301547.0,301547.0
mean,1593.263475,12.294793,9.326987,8.138811,7.186167,9.642815,11.95467,13.411067,0.198092
std,736.710384,10.23274,10.266306,9.543772,4.311163,9.253936,9.735651,10.056592,0.430267
min,37.0,-37.0,-39.0,-36.0,1.0,-36.0,-37.0,-37.0,0.0
25%,1027.0,5.0,2.0,3.0,4.0,4.0,5.0,6.0,0.0
50%,1711.0,12.0,8.0,7.0,6.0,9.0,12.0,14.0,0.0
75%,2191.0,20.0,17.0,15.0,9.0,16.0,19.0,21.0,0.0
max,2733.0,47.0,44.0,43.0,42.0,43.0,53.0,57.0,4.0


In [6]:
# compute statistic testing for central base quality
# Mann-Whitney U test comparing the pos0 values of CC reads against CT reads.
# `alternative` is given explicitly: its default is not the same in every
# SciPy release, so relying on it makes the reported statistic / p-value
# depend on the installed version.
mannwhitneyu(df_CC["pos0"], df_CT["pos0"], alternative="two-sided")

MannwhitneyuResult(statistic=506290636830.5, pvalue=0.0)

In [7]:
# compute statistic testing for average base quality
# iloc[:, 4:-1] keeps every column after the first four and before the last
# one, i.e. the 7 window values pos-3 .. pos3; each read is summarised by the
# mean of those values and the two groups are compared with a two-sided
# Mann-Whitney U test (`alternative` is explicit because its default is not
# the same in every SciPy release).
# NOTE(review): the window values include entries <= 0, which the deletion and
# mismatch cells of this notebook treat as flags (== 0 / < 0) - confirm that
# averaging them together with the positive values is intended.
mannwhitneyu(
    df_CC.iloc[:, 4:-1].mean(axis=1),
    df_CT.iloc[:, 4:-1].mean(axis=1),
    alternative="two-sided",
)

MannwhitneyuResult(statistic=716411711671.0, pvalue=0.0)

In [8]:
# compute statistics testing for deletions (equal to 0)
# For every read, count how many of the 7 window values (pos-3 .. pos3, i.e.
# all columns after the first four and before the trailing "ins") equal 0.
# This is the vectorised equivalent of looping over the rows one by one.
CC_dels = (df_CC.iloc[:, 4:-1] == 0).sum(axis=1).to_numpy()
CT_dels = (df_CT.iloc[:, 4:-1] == 0).sum(axis=1).to_numpy()

# deletions chi2 test
# Print, per group, the fraction of reads carrying 0, 1, 2, ... deletions and
# keep the raw counts (indexed by the number of deletions) for the chi2 test.
del_counts = {}
for group, header, total_label, per_read in (
    ("CC", "CC deletetion count:", "Total CC reads:", CC_dels),
    ("CT", "CT deletion count:", "Total CT reads:", CT_dels),
):
    if del_counts:
        print()  # blank line between the two groups
    print(header)
    counts = pd.Series(per_read).value_counts().sort_index()
    total = counts.sum()
    print(total_label, total)
    for del_, count in counts.items():
        print(del_, "=", count / total)
    del_counts[group] = counts

# Build the contingency table by aligning both groups on the deletion count
# itself; a category missing from one group becomes an explicit 0. Padding by
# position instead is only correct when the missing category is the last one.
del_dict = pd.DataFrame(del_counts).T.fillna(0).astype(int).sort_index(axis=1)
print("\n Contingency Table")
print(del_dict)
print()
chi2_stats, p, dof, ex = chi2_contingency(del_dict)
print(f"Results of chi2 test:\n\t-chi2 statistics: {chi2_stats}\n\t-p_value: {p}")

100%|██████████| 11992051/11992051 [01:11<00:00, 167745.67it/s]
100%|██████████| 301547/301547 [00:01<00:00, 161282.64it/s]


CC deletetion count:
Total CC reads: 11992051
0 = 0.8306963504408045
1 = 0.1163288915298976
2 = 0.0411786941199633
3 = 0.010784310373596643
4 = 0.0008763304959260096
5 = 0.00012541641125442179
6 = 1.0006628557533652e-05

CT deletion count:
Total CT reads: 301547
0 = 0.7058269523490533
1 = 0.21852314896185338
2 = 0.06465990376292916
3 = 0.009942065416004802
4 = 0.000908647739821653
5 = 0.00013928177033762564

 Contingency Table
          0        1       2       3      4     5    6
CC  9961753  1395022  493817  129326  10509  1504  120
CT   212840    65895   19498    2998    274    42    0

Results of chi2 test:
	-chi2 statistics: 35300.05887257856
	-p_value: 0.0


In [9]:
# compute statistics testing for mismatches (less than 0)
# For every read, count how many of the 7 window values (pos-3 .. pos3, i.e.
# all columns after the first four and before the trailing "ins") are negative.
# This is the vectorised equivalent of looping over the rows one by one.
CC_mism = (df_CC.iloc[:, 4:-1] < 0).sum(axis=1).to_numpy()
CT_mism = (df_CT.iloc[:, 4:-1] < 0).sum(axis=1).to_numpy()

# mismatches chi2 test
# Print, per group, the fraction of reads carrying 0, 1, 2, ... mismatches and
# keep the raw counts (indexed by the number of mismatches) for the chi2 test.
mism_counts = {}
for group, header, total_label, per_read in (
    ("CC", "CC mismatches count:", "Total CC reads:", CC_mism),
    ("CT", "CT mismatches count:", "Total CT reads:", CT_mism),
):
    if mism_counts:
        print()  # blank line between the two groups
    print(header)
    counts = pd.Series(per_read).value_counts().sort_index()
    total = counts.sum()
    print(total_label, total)
    for mism_, count in counts.items():
        print(mism_, "=", count / total)
    mism_counts[group] = counts

# Build the contingency table by aligning both groups on the mismatch count
# itself; a category observed in only one group becomes an explicit 0 instead
# of making the two rows differ in length or shifting columns.
mism_dict = pd.DataFrame(mism_counts).T.fillna(0).astype(int).sort_index(axis=1)
print("\n Contingency Table")
print(mism_dict)
print()
chi2_stats, p, dof, ex = chi2_contingency(mism_dict)
print(f"Results of chi2 test:\n\t-chi2 statistics: {chi2_stats}\n\t-p_value: {p}")

100%|██████████| 11992051/11992051 [01:12<00:00, 165678.38it/s]
100%|██████████| 301547/301547 [00:01<00:00, 162974.00it/s]


CC mismatches count:
Total CC reads: 11992051
0 = 0.8830663745509422
1 = 0.09215112577489873
2 = 0.020824878079654597
3 = 0.0034848917837324075
4 = 0.00042911758797556815
5 = 4.2444782798205246e-05
6 = 1.1674399983789262e-06

CT mismatches count:
Total CT reads: 301547
0 = 0.5654707226402518
1 = 0.2977346814924373
2 = 0.11043717894722879
3 = 0.02339602118409402
4 = 0.0027425243826003907
5 = 0.00019897395762517948
6 = 1.9897395762517948e-05

 Contingency Table
           0        1       2      3     4    5   6
CC  10589777  1105081  249733  41791  5146  509  14
CT    170516    89781   33302   7055   827   60   6

Results of chi2 test:
	-chi2 statistics: 297217.4081919344
	-p_value: 0.0


In [10]:
# compute statistics testing for insertions
# The "ins" column is used directly as the per-read insertion value.
CC_ins = df_CC["ins"].values
CT_ins = df_CT["ins"].values

# insertions chi2 test
# Print, per group, the fraction of reads for each observed "ins" value and
# keep the raw counts (indexed by that value) for the chi2 test.
ins_counts = {}
for group, header, total_label, per_read in (
    ("CC", "CC insertions count:", "Total CC reads:", CC_ins),
    ("CT", "CT insertions count:", "Total CT reads:", CT_ins),
):
    if ins_counts:
        print()  # blank line between the two groups
    print(header)
    counts = pd.Series(per_read).value_counts().sort_index()
    total = counts.sum()
    print(total_label, total)
    for ins_, count in counts.items():
        print(ins_, "=", count / total)
    ins_counts[group] = counts

# Build the contingency table by aligning both groups on the "ins" value
# itself; a value missing from one group becomes an explicit 0. Padding by
# position instead is only correct when the missing value is the last one.
# The table columns are labelled with the actual "ins" values.
ins_dict = pd.DataFrame(ins_counts).T.fillna(0).astype(int).sort_index(axis=1)
print("\n Contingency Table")
print(ins_dict)
print()
chi2_stats, p, dof, ex = chi2_contingency(ins_dict)
print(f"Results of chi2 test:\n\t-chi2 statistics: {chi2_stats}\n\t-p_value: {p}")

CC insertions count:
Total CC reads: 11992051
0.0 = 0.8887028582516869
1.0 = 0.10425055730666923
2.0 = 0.006713697264963266
3.0 = 0.0003257991481190332
4.0 = 7.004639990273557e-06
5.0 = 8.338857131278044e-08

CT insertions count:
Total CT reads: 301547
0.0 = 0.8146259123785016
1.0 = 0.1730708645750082
2.0 = 0.011895326433358647
3.0 = 0.0004012641478774453
4.0 = 6.63246525417265e-06

 Contingency Table
           0        1      2     3   4  5
CC  10657370  1250178  80511  3907  84  1
CT    245648    52189   3587   121   2  0

Results of chi2 test:
	-chi2 statistics: 16130.316103288373
	-p_value: 0.0
