# Distribution fitting code

### Importing libraries

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from scipy.stats import beta
from scipy.stats import shapiro
from scipy.stats import normaltest
import statistics
pd.set_option('display.max_rows', 120)
from fitter import Fitter, get_common_distributions, get_distributions

### Configuration: input file paths

In [3]:
# Directory that holds the three input workbooks. Change this single value to
# run the notebook against a different location.
DATA_DIR = r'C:/Users/xatzo/Desktop/statistical_UTR'

# Full paths of the Excel workbooks read by the "Read data" cell.
filepathH = f'{DATA_DIR}/IGH_modified.xlsx'
filepathK = f'{DATA_DIR}/IGK_modified.xlsx'
filepathL = f'{DATA_DIR}/IGL_modified.xlsx'

### Read data

In [9]:
# Load each Excel workbook into its own DataFrame, in H, K, L order.
dfH, dfK, dfL = (
    pd.read_excel(workbook)
    for workbook in (filepathH, filepathK, filepathL)
)

# Main

## Preprocessing

In [42]:
# --- IGHV1 ---
# Keep the rows whose GENE value begins with the five characters "IGHV1",
# then remove row label 20 (marked as a "bad series" in the original notebook).
dfH1 = dfH.loc[dfH["GENE"].str[:5] == "IGHV1"].drop(index=20)

# Each gap is: first distance column - second distance column - a length term.
# The length term is either the length of a motif's sequence string or a
# constant (7 for HEPTA, 8 for 8MER; presumably the motif lengths).
pyr_tata1 = (
    dfH1["NT PYRIMIDINE-RICH AND L-PART1-2nd"]
    - dfH1["NT TATA-BOX AND L-PART1-3rd"]
    - dfH1["TATA-BOX-3rd"].apply(len)
)
tata_hept1 = (
    dfH1["NT TATA-BOX AND L-PART1-3rd"]
    - dfH1["NT HEPTA AND L-PART1"]
    - 7
)
hept_oct1 = (
    dfH1["NT HEPTA AND L-PART1"]
    - dfH1["NT 8MER AND L-PART1"]
    - 8
)
oct_tata1 = (
    dfH1["NT 8MER AND L-PART1"]
    - dfH1["NT TATA-BOX AND L-PART1"]
    - dfH1["TATA-BOX"].apply(len)
)
# The last series is the raw distance column, used as-is.
tata_L1 = dfH1["NT TATA-BOX AND L-PART1"]

# --- IGHV2 ---
# Keep the rows whose GENE value begins with the five characters "IGHV2".
dfH2 = dfH[dfH["GENE"].str.slice(0,5)=="IGHV2"]

# Each gap is: first distance column - second distance column - a length term
# (8 for 8MER, or the length of the TATA-BOX sequence string).
hept_oct2 = dfH2["NT HEPTA AND L-PART1"] - dfH2["NT 8MER AND L-PART1"]-8
# The motif length must come from dfH2. The previous code took it from dfH1,
# whose row labels do not overlap with dfH2 (the two filters are mutually
# exclusive), so pandas index alignment turned every value into NaN.
oct_tata2 = dfH2["NT 8MER AND L-PART1"] - dfH2["NT TATA-BOX AND L-PART1"]-dfH2["TATA-BOX"].apply(len)
# The last series is the raw distance column, used as-is.
tata_L2 = dfH2["NT TATA-BOX AND L-PART1"]

# --- IGHV3 ---
# Keep the rows whose GENE value begins with "IGHV3", then remove five row
# labels that the original notebook drops one at a time.
dfH3 = dfH.loc[dfH["GENE"].str[:5] == "IGHV3"].drop(index=[68, 51, 50, 44, 34])

# Each gap is: first distance column - second distance column - a length term
# (7 for HEPTA, 8 for 8MER, otherwise the length of a motif sequence string).
pyr_hept3 = (
    dfH3["NT PYRIMIDINE-RICH AND L-PART1-2nd"]
    - dfH3["NT HEPTA AND L-PART1"]
    - 7
)
hept_pyr3 = (
    dfH3["NT HEPTA AND L-PART1"]
    - dfH3["NT PYRIMIDINE-RICH AND L-PART1"]
    - dfH3["PYRIMIDINE-RICH-2nd"].apply(len)
)
pyr_tat3 = (
    dfH3["NT PYRIMIDINE-RICH AND L-PART1"]
    - dfH3["NT TATA-BOX AND L-PART1-2nd"]
    - dfH3["TATA-BOX-2nd"].apply(len)
)
tat_oct3 = (
    dfH3["NT TATA-BOX AND L-PART1-2nd"]
    - dfH3["NT 8MER AND L-PART1"]
    - 8
)
oct_tata3 = (
    dfH3["NT 8MER AND L-PART1"]
    - dfH3["NT TATA-BOX AND L-PART1"]
    - dfH3["TATA-BOX"].apply(len)
)
# The last series is the raw distance column, used as-is.
tata_L3 = dfH3["NT TATA-BOX AND L-PART1"]

# --- IGHV4 ---
# Keep the rows whose GENE value begins with the five characters "IGHV4".
dfH4 = dfH.loc[dfH["GENE"].str[:5] == "IGHV4"]

# Each gap is: first distance column - second distance column - a length term
# (7 for HEPTA, 8 for 8MER, otherwise the length of a motif sequence string).
pyr_hept4 = (
    dfH4["NT PYRIMIDINE-RICH AND L-PART1-2nd"]
    - dfH4["NT HEPTA AND L-PART1"]
    - 7
)
hept_pyr4 = (
    dfH4["NT HEPTA AND L-PART1"]
    - dfH4["NT PYRIMIDINE-RICH AND L-PART1"]
    - dfH4["PYRIMIDINE-RICH-2nd"].apply(len)
)
pyr_oct4 = (
    dfH4["NT PYRIMIDINE-RICH AND L-PART1"]
    - dfH4["NT 8MER AND L-PART1"]
    - 8
)
oct_tata4 = (
    dfH4["NT 8MER AND L-PART1"]
    - dfH4["NT TATA-BOX AND L-PART1"]
    - dfH4["TATA-BOX"].apply(len)
)
tata_ebox4 = (
    dfH4["NT TATA-BOX AND L-PART1"]
    - dfH4["NT E-BOX AND L-PART1"]
    - dfH4["E-BOX"].apply(len)
)
# The last series is the raw distance column, used as-is.
ebox_L4 = dfH4["NT E-BOX AND L-PART1"]

# --- IGHV5 ---
# Keep the rows whose GENE value begins with the five characters "IGHV5".
dfH5 = dfH.loc[dfH["GENE"].str[:5] == "IGHV5"]

# Each gap is: first distance column - second distance column - a length term
# (7 for HEPTA, 8 for 8MER, otherwise the length of a motif sequence string).
pyr_hept5 = (
    dfH5["NT PYRIMIDINE-RICH AND L-PART1-2nd"]
    - dfH5["NT HEPTA AND L-PART1"]
    - 7
)
hept_oct5 = (
    dfH5["NT HEPTA AND L-PART1"]
    - dfH5["NT 8MER AND L-PART1"]
    - 8
)
oct_ebox5 = (
    dfH5["NT 8MER AND L-PART1"]
    - dfH5["NT E-BOX AND L-PART1-2nd"]
    - dfH5["E-BOX-2nd"].apply(len)
)
ebox_tata5 = (
    dfH5["NT E-BOX AND L-PART1-2nd"]
    - dfH5["NT TATA-BOX AND L-PART1"]
    - dfH5["TATA-BOX"].apply(len)
)
tata_ebox5 = (
    dfH5["NT TATA-BOX AND L-PART1"]
    - dfH5["NT E-BOX AND L-PART1"]
    - dfH5["E-BOX"].apply(len)
)
# The last series is the raw distance column, used as-is.
ebox_L5 = dfH5["NT E-BOX AND L-PART1"]

# --- IGHV6 ---
# Keep the rows whose GENE value begins with the five characters "IGHV6".
dfH6 = dfH.loc[dfH["GENE"].str[:5] == "IGHV6"]

# Each gap is: first distance column - second distance column - a length term
# (7 for HEPTA, 8 for 8MER, otherwise the length of a motif sequence string).
pyr_tata6 = (
    dfH6["NT PYRIMIDINE-RICH AND L-PART1-2nd"]
    - dfH6["NT TATA-BOX AND L-PART1-3rd"]
    - dfH6["TATA-BOX-3rd"].apply(len)
)
tata_hept6 = (
    dfH6["NT TATA-BOX AND L-PART1-3rd"]
    - dfH6["NT HEPTA AND L-PART1"]
    - 7
)
hept_oct6 = (
    dfH6["NT HEPTA AND L-PART1"]
    - dfH6["NT 8MER AND L-PART1"]
    - 8
)
oct_tata6 = (
    dfH6["NT 8MER AND L-PART1"]
    - dfH6["NT TATA-BOX AND L-PART1"]
    - dfH6["TATA-BOX"].apply(len)
)
# The last series is the raw distance column, used as-is.
tata_L6 = dfH6["NT TATA-BOX AND L-PART1"]

# --- IGHV7 ---
# Keep the rows whose GENE value begins with "IGHV7", then remove row label 125.
dfH7 = dfH.loc[dfH["GENE"].str[:5] == "IGHV7"].drop(index=125)

# Each gap is: first distance column - second distance column - a length term
# (7 for HEPTA, 8 for 8MER, otherwise the length of a motif sequence string).
pyr_tata7 = (
    dfH7["NT PYRIMIDINE-RICH AND L-PART1-2nd"]
    - dfH7["NT TATA-BOX AND L-PART1-3rd"]
    - dfH7["TATA-BOX-3rd"].apply(len)
)
tata_hept7 = (
    dfH7["NT TATA-BOX AND L-PART1-3rd"]
    - dfH7["NT HEPTA AND L-PART1"]
    - 7
)
hept_oct7 = (
    dfH7["NT HEPTA AND L-PART1"]
    - dfH7["NT 8MER AND L-PART1"]
    - 8
)
oct_tata7 = (
    dfH7["NT 8MER AND L-PART1"]
    - dfH7["NT TATA-BOX AND L-PART1"]
    - dfH7["TATA-BOX"].apply(len)
)
# The last series is the raw distance column, used as-is.
tata_L7 = dfH7["NT TATA-BOX AND L-PART1"]

### p-values of the normality tests

IGHV1

In [48]:
# Run the Shapiro-Wilk normality test on each gap series and keep every result.
# NOTE(review): this cell tests one IGHV1 series (pyr_tata1) together with five
# IGHV3 series; confirm that this mix is intended.
shap_pyr_tat1 = shapiro(pyr_tata1)
shap_hept_pyr3 = shapiro(hept_pyr3)
shap_pyr_tat3 = shapiro(pyr_tat3)
shap_tat_oct3 = shapiro(tat_oct3)
shap_oct_tat3 = shapiro(oct_tata3)
shap_tata_L3 = shapiro(tata_L3)

# Element [1] of each result is the p-value; print them one per line, in the
# same order as the tests above.
print(
    *(
        outcome[1]
        for outcome in (
            shap_pyr_tat1,
            shap_hept_pyr3,
            shap_pyr_tat3,
            shap_tat_oct3,
            shap_oct_tat3,
            shap_tata_L3,
        )
    ),
    sep="\n",
)

3.7031693409517175e-06
1.4496423546006554e-06
1.1993599169368707e-10
4.931699450261169e-17
1.0
1.998862907874946e-13


IGHV3

In [51]:
# Run scipy's `normaltest` on each gap series and keep every result.
# NOTE(review): the result names start with "shap_" although the test used here
# is `normaltest`, not `shapiro`. The cell also tests one IGHV3 series
# (pyr_hept3) together with four IGHV1 series; confirm that this is intended.
shap_pyr_hept3 = normaltest(pyr_hept3)
shap_tat_hept1 = normaltest(tata_hept1)
shap_hept_oct1 = normaltest(hept_oct1)
shap_oct_tat1 = normaltest(oct_tata1)
shap_tata_L1 = normaltest(tata_L1)

# Element [1] of each result is the p-value; print them one per line, in the
# same order as the tests above.
print(
    *(
        outcome[1]
        for outcome in (
            shap_pyr_hept3,
            shap_tat_hept1,
            shap_hept_oct1,
            shap_oct_tat1,
            shap_tata_L1,
        )
    ),
    sep="\n",
)

8.797437907915149e-09
0.010846242872071825
4.71583266956849e-102
4.71583266956849e-102
0.000156618081726514


### Fit to gaussian

In [None]:
# Fit a normal distribution to `gap1`, print its parameters and the sample
# median, then plot the fitted density.
# NOTE(review): `gap1` is not assigned in any cell shown in this notebook; it
# must be set to one of the gap series before this cell can run.

# norm.fit returns (loc, scale), i.e. the fitted mean and the standard
# deviation. The second value is NOT the variance, so it is named `std`.
mean, std = scipy.stats.norm.fit(gap1)
print(mean)
print(std)
median = statistics.median(gap1)
print(median)

# Evaluate the fitted pdf over mean +/- 3 std and show mean +/- 4 std on the x axis.
x = np.linspace(mean - 3*std, mean + 3*std, 100)
plt.xlim(mean - 4*std, mean + 4*std)
plt.plot(x, scipy.stats.norm.pdf(x, mean, std))
plt.show()

### Fit t distribution

# Fit a Student's t distribution to `gap1`, print its parameters and plot the
# fitted density.
# NOTE(review): `gap1` is not assigned in any cell shown in this notebook; it
# must be set to one of the gap series before this cell can run.

# t.fit returns (df, loc, scale): degrees of freedom, location and scale.
deg, mean, std = scipy.stats.t.fit(gap1)
print(deg)
print(mean)
print(std)

# Frozen distribution with the fitted parameters.
rv = scipy.stats.t(df=deg, loc=mean, scale=std)
# Evaluate the pdf between the 0.01% and 99.99% quantiles of the fitted distribution.
x = np.linspace(rv.ppf(0.0001), rv.ppf(0.9999), 100)
y = rv.pdf(x)
plt.xlim(mean - 4*std, mean + 4*std)
plt.plot(x, y)
# Render the figure explicitly, as the Gaussian cell does; this also keeps the
# `[<Line2D ...>]` repr out of the cell output.
plt.show()