## Logisitic regression models for paralog features

**Input:** Gene summary file

In [1]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
from scipy import stats

get_local_data_path = lambda folders, fname: os.path.normpath('../data/'+'/'.join(folders) +'/'+fname)

file_gene_summary = get_local_data_path(['processed','results'], 'gene_summary_0.47.csv')

In [2]:
gene_data = pd.read_csv(file_gene_summary, index_col=0)
paralogs = gene_data[gene_data.num_paralogs > 0]

# Bin num paralogs, encode WGD = 1, SSD = 0, Broadly vs. the rest for essentiality category
paralogs = paralogs.assign(num_paralogs_binned = paralogs.num_paralogs.apply(lambda x: x if x < 4 else 4),
                           category = paralogs.category.apply(lambda x: 1 if x=='Sometimes' else 2 if x=='Broadly' else 0))
print('Num paralogs:', paralogs.shape[0])
paralogs[:2]

Num paralogs: 10130


Unnamed: 0_level_0,essential_percent,category,ensembl_id,symbol,percent_matched_in_paralog,num_paralogs,WGD,num_paralogs_binned
entrez_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8036,0.069892,1,ENSG00000108061,SHOC2,0.254296,2,SSD,2
25940,0.0,0,ENSG00000119812,FAM98A,0.426641,2,WGD,2


In [3]:
# Calculate the log likelihood for the full model and for models that each use 
# one of the variables (num paralogs, sequence identity and duplication mode)

model = smf.mnlogit(formula='category ~ C(num_paralogs_binned) + percent_matched_in_paralog + C(WGD)', data=paralogs).fit()
LL_full = model.llf # Log-likelihood of model
print('Full model, log likelihood:', LL_full, '\n')

model = smf.mnlogit(formula='category ~ C(num_paralogs_binned)', data=paralogs).fit()
LL_num_paralogs = model.llf # Log-likelihood of model
print('Num paralogs model, log likelihood:', LL_num_paralogs, '\n')

model = smf.mnlogit(formula='category ~ percent_matched_in_paralog', data=paralogs).fit()
LL_seq_id = model.llf # Log-likelihood of model
print('Seq id model, log likelihood:', LL_seq_id, '\n')

model = smf.mnlogit(formula="category ~ C(WGD)", data=paralogs).fit()
#print(model.summary())
LL_WGD = model.llf # Log-likelihood of model
print('Duplication mode model, log likelihood:', LL_WGD)

Optimization terminated successfully.
         Current function value: 0.769568
         Iterations 9
Full model, log likelihood: -7795.725699549412 

Optimization terminated successfully.
         Current function value: 0.773981
         Iterations 9
Num paralogs model, log likelihood: -7840.426222038027 

Optimization terminated successfully.
         Current function value: 0.781781
         Iterations 8
Seq id model, log likelihood: -7919.440666913124 

Optimization terminated successfully.
         Current function value: 0.777854
         Iterations 8
Duplication mode model, log likelihood: -7879.656784736069


In [4]:
# Compare log-likelihood of the models
def lrtest(llmin, llmax):
    lr = 2 * (llmax - llmin)
    p = stats.chi2.sf(lr, df=2) # llmax has 2 DOF more than llmin
    return lr, p

print('Full vs. num paralogs:', lrtest(LL_num_paralogs, LL_full))
print('Full vs. seq id:', lrtest(LL_seq_id, LL_full))
print('Full vs. duplication mode:', lrtest(LL_WGD, LL_full)) # L2 has X DoF more than L1

Full vs. num paralogs: (89.40104497723041, 3.8619775512609794e-20)
Full vs. seq id: (247.42993472742455, 1.8675504604222055e-54)
Full vs. duplication mode: (167.86217037331517, 3.5415456428293916e-37)
