Skip to content

Commit

Permalink
Updated estimated counts
Browse files Browse the repository at this point in the history
Filtered mu<=1
  • Loading branch information
edwwlui committed May 5, 2024
1 parent 10baf86 commit 4f1b825
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 184 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,15 @@ chrom start end strand gene_name status read_counts(0) read_c
chr1 966803 970277 + PLEKHN1 OK 9,7,7,9,21 3,16,15,17,22 11.473772,9.75089,9.75089,11.473772,17.631809 11.957485,16.183491,15.404332,16.943721,20.49379
chr1 970621 1054485 + PLEKHN1 LO_DATA 1,0,2,0,0 0,0,1,0,0 None,None,None,None,None None,None,None,None,None
```
It lists all introns individually along with their genomic location, status of the test (*OK* or *LO_DATA*), the raw read counts for the intron in all samples, and the model fitted read counts, grouped by condition. If the '--raw-counts-only' option is used, only _raw_ read counts are reported (n.b., this option is compatible with versions of MntJULiP before 1.5). This information can be used, for instance, to further filter introns with low support across all or subsets of the samples, or from genes with low expression levels, or can be used to generate [visualizations](#visualization) such as heatmaps or PCA plots.
It lists all introns individually along with their genomic location, status of the test (*OK*, *LO_DATA* or *NO_OPT* no optimization), the raw read counts for the intron in all samples, and the model fitted read counts, grouped by condition. If the '--raw-counts-only' option is used, only _raw_ read counts are reported (n.b., this option is compatible with versions of MntJULiP before 1.5). This information can be used, for instance, to further filter introns with low support across all or subsets of the samples, or from genes with low expression levels, or can be used to generate [visualizations](#visualization) such as heatmaps or PCA plots.

The file *group_data.txt* contains supporting data for all introns in groups tested by MntJULiP.
```
group_id chrom start end strand gene_name status psi_values(0) psi_values(1) est_psi_values(0) est_psi_values(1)
g000001 chr1 971006 971077 + PLEKHN1 TEST 0.25,0.333333,0.375,0.666667,0.458333 0.454545,0.5,0.44,0.333333,0.478261 0.274425,0.361496,0.404335,0.692248,0.458361 0.484908,0.499894,0.440096,0.333756,0.478239
g000001 chr1 971006 971113 + PLEKHN1 TEST 0.75,0.666667,0.625,0.333333,0.541667 0.545455,0.5,0.56,0.666667,0.521739 0.725575,0.638504,0.595665,0.307752,0.541639 0.515092,0.500106,0.559904,0.666244,0.521761
```
It lists all introns in a group, on separate lines, along with their genomic location, status of the test (*TEST* or *LO_DATA*), and per sample PSI values, both calculated from the raw read counts and estimated by the model, separated by condition. If the '--raw-counts-only' option is used, only PSI values calculated from the _raw_ read counts are reported. This file is new in MntJULiP starting with release 1.5. As with intron data, this information can be used to filter low ratio isoforms or to generate [visualizations](#visualization).
It lists all introns in a group, on separate lines, along with their genomic location, status of the test (*TEST* or *NO_TEST*), and per sample PSI values, both calculated from the raw read counts and estimated by the model, separated by condition. If the '--raw-counts-only' option is used, only PSI values calculated from the _raw_ read counts are reported. This file is new in MntJULiP starting with release 1.5. As with intron data, this information can be used to filter low ratio isoforms or to generate [visualizations](#visualization).

### <a name="visualization"></a> Visualization

Expand Down
207 changes: 29 additions & 178 deletions models.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,12 @@ def NB_model(df, conditions, confounders, model_dir, num_workers=4, count=5, err
p_value, log_likelihood, mus, sigmas, est_counts = result
diff_intron_dict[coord] = [p_value, log_likelihood, mus, sigmas]
pred_intron_dict[coord] = 'LO_DATA'
if p_value is not None and np.any(mus >= 1):
if p_value is not None:# and np.any(mus >= 1):
sig_coords.append(coord)
pred_intron_dict[coord] = 'OK'
if p_value == -1:
pred_intron_dict[coord] = 'NO_OPT'
else:
pred_intron_dict[coord] = 'OK'
if sample_est_count_option==True:
est_count_dict[coord]=est_counts

Expand All @@ -113,10 +116,7 @@ def batch_run_NB_model(ys, conditions, confounders, model_dir, count, aggressive
alt_model = pickle.load(open(Path(model_dir) / 'alt_NB_model.cov.pkl', 'rb'))
sample_mu_model = None
if sample_est_count_option:
if conditions.shape[1]==confounders.shape[1]:
sample_mu_model='raw_cnt'
else:
sample_mu_model = pickle.load(open(Path(model_dir) / 'alt_NB_model.cov.residual.pkl', 'rb'))
sample_mu_model = pickle.load(open(Path(model_dir) / 'alt_NB_model.cov.residual.pkl', 'rb'))
results = []
for y in ys:
results.append(run_NB_model(y, conditions, confounders, count, null_model, alt_model, sample_mu_model, aggressive_mode))
Expand Down Expand Up @@ -177,7 +177,7 @@ def run_NB_model(y, conditions, confounders, count, null_model, alt_model, sampl
try:
fit_null = null_model.optimizing(data=null_data_dict, as_vector=False, init_alpha=1e-5,init={'beta':[0 for _ in range(x_null_col_n)]})
fit_alt = alt_model.optimizing(data=alt_data_dict, as_vector=False, init_alpha=1e-5,init={'beta': [[0 for _ in range(K)] for _ in range(x_alt_col_n)]})
if sample_mu_model!=None and sample_mu_model!='raw_cnt':
if sample_mu_model!=None:
sample_mu_data_dict=alt_data_dict.copy()
sample_mu_data_dict['beta']=fit_alt['par']['beta']
sample_mu_data_dict['residual_sigma']=residual_sigma
Expand All @@ -194,18 +194,18 @@ def run_NB_model(y, conditions, confounders, count, null_model, alt_model, sampl
break

if i == max_optim_n:
return None, None, np.array(means), None, None
return -1, None, np.array(means), None, None
else:
log_likelihood = fit_alt['value'] - fit_null['value']
mus = fit_alt['par']['mu']
if np.any(mus >= 1) ==False:
return None, None, np.array(means), np.array(_vars), None
reciprocal_phi = fit_alt['par']['reciprocal_phi']
sigmas = mus + mus * mus * reciprocal_phi
p_value = 1 - chi2(3 * (K - 1)).cdf(2 * log_likelihood)
est_counts=None
if sample_mu_model=='raw_cnt':
est_counts=y
elif sample_mu_model!=None:
est_counts=(fit_sample_mu['par']['residual'] + z*sample_mu_data_dict['mu'] + z*np.dot(confounders.drop(confounders.columns[range(1,x_alt_col_n)],axis=1), pandas.DataFrame(sample_mu_data_dict['beta']).drop(range(1,x_alt_col_n)))).sum(axis=1)
if sample_mu_model!=None:
est_counts=(fit_sample_mu['par']['residual'] + z*sample_mu_data_dict['mu'] + z*np.dot(confounders, pandas.DataFrame(sample_mu_data_dict['beta']))).sum(axis=1)
est_counts=np.where(est_counts<0,0,est_counts)
return p_value, log_likelihood, mus, sigmas, est_counts

Expand All @@ -217,7 +217,7 @@ def init_null_NB_cov_model(model_dir):
int<lower=1> P;
int<lower=0> y[N];
real<lower=0> mu_raw;
matrix[N,P] x; // New covariate matrix with P columns
matrix[N,P] x;
}
parameters {
real<lower=0, upper=1> theta;
Expand All @@ -230,12 +230,13 @@ def init_null_NB_cov_model(model_dir):
mu ~ normal(mu_raw, sqrt(mu_raw/10) + 1e-4);
reciprocal_phi ~ normal(0,1);
reciprocal_phi ~ cauchy(0,1);
beta[1]~normal(0,sqrt(mu_raw) + 1e-4);
for (p in 2:P){
beta[p]~normal(0,sqrt(mu_raw) + 1e-4);
}
xb=x*beta;
for (n in 1:N) {
Expand Down Expand Up @@ -271,24 +272,26 @@ def init_alt_NB_cov_model(model_dir):
int<lower=0> y[N];
int<lower=0> z[N, K];
vector<lower=0>[K] mu_raw;
matrix[N,P] x; // New covariate matrix with P columns
matrix[N,P] x;
}
parameters {
real<lower=0, upper=1> theta[K];
vector<lower=0>[K] mu;
vector<lower=1e-4>[K] reciprocal_phi;
matrix[P,K] beta;
matrix[P,K] beta;
}
model {
matrix[N,K] xb;
matrix[N,K] xb;
beta[1]~normal(0,sqrt(mu_raw) + 1e-4);
for (p in 2:P){
beta[p]~normal(0,sqrt(mu_raw) + 1e-4);
}
for (p in 1:2){
beta[p]~normal(0,sqrt(mu_raw) + 1e-4);
}
for (p in 3:P){
beta[p]~normal(0,sqrt(mu_raw) + 1e-4);
}
mu ~ normal(mu_raw, sqrt(mu_raw/10)+1e-4);
reciprocal_phi ~ normal(0, 1);
reciprocal_phi ~ cauchy(0, 1);
xb=x*beta;
for (n in 1:N) {
vector[K] lps;
Expand Down Expand Up @@ -349,18 +352,19 @@ def init_alt_NB_cov_residual_model(model_dir):
vector[K] residual[N];
}
model {
matrix[N,K] xb;
matrix[N,K] xb;
xb=x*beta;
for (n in 1:N) {
vector[K] lps;
residual[n]~normal(0,10);
if (y[n] == 0){
for (k in 1:K){
real mu_pos;
residual[n]~normal(0,residual_sigma);
if ((mu[k]+xb[n,k]+residual[n][k])<=0){
mu_pos=0+1e-4;
Expand All @@ -377,7 +381,6 @@ def init_alt_NB_cov_residual_model(model_dir):
for (k in 1:K) {
real mu_pos;
residual[n]~normal(0,residual_sigma);
if ((mu[k]+xb[n,k]+residual[n][k])<=0){
mu_pos=0+1e-4;
Expand All @@ -399,85 +402,6 @@ def init_alt_NB_cov_residual_model(model_dir):
with open(file, 'wb') as f:
pickle.dump(model, f)


def init_null_NB_model(model_dir):
code = """
data {
int<lower=1> N;
int<lower=0> y[N];
real<lower=0> mu_raw;
}
parameters {
real<lower=0, upper=1> theta;
real<lower=0> mu;
real<lower=1e-4> reciprocal_phi;
}
model {
mu ~ normal(mu_raw, sqrt(mu_raw/10)+1e-4);
reciprocal_phi ~ cauchy(0., 5);
for (n in 1:N) {
if (y[n] == 0){
target += log_sum_exp(bernoulli_lpmf(1 | theta),
bernoulli_lpmf(0 | theta)
+ neg_binomial_2_lpmf(y[n] | mu+1e-4, 1./reciprocal_phi));
}
else{
target += bernoulli_lpmf(0 | theta) + neg_binomial_2_lpmf(y[n] | mu+1e-4, 1./reciprocal_phi);
}
}
}
"""
file = f'{model_dir}/null_NB_model.pkl'
extra_compile_args = ['-pthread', '-DSTAN_THREADS']
model = pystan.StanModel(model_code=code, extra_compile_args=extra_compile_args)
with open(file, 'wb') as f:
pickle.dump(model, f)


def init_alt_NB_model(model_dir):
code = """
data {
int<lower=1> N;
int<lower=1> K;
int<lower=0> y[N];
int<lower=0> z[N, K];
vector<lower=0>[K] mu_raw;
}
parameters {
real<lower=0, upper=1> theta[K];
vector<lower=0>[K] mu;
vector<lower=1e-4>[K] reciprocal_phi;
}
model {
mu ~ normal(mu_raw, sqrt(mu_raw/10)+1e-4);
reciprocal_phi ~ cauchy(0., 5);
for (n in 1:N) {
vector[K] lps;
if (y[n] == 0){
for (k in 1:K){
lps[k] = log_sum_exp(bernoulli_lpmf(1 | theta[k]),
bernoulli_lpmf(0 | theta[k])
+ neg_binomial_2_lpmf(y[n] | mu[k]+1e-4, 1./reciprocal_phi[k]));
lps[k] *= z[n][k];
}
}
else{
for (k in 1:K) {
lps[k] = bernoulli_lpmf(0 | theta[k]) + neg_binomial_2_lpmf(y[n] | mu[k]+1e-4, 1./reciprocal_phi[k]);
lps[k] *= z[n][k];
}
}
target += lps;
}
}
"""
file = f'{model_dir}/alt_NB_model.pkl'
extra_compile_args = ['-pthread', '-DSTAN_THREADS']
model = pystan.StanModel(model_code=code, extra_compile_args=extra_compile_args)
with open(file, 'wb') as f:
pickle.dump(model, f)


###############################################################################
def dm_add_q_values(diff_dm_group_dict, error_rate, method):
p_values = [v[1] if v[1] != None else -1 for v in diff_dm_group_dict.values()]
Expand All @@ -503,8 +427,8 @@ def get_splice_site_groups(intron_coords):
def DM_model(df, index_df, conditions, confounders, model_dir, num_workers=4, error_rate=0.05,
method='fdr_bh', batch_size=1000, group_filter=0, aggressive_mode=False, sample_psi_option=False, residual_sigma=10):

#_df = df.drop(['label'], axis=1)
_df = df[df['label'] == 1].drop(['label'], axis=1)
# per sample psis
_df_dmfilter = df[df['label'] == 1].drop(['label'], axis=1)
indices_dmfilter=_df_dmfilter.index.tolist()
_index_df_dmfilter = index_df.loc[index_df['index'].isin(indices_dmfilter)]
Expand Down Expand Up @@ -870,76 +794,3 @@ def init_DM_beta_reparam_cov_residual_model(model_dir):
with open(file, 'wb') as f:
pickle.dump(model, f)

def init_null_DM_model(model_dir):
code = """
functions {
real dirichlet_multinomial_lpmf(int[] y, vector alpha) {
real alpha_plus = sum(alpha);
return lgamma(alpha_plus) + sum(lgamma(alpha + to_vector(y)))
- lgamma(alpha_plus + sum(y)) - sum(lgamma(alpha));
}
}
data {
int<lower=1> N;
int<lower=1> M;
int<lower=0> y[N, M];
real<lower=0> conc_mu;
real<lower=0> conc_std;
}
parameters {
simplex[M] alpha;
real<lower=0> conc;
}
model {
conc ~ normal(conc_mu, conc_std);
for (n in 1:N) {
target += dirichlet_multinomial_lpmf(y[n] | conc * alpha);
}
}
"""
file = f'{model_dir}/null_DM_model.pkl'
extra_compile_args = ['-pthread', '-DSTAN_THREADS']
model = pystan.StanModel(model_code=code, extra_compile_args=extra_compile_args)
with open(file, 'wb') as f:
pickle.dump(model, f)


def init_alt_DM_model(model_dir):
code = """
functions {
real dirichlet_multinomial_lpmf(int[] y, vector alpha) {
real alpha_plus = sum(alpha);
return lgamma(alpha_plus) + sum(lgamma(alpha + to_vector(y)))
- lgamma(alpha_plus + sum(y)) - sum(lgamma(alpha));
}
}
data {
int<lower=1> N;
int<lower=1> M;
int<lower=1> K;
int<lower=0> y[N, M];
int<lower=0> z[N, K];
real<lower=0> conc_mu;
real<lower=0> conc_std;
}
parameters {
simplex[M] alpha[K];
real<lower=0> conc;
}
model {
conc ~ normal(conc_mu, conc_std);
for (n in 1:N) {
vector[K] lps;
for (k in 1:K){
lps[k] = dirichlet_multinomial_lpmf(y[n] | conc * alpha[k]);
lps[k] *= z[n][k];
}
target += lps;
}
}
"""
file = f'{model_dir}/alt_DM_model.pkl'
extra_compile_args = ['-pthread', '-DSTAN_THREADS']
model = pystan.StanModel(model_code=code, extra_compile_args=extra_compile_args)
with open(file, 'wb') as f:
pickle.dump(model, f)
10 changes: 6 additions & 4 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,9 +290,11 @@ def write_pred_intron_file(df, conditions, labels, pred_intron_dict, out_dir, es
_list += [','.join(np.take(y, i).astype(str).tolist()) for i in indices]
if est_count_dict!=None:
est_y = est_count_rows[i]
if None not in est_y:
if est_y.count(None) == len(est_y):
_list += ['None' for i in indices]
else:
est_y = np.around(est_y, 6)
_list += [','.join(np.take(est_y, i).astype(str).tolist()) for i in indices]
_list += [','.join(np.take(est_y, i).astype(str).tolist()) for i in indices]
f.write('\t'.join(_list) + '\n')


Expand All @@ -309,10 +311,10 @@ def write_diff_nb_intron_file(labels, diff_nb_intron_dict, out_dir, anno_info=No
f.write('\t'.join(_list) + '\n')
for coord, value in diff_nb_intron_dict.items():
p_value, log_likelihood, mus, sigmas, q_value = value
str_p_value = f"{p_value:.6g}" if p_value is not None else 'NA'
str_p_value = 'NA' if (p_value is None or p_value == -1) else f"{p_value:.6g}"
str_q_value = f"{q_value:.6g}" if q_value is not None else 'NA'
str_log_likelihood = f"{log_likelihood:.6g}" if log_likelihood is not None else 'NA'
status = 'TEST' if p_value is not None else 'NO_TEST'
status = 'NO_TEST' if (p_value is None or p_value == -1) else 'TEST'
gene_names = get_gene_names(anno_info, coord) if anno_info else '.'
_chr, strand, start, end = coord
_list = [_chr, str(start), str(end), strand, gene_names, status, str_log_likelihood, str_p_value, str_q_value]
Expand Down

0 comments on commit 4f1b825

Please sign in to comment.