In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
# Notebook-wide pandas display settings.
# NOTE(review): 0 is passed for both limits - presumably to avoid truncating
# wide cells / long frames; confirm the meaning of 0 against the pandas
# options documentation for the installed version.
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', 0)

import pickle
import sys
import os
import json

# Add the parent directory to the import search path. The membership check
# keeps re-running this cell from appending a duplicate entry.
# NOTE(review): presumably needed so the project package imported in the
# following cells resolves - confirm.
if ".." not in sys.path:
    sys.path.append("..")
    
from typing import Dict, Tuple, Sequence

In [2]:
from pp5 import OUT_DIR

# Significance cut-off applied to p-values below (0.05, doubled).
# NOTE(review): the reason for the factor of 2 is not stated here.
PVAL_THRESH = 0.05*2

# Root directory holding the collected datasets
PREC_OUT_DIR = OUT_DIR / "prec-collected"

# Path components that locate the result files inside each dataset directory
RESULTS_TAG = "pointwise_cdist-bs_200-n_2-k_500"
INTERMEDIATE_DIR = "_intermediate_"

# One directory per collected dataset
DATASET_DIRS = tuple(
    PREC_OUT_DIR / ds_dirname
    for ds_dirname in (
        "20210102_115634-aida-ex_EC-src_EC-r18",
        "20210102_112030-aida-ex_EC-src_HS-r18",
        "20210102_112750-aida-ex_EC-src_ALL-r18",
    )
)


### Dataset stats

In [3]:
# Dataset tag -> metadata dict (contents of meta.json minus its "out_tag" entry)
datasets_meta = {}

# Dataset tag -> its directory of intermediate result files
dataset_result_dirs: Dict[str, Path] = {}

for ds_dir in DATASET_DIRS:
    meta_path = ds_dir / "meta.json"
    ds_meta = json.loads(meta_path.read_text())

    # The tag is removed from the metadata and used as the key instead
    tag = ds_meta.pop("out_tag")
    result_dir = ds_dir / "results" / RESULTS_TAG / INTERMEDIATE_DIR

    datasets_meta[tag] = ds_meta
    dataset_result_dirs[tag] = result_dir

    # Fail early if the expected results directory is missing
    assert result_dir.is_dir()

In [4]:
# Tabulate the metadata: one column per dataset tag, one row per metadata field
df_datasets_meta = pd.DataFrame(datasets_meta)
df_datasets_meta

Unnamed: 0,ex_EC-src_EC,ex_EC-src_HS,ex_EC-src_ALL
id,20210102_115634-aida-ex_EC-src_EC,20210102_112030-aida-ex_EC-src_HS,20210102_112750-aida-ex_EC-src_ALL
hostname,aida,aida,aida
async_timeout,60,60,60
create_zip,False,False,False
out_dir,out/prec-collected/20210102_115634-aida-ex_EC-src_EC,out/prec-collected/20210102_112030-aida-ex_EC-src_HS,out/prec-collected/20210102_112750-aida-ex_EC-src_ALL
query,(('X-Ray Resolution' LESS_OR_EQUAL '1.8') AND ('Method' EXACT_MATCH 'X-RAY DIFFRACTION')) AND ('Expression System' CONTAINS_PHRASE 'Escherichia Coli') AND ('Source Organism Taxonomy ID' EXACT_MATCH '562'),(('X-Ray Resolution' LESS_OR_EQUAL '1.8') AND ('Method' EXACT_MATCH 'X-RAY DIFFRACTION')) AND ('Expression System' CONTAINS_PHRASE 'Escherichia Coli') AND ('Source Organism Taxonomy ID' EXACT_MATCH '9606'),(('X-Ray Resolution' LESS_OR_EQUAL '1.8') AND ('Method' EXACT_MATCH 'X-RAY DIFFRACTION')) AND ('Expression System' CONTAINS_PHRASE 'Escherichia Coli')
prec_init_args,{},{},{}
prec_out_dir,out/prec,out/prec,out/prec
write_csv,False,False,False
n_query_results,2318,11442,40006


In [5]:
# Show the resolved result directory of each dataset tag
dataset_result_dirs

{'ex_EC-src_EC': PosixPath('/Users/aviv/dev/phd/proteins/out/prec-collected/20210102_115634-aida-ex_EC-src_EC-r18/results/pointwise_cdist-bs_200-n_2-k_500/_intermediate_'),
 'ex_EC-src_HS': PosixPath('/Users/aviv/dev/phd/proteins/out/prec-collected/20210102_112030-aida-ex_EC-src_HS-r18/results/pointwise_cdist-bs_200-n_2-k_500/_intermediate_'),
 'ex_EC-src_ALL': PosixPath('/Users/aviv/dev/phd/proteins/out/prec-collected/20210102_112750-aida-ex_EC-src_ALL-r18/results/pointwise_cdist-bs_200-n_2-k_500/_intermediate_')}

### Codon vs. Codon

In [6]:
def load_results(
    dataset_result_dirs: Dict[str, Path], result_names: Sequence[str]
) -> Dict[str, Sequence[Dict]]:
    """
    Load pickled result files for every dataset.

    Each file is expected to unpickle to a dict; only element ``[0]`` of every
    value in that dict is kept.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so this
    must only be pointed at trusted result files.

    :param dataset_result_dirs: Maps a dataset tag to the directory containing
        its result files.
    :param result_names: File names to load from each dataset directory.
    :return: Maps each dataset tag to a list of loaded dicts, one per entry of
        ``result_names`` and in the same order.
    :raises AssertionError: If a requested file does not exist.
    """
    collected: Dict[str, Sequence[Dict]] = {}

    for ds_tag, ds_dir in dataset_result_dirs.items():
        per_dataset = []

        for result_name in result_names:
            pkl_path = ds_dir.joinpath(result_name)
            assert pkl_path.is_file(), f"{ds_tag=} {result_name=}"

            with open(pkl_path, 'rb') as fh:
                raw = pickle.load(fh)

            # Keep only the first element of each stored value
            per_dataset.append({key: value[0] for key, value in raw.items()})

        collected[ds_tag] = per_dataset

    return collected

In [7]:
# File names of the pickled codon-vs-codon results within each dataset's
# result directory
CODON_DIHEDRAL_PVALS = "codon-dihedral-pvals.pkl"
CODON_DIHEDRAL_T2S = "codon-dihedral-t2s.pkl"

# Dataset tag -> [pval results, t2 results] (same order as the names passed in)
dataset_codon_dists: Dict[str, Sequence[Dict]] = load_results(
    dataset_result_dirs, [CODON_DIHEDRAL_PVALS, CODON_DIHEDRAL_T2S]
)

In [8]:
from pp5.codons import SYN_CODON_IDX_UNIQ, AA_CODONS, ACIDS

# Index pairs to compare, as an array so it can be filtered with a boolean mask
SYN_CODON_IDX = np.array(SYN_CODON_IDX_UNIQ)

# (dataset tag, ss type) -> list of row dicts, one per significant codon pair
results = {}

for ds_tag, (ds_pvals, ds_t2s) in dataset_codon_dists.items():

    # p-values and t2 statistics must cover the same keys
    assert ds_pvals.keys() == ds_t2s.keys()

    for ss_type, ds_ss_pvals in ds_pvals.items():
        ds_ss_t2 = ds_t2s[ss_type]

        # Transposing yields (first indices, second indices), i.e. a fancy
        # index that picks one p-value per pair.
        syn_codon_pvals = ds_ss_pvals[tuple(SYN_CODON_IDX.T)]

        # Keep only the pairs whose p-value is within the threshold
        is_significant = syn_codon_pvals <= PVAL_THRESH
        syn_significant_idx = SYN_CODON_IDX[is_significant]

        results[(ds_tag, ss_type)] = [
            {
                "codon1": AA_CODONS[aac1],
                "codon2": AA_CODONS[aac2],
                "pval": ds_ss_pvals[aac1, aac2],
                "t2": ds_ss_t2[aac1, aac2],
            }
            for aac1, aac2 in syn_significant_idx
        ]

In [9]:
import itertools as it

def results_to_df(
    results: Dict[Tuple, Sequence[Dict]], index_col_names: Sequence[str]
) -> pd.DataFrame:
    """
    Flatten grouped result records into a single DataFrame with a MultiIndex.

    :param results: Maps an index key (a tuple with one entry per index level)
        to a sequence of row dicts that share that key. Keys with an empty
        sequence contribute no rows.
    :param index_col_names: Names of the index levels, one per key entry. Any
        sequence (list, tuple, ...) is accepted.
    :return: A DataFrame with one row per dict, indexed by the key the row
        belongs to, and sorted by the index levels.
    """
    # Normalize to a list: DataFrame.sort_values treats a non-list `by` (e.g.
    # a tuple) as ONE label, which would raise a KeyError here.
    index_col_names = list(index_col_names)

    # Repeat each key once per row it owns, so index and data line up
    index = [k for k, v in results.items() for _ in v]
    index = pd.MultiIndex.from_tuples(index, names=index_col_names)

    data = it.chain.from_iterable(results.values())
    df = pd.DataFrame(data=data, index=index)

    # `by` refers to index level names here, not to columns
    return df.sort_values(by=index_col_names)
    

In [10]:
# Names of the two index levels: dataset tag and secondary-structure type
index_col_names = ["Dataset", "SS"]

df_pvals = results_to_df(results, index_col_names=index_col_names)

# Within each (dataset, ss type) group, order rows by ascending p-value
df_pvals = df_pvals.sort_values(by=[*index_col_names,"pval"])

# to_csv does not create missing directories, so make sure the relative
# output directory exists; otherwise this cell fails on a fresh checkout.
pvals_csv_path = Path("out") / f"pointwise_pvals_{PVAL_THRESH}.csv"
pvals_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_pvals.to_csv(pvals_csv_path, float_format="%.3f")
df_pvals

Unnamed: 0_level_0,Unnamed: 1_level_0,codon1,codon2,pval,t2
Dataset,SS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ex_EC-src_ALL,HELIX,L-CTC,L-TTG,0.034,3.853600
ex_EC-src_ALL,HELIX,R-CGA,R-CGT,0.050,3.353872
ex_EC-src_ALL,HELIX,S-TCC,S-TCT,0.052,3.306922
ex_EC-src_ALL,HELIX,H-CAT,H-CAT,0.070,2.767044
ex_EC-src_ALL,HELIX,T-ACA,T-ACG,0.078,3.313507
ex_EC-src_ALL,HELIX,S-AGT,S-TCC,0.094,2.509614
ex_EC-src_ALL,HELIX,S-TCA,S-TCC,0.098,2.497135
ex_EC-src_ALL,HELIX,L-CTA,L-CTC,0.098,2.422890
ex_EC-src_ALL,OTHER,A-GCA,A-GCC,0.008,6.154318
ex_EC-src_ALL,OTHER,L-TTA,L-TTG,0.012,5.840215


### AA vs. Codon

In [11]:
# File names of the pickled AA-vs-codon results within each dataset's result
# directory
AAC_DIHEDRAL_PVALS = "aac-dihedral-pvals.pkl"
AAC_DIHEDRAL_T2S = "aac-dihedral-t2s.pkl"

# Dataset tag -> [pval results, t2 results] (same order as the names passed in)
dataset_aac_dists: Dict[str, Sequence[Dict]] = load_results(
    dataset_result_dirs, [AAC_DIHEDRAL_PVALS, AAC_DIHEDRAL_T2S]
)

In [12]:
# Smallest p-value (ignoring NaNs) per key, taken from the first loaded result
# (the p-values file) of the 'ex_EC-src_ALL' dataset.
_all_ds_aac_pvals = dataset_aac_dists['ex_EC-src_ALL'][0]
{ss_type: np.nanmin(pvals) for ss_type, pvals in _all_ds_aac_pvals.items()}

{'HELIX': 0.102, 'OTHER': 0.036, 'SHEET': 0.09, 'TURN': 0.048}

In [13]:
from pp5.codons import ACIDS, AA_CODONS

# All (amino acid, codon) pairs where the codon name starts with that acid
AA_AAC = [
    (aa, codon)
    for aa in ACIDS
    for codon in AA_CODONS
    if codon[0] == aa
]

# (dataset tag, ss type) -> list of row dicts, one per significant (AA, codon)
results_aac = {}

for ds_tag, (ds_pvals, ds_t2s) in dataset_aac_dists.items():
    # p-values and t2 statistics must cover the same keys
    assert ds_pvals.keys() == ds_t2s.keys()

    for ss_type, ds_ss_pvals in ds_pvals.items():
        ds_ss_t2s = ds_t2s[ss_type]

        # np.where gives one index array per axis: (aa indices, codon indices).
        # Pair them up as [(aa1, codon1), (aa2, codon2), ...]
        aa_hits, codon_hits = np.where(ds_ss_pvals <= PVAL_THRESH)
        aac_significant_idx = list(zip(aa_hits, codon_hits))

        results_aac[(ds_tag, ss_type)] = [
            {
                "AA": ACIDS[aa],
                "codon": AA_CODONS[codon],
                "pval": ds_ss_pvals[aa, codon],
                "t2": ds_ss_t2s[aa, codon],
            }
            for aa, codon in aac_significant_idx
        ]

In [14]:
df_aac_pvals = results_to_df(results_aac, index_col_names=index_col_names)

# Within each (dataset, ss type) group, order rows by ascending p-value
df_aac_pvals = df_aac_pvals.sort_values(by=[*index_col_names,"pval"])

# to_csv does not create missing directories, so make sure the relative
# output directory exists; otherwise this cell fails on a fresh checkout.
aac_pvals_csv_path = Path("out") / f"pointwise_aac_pvals_{PVAL_THRESH}.csv"
aac_pvals_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_aac_pvals.to_csv(aac_pvals_csv_path, float_format="%.3f")
df_aac_pvals

Unnamed: 0_level_0,Unnamed: 1_level_0,AA,codon,pval,t2
Dataset,SS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ex_EC-src_ALL,OTHER,A,A-GCT,0.036,3.709765
ex_EC-src_ALL,OTHER,T,T-ACA,0.08,2.728593
ex_EC-src_ALL,SHEET,S,S-AGC,0.09,2.175861
ex_EC-src_ALL,TURN,S,S-TCT,0.048,2.861528
ex_EC-src_ALL,TURN,K,K-AAA,0.072,2.719128
ex_EC-src_ALL,TURN,S,S-TCA,0.072,2.757495
ex_EC-src_ALL,TURN,H,H-CAC,0.08,2.487195
ex_EC-src_ALL,TURN,V,V-GTA,0.092,2.785985
ex_EC-src_EC,HELIX,L,L-CTC,0.0,7.525026
ex_EC-src_EC,HELIX,T,T-ACG,0.042,3.925222


In [15]:
# Sanity check: p-value range (ignoring NaNs) for every dataset and
# secondary-structure type. The dataset tag is part of each line so that the
# lines of different datasets can be told apart.
for ds_tag, (ds_pvals, ds_t2s) in dataset_codon_dists.items():
    for ss_type, ds_ss_pvals in ds_pvals.items():
        print(
            f"{ds_tag=} {ss_type=} "
            f"min={np.nanmin(ds_ss_pvals)}, max={np.nanmax(ds_ss_pvals)}"
        )

ss_type='HELIX' min=0.0, max=1.0
ss_type='OTHER' min=0.0, max=1.0
ss_type='SHEET' min=0.0, max=1.0
ss_type='TURN' min=0.0, max=1.0
ss_type='HELIX' min=0.0, max=1.0
ss_type='OTHER' min=0.0, max=1.0
ss_type='SHEET' min=0.0, max=1.0
ss_type='TURN' min=0.0, max=1.0
ss_type='HELIX' min=0.0, max=0.9700000286102295
ss_type='OTHER' min=0.0, max=1.0
ss_type='SHEET' min=0.0, max=1.0
ss_type='TURN' min=0.0, max=1.0
