In [1]:
import itertools

import pandas as pd

from datasets import (load_CCSB_YI1_search_space,
                      load_ito_search_space,
                      load_non_dubious_orfs,
                      load_YeRI_search_space)

# ORF names that every search space below is restricted to.
valid_orfs = load_non_dubious_orfs()

# CCSB-YI1 search space: loaded once (the original cell called the loader
# twice) and restricted to ORFs present in `valid_orfs`.
yi1_ss = load_CCSB_YI1_search_space()
yi1_ss = yi1_ss.loc[yi1_ss['orf_name'].isin(valid_orfs)]

# Every unordered pair of the retained CCSB-YI1 ORF names, one row per pair.
# Names are sorted before pairing, so within a row 'orf_name_a' sorts no later
# than 'orf_name_b'. itertools.combinations(..., 2) never pairs an element
# with itself, so this table contains no self-pairs.
# NOTE: the table has n*(n-1)/2 rows for n retained ORFs, which is large.
yu = pd.DataFrame(
    data=itertools.combinations(yi1_ss['orf_name'].sort_values().values, 2),
    columns=['orf_name_a', 'orf_name_b'],
)

# Ito search space: the loader returns two collections (ORFs screened as AD,
# ORFs screened as DB). Build one row per ORF appearing in either collection,
# with a membership flag for each.
ito_ads, ito_dbs = load_ito_search_space()
ito_ss = pd.DataFrame(
    data=[(orf, orf in ito_ads, orf in ito_dbs) for orf in ito_ads.union(ito_dbs)],
    columns=['orf_name', 'screened_as_AD', 'screened_as_DB'],
)
ito_ss = ito_ss.loc[ito_ss['orf_name'].isin(valid_orfs)]

# YeRI search space, restricted the same way.
yeri_ss = load_YeRI_search_space()
yeri_ss = yeri_ss.loc[yeri_ss['orf_name'].isin(valid_orfs)]

For the Uetz screen, we don't have the exact list of cloned ORFs, but their paper reports that 5,345 out of 6,144 were successfully cloned as both AD and DB.

In [2]:
def _count_unordered_pairs(ad_names, db_names):
    """Count distinct unordered {AD, DB} name pairs, self-pairs included.

    Gives the same number as building a set of ``frozenset([ad, db])`` over
    every AD x DB combination, but without materialising those combinations.

    Parameters
    ----------
    ad_names, db_names : iterable of hashable
        Names screened as AD and as DB. Duplicates within either iterable are
        collapsed, exactly as the set-of-frozensets construction would do.

    Returns
    -------
    int
        ``|A| * |B| - C(|A & B|, 2)`` where A and B are the unique names.
        Of the ``|A| * |B|`` ordered pairs, (x, y) and (y, x) describe the
        same unordered pair only when x != y and both names are in A and in
        B; there are C(|A & B|, 2) such pairs and each was counted twice.
    """
    ad_unique = set(ad_names)
    db_unique = set(db_names)
    n_both = len(ad_unique & db_unique)
    return len(ad_unique) * len(db_unique) - n_both * (n_both - 1) // 2


n_orfs = len(valid_orfs)
# Unordered pairs including self-pairs: n*(n+1)/2.
ss_total_orientation_independent = n_orfs**2 / 2 + n_orfs/2
# Ordered (AD, DB) pairs: n*n.
ss_total_orientation_dependent = n_orfs**2

print(f"{n_orfs} total protein-coding yeast ORFs")
print(f"{int(ss_total_orientation_independent)} pairwise combinations ignoring orientation and including homodimers")
print()

# NOTE: the multi-argument print calls below rely on print's default
# separator (' '), which puts a leading space on every continuation line.
# This is kept as-is so the printed text is unchanged.
for ss, name in [(ito_ss, 'ITO'), (yi1_ss, 'CCSB-YI1'), (yeri_ss, 'YeRI')]:
    # Row counts of the boolean flag columns.
    n_ad = ss['screened_as_AD'].sum()
    n_db = ss['screened_as_DB'].sum()
    print(
        f"{name} total ORFs screened: {ss.shape[0]}\n",
        f"{name} screened as AD: {n_ad}\n",
        f"{name} screened as DB: {n_db}",
    )
    # Closed-form count; avoids building one frozenset per AD x DB combination.
    pw = _count_unordered_pairs(ss.loc[ss['screened_as_AD'], 'orf_name'],
                                ss.loc[ss['screened_as_DB'], 'orf_name'])
    print(f"{name} {pw} pairwise combinations screened in at least one orientation\n",
          f"{name} {pw / ss_total_orientation_independent:.2%} of total\n",
          f"{name} {n_ad * n_db} orientation dependent pairwise combinations\n",
          f"{name} {(n_ad * n_db) / ss_total_orientation_dependent:.2%} of total\n",

          )
    print()

5883 total protein-coding yeast ORFs
17307786 pairwise combinations ignoring orientation and including homodimers

ITO total ORFs screened: 5178
 ITO screened as AD: 5011
 ITO screened as DB: 4660
ITO 13259982 pairwise combinations screened in at least one orientation
 ITO 76.61% of total
 ITO 23351260 orientation dependent pairwise combinations
 ITO 67.47% of total


CCSB-YI1 total ORFs screened: 4883
 CCSB-YI1 screened as AD: 4659
 CCSB-YI1 screened as DB: 3425
CCSB-YI1 10835475 pairwise combinations screened in at least one orientation
 CCSB-YI1 62.60% of total
 CCSB-YI1 15957075 orientation dependent pairwise combinations
 CCSB-YI1 46.11% of total


YeRI total ORFs screened: 5854
 YeRI screened as AD: 5700
 YeRI screened as DB: 4778
YeRI 16135484 pairwise combinations screened in at least one orientation
 YeRI 93.23% of total
 YeRI 27234600 orientation dependent pairwise combinations
 YeRI 78.69% of total


