In [1]:
import math
import string
import itertools
from collections import defaultdict

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

from data_loading import (load_isoform_and_paralog_y2h_data,
                          load_y2h_isoform_data,
                          load_y2h_paralogs_additional_data,
                          load_paralog_pairs)

In [2]:
y2h = load_isoform_and_paralog_y2h_data()
y2h_para = load_y2h_paralogs_additional_data()
y2h_para = y2h_para.loc[y2h_para['at_least_2_isoforms'] & y2h_para['at_least_2_partners'], :]
pairs = load_paralog_pairs()

# could also try with valid clones dataset
y2h['at_least_2_isoforms'] = y2h['ad_gene_symbol'].map(y2h.loc[y2h['category'] == 'tf_isoform_ppis', :]
                                                          .groupby('ad_gene_symbol')
                                                          ['ad_clone_acc']
                                                          .nunique() >= 2)
y2h['at_least_2_partners'] = y2h['ad_gene_symbol'].map(y2h.loc[y2h['category'] == 'tf_isoform_ppis', :]
                                                          .groupby('ad_gene_symbol')
                                                          ['db_gene_symbol']
                                                          .nunique() >= 2)
y2h_iso = y2h.loc[(y2h['category'] == 'tf_isoform_ppis') &
                  y2h['at_least_2_isoforms'] &
                  y2h['at_least_2_partners'],
                  :]

In [3]:
filtered_pairs = y2h_para.loc[y2h_para['at_least_2_isoforms'] & 
             y2h_para['at_least_2_partners'],
             ['ad_gene_symbol', 'paired_tf_gene']].drop_duplicates()
filtered_pairs = {frozenset([row['ad_gene_symbol'], g]) for _i, row in filtered_pairs.iterrows() for g in row['paired_tf_gene'].split('|')}

In [4]:
pairs['additional_tested'] = pairs.apply(lambda x: frozenset([x['tf_gene_a'], x['tf_gene_b']]) in filtered_pairs,
            axis=1)

In [5]:
# check for pairs where one set of partners is a subset of the other
iso_partners = y2h_iso.groupby('ad_gene_symbol')['db_gene_symbol'].apply(set)
pairs['partners_a'] = pairs['tf_gene_a'].map(iso_partners)
pairs['partners_b'] = pairs['tf_gene_b'].map(iso_partners)
pairs['same_tested_partners'] = (pairs['partners_a'] == pairs['partners_b'])

In [6]:
pairs.loc[pairs['additional_tested'] | pairs['same_tested_partners'],
          ['tf_gene_a', 'tf_gene_b', 'is_paralog_pair']].to_csv('../cache/paralog_pairs_filtered.tsv', sep='\t', index=False)

In [7]:
pairs.loc[pairs['additional_tested'] | pairs['same_tested_partners'],
          ['tf_gene_a', 'tf_gene_b', 'is_paralog_pair']]['is_paralog_pair'].value_counts()

True     84
False    12
Name: is_paralog_pair, dtype: int64

In [8]:
non_zero_iso = (set(y2h_iso.loc[y2h_iso['score'] == '1', 'ad_clone_acc'].unique())
                .union(
                    set(y2h_para.loc[y2h_para['score'] == '1', 'ad_clone_acc'].unique())
                ))

In [9]:
y2h_iso['non_zero_iso'] = y2h_iso['ad_clone_acc'].isin(non_zero_iso)
y2h_para['non_zero_iso'] = y2h_para['ad_clone_acc'].isin(non_zero_iso)

In [10]:
y2h_iso['at_least_2_non_zero_isoforms'] = (y2h_iso['ad_gene_symbol']
                                            .map(y2h_iso.loc[y2h_iso['non_zero_iso'], :]
                                                  .groupby('ad_gene_symbol')
                                                  ['ad_clone_acc']
                                                  .nunique() >= 2))

In [11]:
y2h_iso['at_least_1_positive_per_partner'] = (y2h_iso.groupby(['ad_gene_symbol', 'db_gene_symbol'])
                                           ['score']
                                          .transform(lambda row: (row == '1').any()))
# and at least two partners after excluding those without a postiive
y2h_iso['at_least_1_positive_per_partner'] = (y2h_iso['at_least_1_positive_per_partner'] &
                                         y2h_iso['ad_gene_symbol'].map(
                                             y2h_iso.loc[y2h_iso['at_least_1_positive_per_partner'],
                                                     :].groupby('ad_gene_symbol')['db_gene_symbol'].nunique() >= 2))

In [12]:
(y2h_iso['at_least_1_positive_per_partner'] &
 y2h_iso['at_least_2_non_zero_isoforms']).sum()

3993

In [13]:
non_zero_gene_ppis = set((y2h_iso.loc[y2h_iso['at_least_1_positive_per_partner'] &
                                y2h_iso['at_least_2_non_zero_isoforms'],
                                ['ad_gene_symbol', 'db_gene_symbol']]
                                .drop_duplicates()
                                .apply(lambda x: x['ad_gene_symbol'] + '_' + x['db_gene_symbol'],
                                        axis=1)).values)
y2h_para['matches_non_zero_pair'] = y2h_para.apply(lambda x: any((g + '_' + x['db_gene_symbol']) in non_zero_gene_ppis
                             for g in x['paired_tf_gene'].split('|')),
               axis=1)

In [14]:
(y2h_para['matches_non_zero_pair'] & y2h_para['non_zero_iso']).sum()

2533

In [15]:
print(y2h_iso.shape[0] + y2h_para.shape[0], 'total pairs')
print((y2h_iso['non_zero_iso'].sum() + y2h_para['non_zero_iso'].sum()),
      'pairs, excluding isoforms with no PPIs')

9758 total pairs
7470 pairs, excluding isoforms with no PPIs


In [16]:
columns = ['ad_orf_id', 'ad_clone_acc', 'ad_gene_symbol', 'db_orf_id', 'db_gene_symbol']
df = pd.concat([y2h_iso.loc[:, columns],
                y2h_para.loc[:, columns]])
gene_sizes = (df.groupby('ad_gene_symbol')[['ad_clone_acc',
        'db_gene_symbol']]
        .nunique()
        .rename(columns={'ad_clone_acc': 'n_isoforms',
                                                     'db_gene_symbol': 'n_partners'})
                                                     .sort_values(['n_isoforms', 'n_partners'],
                                                                  ascending=False))
df['ad_clone_acc'] = df['ad_clone_acc'].apply(lambda x: x.split('|')[0] + '-' + x.split('|')[1].split('/')[0])
df = df.sort_values(['ad_gene_symbol' , 'ad_clone_acc', 'db_gene_symbol'])
if df.duplicated().any():
    raise UserWarning('Unexpected duplicate rows in table')
gene_sizes.head()
orf_id_map = y2h.drop_duplicates(['ad_clone_acc']).set_index('ad_clone_acc')['ad_orf_id'].to_dict()
orf_id_map = {x.split('|')[0] + '-' + x.split('|')[1].split('/')[0]: v for x, v in orf_id_map.items()}
orf_id_map.update(y2h.drop_duplicates(['db_gene_symbol']).set_index('db_gene_symbol')['db_orf_id'].to_dict())

In [17]:
class Plate:
    def __init__(self, n_rows, n_columns, empty_name='empty'):
        self.n_rows = n_rows
        self.n_columns = n_columns
        self.empty_name = empty_name
        self.grid = [[empty_name for i in range(n_columns)] for j in range(n_rows)]

    def add_matrix(self, matrix, pos_top_left, transpose=False):
        n_rows = len(matrix)
        n_columns = len(matrix[0])
        if n_rows + pos_top_left[0] > self.n_rows or n_columns + pos_top_left[1] > self.n_columns:
            raise ValueError('Does not fit: {} rows {} columns, top left position: {} \n{}'.format(n_rows,
                                                                  n_columns,
                                                                  pos_top_left,
                                                                  self))
        for i, row in enumerate(matrix):
            row_index = i + pos_top_left[0]
            for j, well in enumerate(row):
                column_index = j + pos_top_left[1]
                if self.grid[row_index][column_index] != self.empty_name:
                    raise ValueError('Well already occupied: {} {}\n{}'.format(row_index, column_index, self))
                self.grid[row_index][column_index] = well

    def lowest_unoccupied_row(self):
        for i in range(self.n_rows):
            if self.grid[i] == [self.empty_name] * self.n_columns:
                return i
        return None

    def leftmost_unoccupied_column(self):
        for i in range(self.n_columns):
            if [row[i] for row in self.grid] == [self.empty_name] * self.n_rows:
                return i
        return None

    def is_empty(self):
        return all(all(x == self.empty_name for x in row) for row in self.grid)

    def is_full(self):
        return all(all(x != self.empty_name for x in row) for row in self.grid)

    def row_is_empty(self, row_index):
        return all(x == self.empty_name for x in self.grid[row_index])

    def column_is_empty(self, column_index):
        return all(row[column_index] == self.empty_name for row in self.grid)

    def empty_wells(self):
        return {string.ascii_uppercase[i] + str(j + 1).zfill(2) for i, row in enumerate(self.grid) 
                                                                for j, x in enumerate(row) 
                                                                if x == self.empty_name}

    def add_empty_row_below(self):
        self.grid.append([self.empty_name] * self.n_columns)
        self.n_rows += 1

    def empty_rectangles_from_bottom_right(self):
        """Starting from the bottom right corner, all different rectangles of empty space,
        which are not contained within another. Assumes plate is filled from top left.
        
            Returns [(int, int)]: number of rows, number of columns
        
        """
        if self.is_empty():
            return [(self.n_rows, self.n_columns)]
        if self.is_full():
            return []

        def count_empty_cells_from_right(row):
            for i in range(1, self.n_columns + 1):
                if row[-i] != self.empty_name:
                    return i - 1
            else:
                return self.n_columns

        rectangles = []
        n_columns = count_empty_cells_from_right(self.grid[-1])  #sum(x == self.empty_name for x in self.grid[-1])
        for row_index in reversed(range(self.n_rows)):
            n_empty_cells = count_empty_cells_from_right(self.grid[row_index])  #sum(x == self.empty_name for x in self.grid[row_index])
            if n_empty_cells == 0:
                break
            if n_empty_cells < n_columns:
                rectangles.insert(0, ((self.n_rows - row_index) - 1,  n_columns))
                n_columns = n_empty_cells
        else:
            rectangles.insert(0, (self.n_rows,  n_columns))
        return rectangles

    def xo_picture(self):
        print('\n'.join('|'.join('O' if cell == self.empty_name else 'X' for cell in row) for row in self.grid))

    def __repr__(self):
        return '\n'.join('|'.join(row) for row in self.grid)

In [18]:
def gene_fits_on_this_plate(plate, pair_matrix):
    if plate.is_empty():
        return True
    n_row_gene = len(pair_matrix)
    n_col_gene = len(pair_matrix[0])
    if n_col_gene > plate.n_columns or n_row_gene > plate.n_rows:
        return False  # only start genes too large to fit on a single row on an empty plate

    for n_row_empty_spot, n_col_empty_spot in plate.empty_rectangles_from_bottom_right():
        if  n_col_gene <= n_col_empty_spot and n_row_gene <= n_row_empty_spot:
            return True
    return False


def add_gene_to_plates(plates, pair_matrix):
    row_max = plates[-1].n_rows
    column_max = plates[-1].n_columns
    n_row_gene = len(pair_matrix)
    n_col_gene = len(pair_matrix[0])
    if n_row_gene > plates[-1].n_rows:  # rotate matrix for genes with too many isoforms to fit on a plate
        row_max_transpose = row_max + 1
        pair_matrix = [[pair_matrix[i][j] for i in range(len(pair_matrix))] for j in range(len(pair_matrix[0]))]
        n_row_gene = len(pair_matrix)
        n_cols = len(pair_matrix[0])
        n_rows_split = [row_max_transpose] * (n_row_gene // row_max_transpose) + ([n_row_gene % row_max_transpose] if n_row_gene % row_max_transpose > 0 else [])
        for i, n_rows in enumerate(n_rows_split):
            plates[-1].add_empty_row_below()
            matrix_subset = pair_matrix[i * row_max_transpose:i * row_max_transpose + n_rows]
            plates[-1].add_matrix(matrix_subset,
                                    pos_top_left=(0, 0))
            plates.append(Plate(n_rows=row_max, n_columns=column_max))
    elif n_col_gene > column_max:
        row_count = 0
        for i, n_cols in enumerate([column_max] * (n_col_gene // column_max) + ([n_col_gene % column_max] if n_col_gene % column_max > 0 else [])):
                matrix_subset = [row[i * column_max:i * column_max + n_cols] for row in pair_matrix]
                if row_count + n_row_gene > row_max:
                    plates.append(Plate(n_rows=row_max, n_columns=column_max))
                    row_count = 0
                plates[-1].add_matrix(matrix_subset,
                                      pos_top_left=(row_count, 0))
                row_count += n_row_gene
    else:
        for n_row_empty_spot, n_col_empty_spot in plates[-1].empty_rectangles_from_bottom_right():
            if  n_col_gene <= n_col_empty_spot and n_row_gene <= n_row_empty_spot:
                plates[-1].add_matrix(pair_matrix,
                                      pos_top_left=(plates[-1].n_rows - n_row_empty_spot,
                                                    plates[-1].n_columns - n_col_empty_spot))
                break
        else:
            raise UserWarning('could not add to plate')


def solve_plate_layout(genes, row_max=7, column_max=12):
    """
    """
    #plates = [Plate(n_rows=7, n_columns=12)]  # start with one empty plate
    plates = []
    unallocated = dict(sorted(genes.items(),
                              key=lambda x: (len(x[1][0]), len(x[1])),
                              reverse=True)).copy()


    sizes = (pd.DataFrame([[k, len(v), len(v[0])] for k, v in unallocated.items()],
                         columns=['tf_gene_name', 'n_rows', 'n_columns'])
                        .sort_values(['n_columns', 'n_rows'], ascending=False))
    # start with largest 4, pair with 3's
    matched = []
    for i, row4 in sizes.loc[(sizes['n_rows'] == 4) & (sizes['n_columns'] >= 24), :].iterrows():
        col_count = 0
        paired = []
        for j, row3 in (sizes.loc[(sizes['n_rows'] == 3) &
                                (sizes['n_columns'] >= 24) &
                                    ~sizes['tf_gene_name'].isin([three for four, threes in matched for three in threes]), :]
                            .iterrows()):
            if row3['n_columns'] + col_count < row4['n_columns']:
                col_count += row3['n_columns']
                paired.append(row3['tf_gene_name'])
        if len(paired) > 0:
            matched.append((row4['tf_gene_name'], paired))
    for gene_4rows, genes_3rows in matched:
        matrix4 = unallocated[gene_4rows]
        matrix3 = [[x for gene in genes_3rows for x in unallocated[gene][row_index]] for row_index in range(3)]
        for i in range(math.ceil(len(matrix4[0]) / column_max)):
            plates.append(Plate(n_rows=7, n_columns=12))
            plates[-1].add_matrix([row[i * column_max:(i + 1) * column_max] for row in matrix4],
                                  pos_top_left=(0, 0))
            plates[-1].add_matrix([row[i * column_max:(i + 1) * column_max] for row in matrix3],
                                  pos_top_left=(4, 0))
        for tf_gene_name in [gene_4rows] + genes_3rows:
            del unallocated[tf_gene_name]


    while len(unallocated) > 0:
        for tf_gene_name, pair_matrix in unallocated.items():
            if gene_fits_on_this_plate(plates[-1], pair_matrix):
                add_gene_to_plates(plates, pair_matrix)
                del unallocated[tf_gene_name]
                break
        else:  # if no gene fits, start a new empty plate
            plates.append(Plate(n_rows=7, n_columns=12))   
    return plates


gene_matrices = {}
for gene in df['ad_gene_symbol'].unique():
    gene_matrices[gene] = []
    for i, isoform in enumerate(list(df.loc[df['ad_gene_symbol'] == gene, 'ad_clone_acc'].unique()) + ['empty-AD']):
        gene_matrices[gene].append([])
        for partner in df.loc[df['ad_gene_symbol'] == gene, 'db_gene_symbol'].unique():
            gene_matrices[gene][i].append(isoform + '/' + partner)
plates = solve_plate_layout(gene_matrices)
for plate in plates:
    if plate.n_rows == 7:
        plate.add_empty_row_below()




##### merge some of the plates
n_col_used = {i: p.leftmost_unoccupied_column() for i, p in enumerate(plates) 
              if p.leftmost_unoccupied_column() is not None
              and p.row_is_empty(7)}
to_merge = []
count = 0
group = []
for plate_idx, n_col in n_col_used.items():
    if n_col + count <= 12:
        group.append(plate_idx)
        count += n_col
    else:
        if len(group) > 1:
            to_merge.append(group)
        count = 0
        group = []
else:
    if len(group) > 1:
        to_merge.append(group)


def merge_two_plates(plate_a, plate_b):
    grid_b = [[cell for cell in row if cell != plate_b.empty_name] for row in plate_b.grid]
    grid_b = [row for row in grid_b if len(row) > 0]
    plate_a.add_matrix(grid_b, pos_top_left=(0, plate_a.leftmost_unoccupied_column()))


for group in to_merge:
    for plate_idx in group[1:]:
        merge_two_plates(plates[group[0]], plates[plate_idx])
for plate_idx in sorted([i for group in to_merge for i in group[1:]], reverse=True):
    del plates[plate_idx]



# Lit-BM and RRS pairs
litbm_rrs = y2h.loc[y2h['category'].isin(['rrs_isoforms', 'lit_bm_isoforms']), ['ad_clone_acc', 'db_gene_symbol', 'category']].drop_duplicates().copy()
litbm_rrs['ad_clone_acc'] = litbm_rrs['ad_clone_acc'].apply(lambda x: x.split('|')[0] + '-' + x.split('|')[1].split('/')[0])
seed = 307272992
litbm_rrs['pair'] = litbm_rrs['ad_clone_acc'] + '/' + litbm_rrs['db_gene_symbol']
litbm_rrs['already_tested'] = litbm_rrs['pair'].isin((df['ad_clone_acc'] + '/' + df['db_gene_symbol']).values)
litbm_rrs_pairs = litbm_rrs.loc[~litbm_rrs['already_tested'], 'pair'].to_list()
np.random.seed(seed)
np.random.shuffle(litbm_rrs_pairs)
# TODO: add empty-AD for lit-bm and rrs
litbm_rrs_pairs_with_empty_AD = [x if b else 'empty-AD/' + x.split('/')[1] for x in litbm_rrs_pairs for b in (True, False)]
litbm_rrs_pairs = []
for x in litbm_rrs_pairs_with_empty_AD:
    if x not in litbm_rrs_pairs:
        litbm_rrs_pairs.append(x)

print(len(litbm_rrs_pairs), 'Lit-BM / RRS pairs + empty-AD controls')

# increment a separate empty-well code for each empty-well pattern of 7 by 12
def new_code_iter():
    return itertools.cycle(itertools.product([False, True], repeat=6))

empty_well_code_h = defaultdict(new_code_iter)
empty_well_code_v = itertools.cycle(itertools.product([False, True], repeat=6))
for plate in plates:
    if plate.row_is_empty(7):
        code = next(empty_well_code_h[frozenset({x for x in plate.empty_wells() if not x.startswith('H')})])
        plate.add_matrix([['control-' + str(i) for i in range(1, 7)] +
                          [litbm_rrs_pairs.pop() if b and len(litbm_rrs_pairs) > 0 else 'empty' for b in code]],
                         (7, 0))
    elif plate.column_is_empty(9) and plate.column_is_empty(10) and plate.column_is_empty(11):
        plate.add_matrix([[litbm_rrs_pairs.pop() if len(litbm_rrs_pairs) > 0 else 'empty'] for j in range(8)],
                         pos_top_left=(0, 9))
        plate.add_matrix([[litbm_rrs_pairs.pop() if b and len(litbm_rrs_pairs) > 0 else 'empty'] for b in next(empty_well_code_v)] +
                         [[litbm_rrs_pairs.pop() if len(litbm_rrs_pairs) > 0 else 'empty'] for __i in range(2)],
                         (0, 10))
        plate.add_matrix([['control-' + str(i)] for i in range(1, 7)] +
                         [[litbm_rrs_pairs.pop() if len(litbm_rrs_pairs) > 0 else 'empty'] for __i in range(2)],
                         (0, 11))

if len(litbm_rrs_pairs) > 0:
    print(len(litbm_rrs_pairs), 'Lit-BM / RRS pairs left')

if not all([p.n_rows == 8 and p.n_columns == 12 for p in plates]):
    raise UserWarning('Some wrong sized plates')

with open('../output/plate_arrangement.txt', 'w') as f:
    f.write('\n\n\n'.join([str(p) for p in plates]))


def plates_to_table(plates):
    data = []
    # reverse so we go from small gene matrices to large
    for i, plate in enumerate(reversed(plates)):
        for j, row in enumerate(plate.grid):
            for k, pair in enumerate(row):
                if pair == 'empty':
                    ad = np.nan
                    db = np.nan
                elif pair.startswith('control-'):
                    ad = pair
                    db = pair
                else:
                    ad, db = pair.split('/')
                data.append((i + 1,
                             string.ascii_uppercase[j] + str(k + 1).zfill(2),
                             ad,
                             orf_id_map.get(ad, np.nan),
                             db,
                             orf_id_map.get(db, np.nan)))
    df = pd.DataFrame(data, columns=['test_pla', 'test_pos', 'ad_clone_acc', 'ad_orf_id', 'db_gene_symbol', 'db_orf_id'])
    # ORF ID as Int with missing values (otherwise will be float by default)
    df['ad_orf_id'] = df['ad_orf_id'].astype('Int64')
    df['db_orf_id'] = df['db_orf_id'].astype('Int64')
    return df


plates_to_table(plates).to_csv('../output/plate_arrangement.tsv', sep='\t', index=False)

print(len(plates), 'plates')

n_empty_well_codes = len({frozenset(p.empty_wells()) for p in plates})
if n_empty_well_codes == len(plates):
    print('Success! Each plate has its own unique empty well code')
else:
    print(len(plates) - n_empty_well_codes, 'duplicates empty well codes')

795 Lit-BM / RRS pairs + empty-AD controls
34 Lit-BM / RRS pairs left
189 plates
Success! Each plate has its own unique empty well code


In [19]:
# check that the output file is OK
out = pd.read_csv('../output/plate_arrangement.tsv', sep='\t')
out['ad_orf_id'] = out['ad_orf_id'].astype('Int64')
out['db_orf_id'] = out['db_orf_id'].astype('Int64')
# check number of plates
print(out['test_pla'].nunique(), 'plates')
if out['test_pla'].isnull().any() or out['test_pos'].isnull().any():
    raise UserWarning('Unexpected nulls')
if out.duplicated(['test_pla', 'test_pos']).any():
    raise UserWarning('Unexpected duplicates')
if not (out['test_pla'].value_counts() == 96).all():
    raise UserWarning('Expected 96 wells per plate')
if not out['test_pos'].isin({l + str(i).zfill(2) for l in string.ascii_uppercase[:8] for i in range(1, 13)}).all():
    raise UserWarning('Invalid position')
# check pairs are same as input
if not (df['ad_orf_id'].astype(str) + '/' + df['db_orf_id'].astype(str)).isin(
            out['ad_orf_id'].astype(str) + '/' + out['db_orf_id'].astype(str)).all():
    raise UserWarning('Missing pairs!')
if (out['ad_clone_acc'].notnull() & out['ad_orf_id'].isnull() &
    (out['ad_clone_acc'] != 'empty-AD')
    & (out['ad_clone_acc'] != 'control-1')
    & (out['ad_clone_acc'] != 'control-2')
    & (out['ad_clone_acc'] != 'control-3')
    & (out['ad_clone_acc'] != 'control-4')
    & (out['ad_clone_acc'] != 'control-5')
    & (out['ad_clone_acc'] != 'control-6')).any():
    raise UserWarning('missing ORF ID for some ADs')
if (out['db_gene_symbol'].notnull() & out['db_orf_id'].isnull()
    & (out['ad_clone_acc'] != 'control-1')
    & (out['ad_clone_acc'] != 'control-2')
    & (out['ad_clone_acc'] != 'control-3')
    & (out['ad_clone_acc'] != 'control-4')
    & (out['ad_clone_acc'] != 'control-5')
    & (out['ad_clone_acc'] != 'control-6')).any():
    raise UserWarning('missing ORF ID for some DBs')

189 plates


In [33]:
out.isnull().sum()

test_pla             0
test_pos             0
ad_clone_acc      3676
ad_orf_id         7812
db_gene_symbol    3676
db_orf_id         4810
ad_gene_symbol    7812
dtype: int64

In [34]:
out.head()

Unnamed: 0,test_pla,test_pos,ad_clone_acc,ad_orf_id,db_gene_symbol,db_orf_id,ad_gene_symbol
0,1,A01,ZIC3-1,101257,AQP1,1231,ZIC3
1,1,A02,ZIC3-1,101257,CATSPER1,1769,ZIC3
2,1,A03,ZIC3-1,101257,GCC1,7306,ZIC3
3,1,A04,ZIC3-1,101257,HSD3B7,4278,ZIC3
4,1,A05,ZIC3-1,101257,KRTAP9-3,71609,ZIC3


In [21]:
plates[-1]

ZIC3-1/AQP1|ZIC3-1/CATSPER1|ZIC3-1/GCC1|ZIC3-1/HSD3B7|ZIC3-1/KRTAP9-3|ZIC3-1/MAPK9|ZIC3-1/NME7|ZIC3-1/PIN1|ZIC3-1/PITX1|ZIC3-1/PRKAA2|ZIC3-1/STK36|ZIC3-1/SUFU
ZIC3-2/AQP1|ZIC3-2/CATSPER1|ZIC3-2/GCC1|ZIC3-2/HSD3B7|ZIC3-2/KRTAP9-3|ZIC3-2/MAPK9|ZIC3-2/NME7|ZIC3-2/PIN1|ZIC3-2/PITX1|ZIC3-2/PRKAA2|ZIC3-2/STK36|ZIC3-2/SUFU
ZIC3-3/AQP1|ZIC3-3/CATSPER1|ZIC3-3/GCC1|ZIC3-3/HSD3B7|ZIC3-3/KRTAP9-3|ZIC3-3/MAPK9|ZIC3-3/NME7|ZIC3-3/PIN1|ZIC3-3/PITX1|ZIC3-3/PRKAA2|ZIC3-3/STK36|ZIC3-3/SUFU
empty-AD/AQP1|empty-AD/CATSPER1|empty-AD/GCC1|empty-AD/HSD3B7|empty-AD/KRTAP9-3|empty-AD/MAPK9|empty-AD/NME7|empty-AD/PIN1|empty-AD/PITX1|empty-AD/PRKAA2|empty-AD/STK36|empty-AD/SUFU
empty|empty|empty|empty|empty|empty|empty|empty|empty|empty|empty|empty
empty|empty|empty|empty|empty|empty|empty|empty|empty|empty|empty|empty
empty|empty|empty|empty|empty|empty|empty|empty|empty|empty|empty|empty
control-1|control-2|control-3|control-4|control-5|control-6|empty|empty|empty-AD/TRIM39|empty|empty|ZNF302-5/TRIM39

In [22]:
out['test_pos'].str.slice(1,).astype(int)

0         1
1         2
2         3
3         4
4         5
         ..
18139     8
18140     9
18141    10
18142    11
18143    12
Name: test_pos, Length: 18144, dtype: int64

In [23]:
# TODO: this is no longer quite correct after the merging of plates

# instead: for each gene, take first and last plate as the intervals and
# then merge the overlapping ones


# break points and ordering of plates
out['ad_gene_symbol'] = out['ad_clone_acc'].apply(lambda x: np.nan if pd.isnull(x) or x.startswith('control') or x.startswith('empty') else x.split('-')[0])

# removing the control rows
tf_genes_per_plate = []
for plate_idx in out['test_pla'].unique():
    iso_a1 = out.loc[(out['test_pla'] == plate_idx) &
                     (out['test_pos'] == 'A01'), 'ad_clone_acc'].values[0]
    iso_b1 = out.loc[(out['test_pla'] == plate_idx) &
                    (out['test_pos'] == 'B01'), 'ad_clone_acc'].values[0]
    iso_a2 = out.loc[(out['test_pla'] == plate_idx) &
                    (out['test_pos'] == 'A02'), 'ad_clone_acc'].values[0]
    is_horizontal_stacked = ((iso_a1.split('-')[0] == iso_b1.split('-')[0]) and
                             (iso_a1.split('-')[1] != iso_b1.split('-')[1]))
    is_vertical_stacked = ((iso_a1.split('-')[0] == iso_a2.split('-')[0]) and
                           (iso_a1.split('-')[1] != iso_a2.split('-')[1]))
    if is_horizontal_stacked and is_vertical_stacked:
        raise UserWarning('did not work. Plate: {}'.format(plate_idx))
    if is_horizontal_stacked: 
        gs = out.loc[(out['test_pla'] == plate_idx) &
                     (~out['test_pos'].str.startswith('H')),
                     'ad_gene_symbol'].dropna().unique()
        for g in gs:
            tf_genes_per_plate.append((g, plate_idx))
    elif is_vertical_stacked:  # vertical stacked
        gs = out.loc[(out['test_pla'] == plate_idx) &
                     (out['test_pos'].str.slice(1,).astype(int) <= 9),
                     'ad_gene_symbol'].dropna().unique()
        for g in gs:
            tf_genes_per_plate.append((g, plate_idx))
    else:
        raise UserWarning('this did not work. Plate: {}'.format(plate_idx))
tf_genes_per_plate = pd.DataFrame(data=tf_genes_per_plate, columns=['gene', 'plate'])
bounds = []
for g in tf_genes_per_plate['gene'].unique():
    b = (tf_genes_per_plate.loc[tf_genes_per_plate['gene'] == g, 'plate'].min(),
         tf_genes_per_plate.loc[tf_genes_per_plate['gene'] == g, 'plate'].max())
    if b[0] != b[1]:
        bounds.append(b)
# remove duplicates
bounds = [b for i, b in enumerate(bounds) if b not in bounds[:i]]
# merge overlapping
merged = []
bounds = list(sorted(bounds, key=lambda x: x[0]))
start, stop = bounds[0]
for i in range(1, len(bounds)):
    if bounds[i][0] > stop:
        merged.append((start, stop))
        start, stop = bounds[i]
    else:
        stop = max(stop, bounds[i][1])
else:
    merged.append((start, stop))

with open('../output/plates_sharing_same_gene.txt', 'w') as f:
    f.write('\n'.join(['{}-{}'.format(*x) for x in merged]))

In [24]:
with open('../output/empty_well_codes.txt', 'w') as f:
    f.write('plate\tempty wells\n')
    for i, plate in enumerate(plates):
        f.write(str(i + 1) + '\t' + ' '.join(sorted(plate.empty_wells())) + '\n')

In [25]:
n_empty_wells = sum(len(p.empty_wells()) for p in plates)
print('{} empty wells out of {} {:.0f}%'.format(n_empty_wells,
                len(plates) * 96,
                n_empty_wells / (len(plates) * 96) * 100))

3676 empty wells out of 18144 20%


In [26]:
from collections import Counter
Counter(len(p.empty_wells()) for p in plates)

Counter({6: 2,
         5: 11,
         4: 23,
         3: 23,
         2: 10,
         37: 3,
         24: 3,
         12: 4,
         54: 1,
         33: 1,
         21: 1,
         1: 1,
         20: 1,
         82: 1,
         31: 1,
         65: 1,
         41: 6,
         36: 2,
         30: 2,
         29: 6,
         28: 14,
         27: 15,
         18: 3,
         11: 3,
         10: 4,
         9: 2,
         13: 1,
         42: 5,
         26: 6,
         35: 1,
         39: 3,
         17: 7,
         25: 1,
         16: 3,
         34: 1,
         23: 5,
         15: 1,
         22: 1,
         40: 4,
         44: 2,
         78: 1,
         45: 1,
         68: 1,
         48: 1})