In [1]:
import pandas as pd

from data_loading import (load_valid_isoform_clones,
    load_isoform_and_paralog_y2h_data, 
    load_y1h_pdi_data, 
    load_m1h_activation_data)


In [2]:
# ---- Load inputs -----------------------------------------------------------
# Spreadsheet of prioritized BRCA isoforms; used below through its 'gene' and
# 'seq_cds' columns.
df = pd.read_excel('../data/internal/from_kaia/prioritized_BRCA_isos_to_clone.CDS_seqs.xlsx')
iso = load_valid_isoform_clones()
# Length is taken here, i.e. before any stop codon is stripped from iso['cds']
# further down in this cell.
iso['cds_len'] = iso['cds'].str.len()

y2h = load_isoform_and_paralog_y2h_data()
y1h = load_y1h_pdi_data()
m1h = load_m1h_activation_data()

# ---- PPI (Y2H) coverage ----------------------------------------------------
# Flag genes that have at least two clones, each with at least two
# 'tf_isoform_ppis' rows carrying a non-null Y2H_result.
# NOTE(review): rows are counted whatever the value of Y2H_result, so this may
# include negative results -- confirm that matches the column name.
is_tested_isoform_ppi = ((y2h['category'] == 'tf_isoform_ppis')
                         & y2h['Y2H_result'].notnull())
ppi_per_clone = (y2h.loc[is_tested_isoform_ppi]
                 .groupby(['ad_gene_symbol', 'ad_clone_acc'])
                 .size())
clones_gte_2ppi = ppi_per_clone[ppi_per_clone >= 2].groupby('ad_gene_symbol').size()
tf_with_ppi_data = set(clones_gte_2ppi[clones_gte_2ppi >= 2].index)
df['at_least_2_ppis_for_two_isoforms'] = df['gene'].isin(tf_with_ppi_data)

# ---- PDI (Y1H) coverage ----------------------------------------------------
# A row has a PDI when any column from the third one onward is truthy.
# NOTE(review): assumes the first two columns of y1h are identifiers rather
# than assay results -- confirm against load_y1h_pdi_data.
y1h['any_pdi'] = y1h.iloc[:, 2:].any(axis=1)
# Group once and reuse: genes with at least two Y1H rows, at least one of
# which has a PDI.
y1h_by_tf = y1h.groupby('tf')
tf_has_pdi = (y1h_by_tf.size() >= 2) & y1h_by_tf['any_pdi'].any()
tf_with_pdi_data = set(tf_has_pdi[tf_has_pdi].index)
df['at_least_1_pdi'] = df['gene'].isin(tf_with_pdi_data)

# ---- M1H coverage ----------------------------------------------------------
# Genes with at least two rows in the M1H table (size computed once).
m1h_rows_per_gene = m1h.groupby('gene').size()
tf_with_m1h_data = set(m1h_rows_per_gene[m1h_rows_per_gene >= 2].index)
df['at_least_2_m1h_data'] = df['gene'].isin(tf_with_m1h_data)

# Count cloned CDS that end in a stop codon. NOTE: this is not the last
# expression of the cell, so the counts are computed but never displayed.
iso['cds'].apply(lambda x: x[-3:] in {'TAG', 'TAA', 'TGA'}).value_counts()

def remove_stop_codon(s):
    """Return ``s`` without its final three characters if they are a stop codon.

    The trailing triplet is compared (case-sensitively) with TAG, TAA and TGA.
    When it matches, that one triplet is dropped; otherwise ``s`` is returned
    unchanged. At most one triplet is removed per call.
    """
    stop_codons = {'TAG', 'TAA', 'TGA'}
    return s[:-3] if s[-3:] in stop_codons else s


# Strip a trailing stop codon from the sequence column of both tables so the
# sequences can be compared like-for-like. The 'cds_len' columns are not
# recomputed here.
for frame, seq_col in ((iso, 'cds'), (df, 'seq_cds')):
    frame[seq_col] = frame[seq_col].apply(remove_stop_codon)

In [3]:
# Is the matching on CDS working? List the rows of the prioritized-isoform
# table whose sequence does not begin with an ATG start codon.
lacks_start_codon = ~df['seq_cds'].str.startswith('ATG')
df.loc[lacks_start_codon, :]

Unnamed: 0,index,gene,ctrl_id,primary_analysis,analysis_effect_size,analysis_padj,pick_status,iso_details,cloneable_cell_lines,best_cell_line,tpm_in_best_cell_line,ratio_in_best_cell_line,appris_status,removal_flag,all_sig_analyses,seq_cds,cds_len,at_least_2_ppis_for_two_isoforms,at_least_1_pdi,at_least_2_m1h_data
4,DMTF1:::GENCPID72720__NA__NA,DMTF1,,control,,,control,other,MCF7,MCF7,1.545120,0.07,,no flags,control,TGACAGCTGCTGCTCCTGCTTCTCCTGAACAGATTATTGTTCATGC...,530,False,True,True
9,DMTF1:::GENCPID67064__NA__NA,DMTF1,,control,,,control,other,"ZC, MDA, HCC",HCC,1.887490,0.13,,no flags,control,CCAGAACATTTGTTGAACACAAGTGATAATGTTACAGTGCAGTGTC...,537,False,True,True
19,ESR1:::GENCPID1248__NA__NA,ESR1,,control,,,control,none,none,HME,0.000000,,,no flags,control,CATAACGACTATATGTGTCCAGCCACCAACCAGTGCACCATTGATA...,323,False,False,False
48,WT1:::GENCPID1670__NA__NA,WT1,,control,,,control,principal,none,ZC,0.646284,0.21,PRINCIPAL:5,no flags,control,CTGGACTTCCTCTTGCTGCAGGACCCGGCTTCCACGTGTGTCCCGG...,1518,True,True,True
54,WT1:::GENCPID1669__NA__NA,WT1,,control,,,control,none,none,ZC,0.629601,0.21,ALTERNATIVE:2,no flags,control,CTGGACTTCCTCTTGCTGCAGGACCCGGCTTCCACGTGTGTCCCGG...,1509,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2232,ADNP2:::GENCPID70189__NA__NA,ADNP2,,HER2-enriched (no DE),-0.029588,0.116439,other,other,"MCF7, ZC",ZC,3.019590,0.26,,flag to remove: not in Sachi's high-conf. list,none,ATATTGGGCTTGACAGCTGCAAGGAGTTACTGAAGCCCGACAGGAA...,80,False,False,False
2248,HMG20A:::GENCPID68556__NA__NA,HMG20A,,Normal-like (no DE),0.113382,0.000037,other,other,MCF7,MCF7,1.192980,0.11,,flag to remove: not in Sachi's high-conf. list,none,AGCTCGGGAAGCAGAGCTCCGCCAGCTTCGCAAATCCAACATGGAG...,355,True,False,True
2250,GTF3A:::GENCPID1685__NA__NA,GTF3A,,Normal-like (no DE),0.000819,0.993808,other,principal,"MDA, HCC",MDA,25.458000,0.19,PRINCIPAL:1,no flags,none,CTGGATCCGCCGGCCGTGGTCGCCGAGTCGGTGTCGTCCTTGACCA...,1098,False,False,False
2269,TCF7L2:::GENCPID1349__NA__NA,TCF7L2,,bulk BRCA (no DE),0.000456,0.446407,other,other,MCF7,MCF7,1.064790,0.10,,no flags,none,ATAAAGAAACCTCTTAATGCATTCATGTTGTATATGAAGGAAATGA...,351,False,False,True


In [4]:
# Same check on the clone table: clones whose CDS does not begin with ATG.
clone_lacks_start_codon = ~iso['cds'].str.startswith('ATG')
iso.loc[clone_lacks_start_codon, :]

Unnamed: 0,gene,clone_acc,cds,aa_seq,num_aa,is_novel_isoform,clone_name,cds_len
57,EBF1,EBF1|2/3|06D12,AGGTTCTTCTTGAAATTTTTCCTCAAATGTAACCAAAATTGCCTAA...,RFFLKFFLKCNQNCLKNAGNPRDMRRFQVVVSTTVNVDGHVLAVSD...,408,True,EBF1-2,1224
58,EBF1,EBF1|3/3|06B12,AGGTTCTTCTTGAAATTTTTCCTCAAATGTAACCAAAATTGCCTAA...,RFFLKFFLKCNQNCLKNAGNPRDMRRFQVVVSTTVNVDGHVLAVSD...,407,False,EBF1-3,1221
478,TEAD3,TEAD3|1/2|10E03,ATAGCGTCCAACAGCTGGAACGCCAGCAGCAGCCCCGGGGAGGCCC...,IASNSWNASSSPGEAREDGPEGLDKGLDNDAEGVWSPDIEQSFQEA...,435,False,TEAD3-1,1305
479,TEAD3,TEAD3|2/2|10G04,ATAGCGTCCAACAGCTGGAACGCCAGCAGCAGCCCCGGGGAGGCCC...,IASNSWNASSSPGEAREDGPEGLDKGLDNDAEGVWSPDIEQSFQEA...,435,False,TEAD3-2,1305


In [5]:
# Use this. Adds TF gene-level PPI + PDI + M1H data.
# Keep rows whose identifier has 'TF...' as the text after its last underscore
# and whose gene passes all three assay-coverage flags.
has_tf_id = df['index'].apply(lambda idx: idx.rsplit('_', 1)[-1].startswith('TF'))
assay_flags = ['at_least_2_ppis_for_two_isoforms',
               'at_least_1_pdi',
               'at_least_2_m1h_data']
df.loc[has_tf_id & df[assay_flags].all(axis=1), :]

Unnamed: 0,index,gene,ctrl_id,primary_analysis,analysis_effect_size,analysis_padj,pick_status,iso_details,cloneable_cell_lines,best_cell_line,tpm_in_best_cell_line,ratio_in_best_cell_line,appris_status,removal_flag,all_sig_analyses,seq_cds,cds_len,at_least_2_ppis_for_two_isoforms,at_least_1_pdi,at_least_2_m1h_data
53,WT1:::GENCPID22245__NA__TF1P0PID361,WT1,,control,,,control,brca-dominant,MDA,MDA,1.01984,0.51,,no flags,control,ATGGAGAAGGGTTACAGCACGGTCACCTTCGACGGGACGCCCAGCT...,867,True,True,True
68,MAX:::GENCPID53245__PACBIOPID40380__TF1P0PID889,MAX,,survival (w/ cell DE),-1.62183,0.5370295,other,"principal, brca-dominant","HME, MCF7, ZC, MDA, HCC",MDA,19.774901,0.48,PRINCIPAL:4,no flags,TNBC (w/ cell DE),ATGAGCGATAACGATGACATCGAGGTGGAGAGCGACGAAGAGCAAC...,483,True,True,True
70,MAX:::GENCPID53242__PACBIOPID40378__TF1P0PID886,MAX,,survival (w/ cell DE),-0.929807,0.7950435,other,other,"HME, MCF7, ZC, MDA, HCC",ZC,16.054631,0.37,ALTERNATIVE:1,no flags,"Luminal A (no DE), Luminal B (no DE), TNBC (no...",ATGAGCGATAACGATGACATCGAGGTGGAGAGCGACGCTGACAAAC...,456,True,True,True
71,MAX:::GENCPID53243__PACBIOPID40379__TF1P0PID888,MAX,,survival (w/ cell DE),1.767979,0.5187464,other,other,none,HME,1.77281,0.04,,no flags,none,ATGAGCGATAACGATGACATCGAGGTGGAGAGCGACGAAGAGCAAC...,405,True,True,True
74,MAX:::NA__NA__TF1P0PID887,MAX,,survival (w/ cell DE),0.903486,0.8099524,other,other,ZC,ZC,2.16528,0.05,,no flags,none,ATGAGCGATAACGATGACATCGAGGTGGAGAGCGACGCTGACAAAC...,261,True,True,True
76,MAX:::GENCPID53247__NA__TF1P0PID890,MAX,,survival (w/ cell DE),1.115621,0.73448,other,other,"HME, MDA, HCC",HCC,3.25723,0.07,,no flags,none,ATGAGCGATAACGATGACATCGAGGTGGAGAGCGACGAAGAGCAAC...,291,True,True,True
143,TEAD2:::GENCPID28155__PACBIOPID20724__TF1P0PID452,TEAD2,,survival (w/ cell DE),3.012326,0.130611,other,"principal, brca-dominant","HME, MCF7, MDA, HCC",MDA,4.403513,0.32,PRINCIPAL:3,no flags,none,ATGGGGGAACCCCGGGCTGGGGCCGCCCTGGACGATGGCAGCGGCT...,1344,True,True,True
151,MLX:::NA__NA__TF1P0PID996,MLX,,survival (w/ cell DE),3.470959,0.09233241,key iso,none,ZC,ZC,5.48623,0.05,,no flags,survival (w/ cell DE),ATGACGGAGCCGGGCGCCTCTCCCGAGGACCCTTGGGTCAAGGTGG...,721,True,True,True
156,MLX:::GENCPID60033__PACBIOPID45359__TF1P0PID995,MLX,,survival (w/ cell DE),1.796551,0.4946944,other,other,none,ZC,3.883,0.04,ALTERNATIVE:1,no flags,none,ATGACGGAGCCGGGCGCCTCTCCCGAGGACCCTTGGGTCAAGGTGG...,735,True,True,True
157,MLX:::NA__NA__TF1P0PID994,MLX,,survival (w/ cell DE),1.056256,0.74199,other,other,ZC,ZC,8.55852,0.08,,no flags,none,ATGACGGAGCCGGGCGCCTCTCCCGAGGACCCTTGGGTCAAGGTGG...,642,True,True,True


In [6]:
# WT1:::GENCPID22245__NA__TF1P0PID361 not matching CDS length in Kaia's table: 867, in isoform table: 864
# NOTE(review): a 3-nt gap is what a stripped stop codon would give -- 'seq_cds'
# has its stop codon removed earlier while the spreadsheet's 'cds_len' column is
# left as loaded. Confirm by comparing `a` and `b` directly.
a = iso.loc[iso['clone_acc'] == 'WT1|6/6|10G06', 'cds'].iloc[0]
b = df.loc[df['index'] == 'WT1:::GENCPID22245__NA__TF1P0PID361', 'seq_cds'].iloc[0]
a

'ATGGAGAAGGGTTACAGCACGGTCACCTTCGACGGGACGCCCAGCTACGGTCACACGCCCTCGCACCATGCGGCGCAGTTCCCCAACCACTCATTCAAGCATGAGGATCCCATGGGCCAGCAGGGCTCGCTGGGTGAGCAGCAGTACTCGGTGCCGCCCCCGGTCTATGGCTGCCACACCCCCACCGACAGCTGCACCGGCAGCCAGGCTTTGCTGCTGAGGACGCCCTACAGCAGTGACAATTTATACCAAATGACATCCCAGCTTGAATGCATGACCTGGAATCAGATGAACTTAGGAGCCACCTTAAAGGGCCACAGCACAGGGTACGAGAGCGATAACCACACAACGCCCATCCTCTGCGGAGCCCAATACAGAATACACACGCACGGTGTCTTCAGAGGCATTCAGGATGTGCGGCGTGTGCCTGGAGTAGCCCCGACTCTTGTACGGTCGGCATCTGAGACCAGTGAGAAACGCCCCTTCATGTGTGCTTACCCAGGCTGCAATAAGAGATATTTTAAGCTGTCCCACTTACAGATGCACAGCAGGAAGCACACTGGTGAGAAACCATACCAGTGTGACTTCAAGGACTGTGAACGAAGGTTTTCTCGTTCAGACCAGCTCAAAAGACACCAAAGGAGACATACAGGTGTGAAACCATTCCAGTGTAAAACTTGTCAGCGAAAGTTCTCCCGGTCCGACCACCTGAAGACCCACACCAGGACTCATACAGGTAAAACAAGTGAAAAGCCCTTCAGCTGTCGGTGGCCAAGTTGTCAGAAAAAGTTTGCCCGGTCAGATGAATTAGTCCGCCATCACAACATGCATCAGAGAAACATGACCAAACTCCAGCTGGCGCTT'

In [7]:
# All clones of NFIX in the clone table.
iso.loc[iso['gene'].eq('NFIX')]

Unnamed: 0,gene,clone_acc,cds,aa_seq,num_aa,is_novel_isoform,clone_name,cds_len
283,NFIX,NFIX|1/4|08H03,ATGTACTCCCCGTACTGCCTCACCCAGGATGAGTTCCACCCGTTCA...,MYSPYCLTQDEFHPFIEALLPHVRAFSYTWFNLQARKRKYFKKHEK...,441,False,NFIX-1,1358
284,NFIX,NFIX|2/4|08G06,ATGGATGAGTTCCACCCGTTCATCGAGGCACTGCTGCCTCACGTCC...,MDEFHPFIEALLPHVRAFSYTWFNLQARKRKYFKKHEKRMSKDEER...,433,False,NFIX-2,1334
285,NFIX,NFIX|3/4|08F05,ATGTACTCCCCGTACTGCCTCACCCAGGATGAGTTCCACCCGTTCA...,MYSPYCLTQDEFHPFIEALLPHVRAFSYTWFNLQARKRKYFKKHEK...,400,True,NFIX-3,1235
286,NFIX,NFIX|4/4|08D05,ATGTACTCCCCGTACTGCCTCACCCAGGATGAGTTCCACCCGTTCA...,MYSPYCLTQDEFHPFIEALLPHVRAFSYTWFNLQARKRKYFKKHEK...,353,True,NFIX-4,1059


## candidate genes for cancer vignette

- MAX (lots of isoforms)
- MLX (lots of isoforms)
- NFIX?
- PPARG-1
    - data looks good...
- TEAD2-1
    - data looks good 
- WT1-6
    - we have a lot of data (already known splicing effects in cancer)
- ZBTB18
    - data looks good

In [8]:
# How many sequences still end in a stop codon after the single-codon strip?
df['seq_cds'].str[-3:].isin(['TAG', 'TAA', 'TGA']).value_counts()

False    2328
True        1
Name: seq_cds, dtype: int64

In [9]:
# Match prioritized isoforms to cloned isoforms by exact CDS sequence
# (inner join, so only sequences present in both tables are kept).
df.merge(iso, how='inner', left_on='seq_cds', right_on='cds')

Unnamed: 0,index,gene_x,ctrl_id,primary_analysis,analysis_effect_size,analysis_padj,pick_status,iso_details,cloneable_cell_lines,best_cell_line,tpm_in_best_cell_line,ratio_in_best_cell_line,appris_status,removal_flag,all_sig_analyses,seq_cds,cds_len_x,at_least_2_ppis_for_two_isoforms,at_least_1_pdi,at_least_2_m1h_data,gene_y,clone_acc,cds,aa_seq,num_aa,is_novel_isoform,clone_name,cds_len_y
0,DMTF1:::NA__NA__TF1P0PID985,DMTF1,,control,,,control,other,"MCF7, MDA",MCF7,1.437810,0.06,,no flags,control,ATGACTGCAACCACAGAAGTAGCAGATGATGAGGTTACTGAGGGGA...,1983,False,True,True,DMTF1,DMTF1|3/5|05B01,ATGACTGCAACCACAGAAGTAGCAGATGATGAGGTTACTGAGGGGA...,MTATTEVADDEVTEGTVTQIQILQNEQLDEISPLGNEEVSAVSQAW...,661,True,DMTF1-3,1983
1,KLF6:::GENCPID19594__PACBIOPID14061__TF1P0PID303,KLF6,KLF6:::KLF6,control,,,control,"principal, brca-dominant, control iso","HME, MCF7, ZC, MDA, HCC",MDA,96.764500,0.66,PRINCIPAL:1,no flags,"control, Luminal A (no DE), TNBC (no DE), bulk...",ATGGACGTGCTCCCCATGTGCAGCATCTTCCAGGAGCTCCAGATCG...,852,False,False,False,KLF6,KLF6|1/2|11D07,ATGGACGTGCTCCCCATGTGCAGCATCTTCCAGGAGCTCCAGATCG...,MDVLPMCSIFQELQIVHETGYFSALPSLEEYWQQTCLELERYLQSE...,283,False,KLF6-1,852
2,TP53:::NA__NA__TF1P0PID219,TP53,,control,,,control,other,"HME, MCF7, ZC, MDA, HCC",MDA,22.154800,0.17,,no flags,control,ATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCA...,1062,True,False,False,TP53,TP53|2/2|02F03,ATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCA...,MDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPRVAPAPAAPTPAAP...,354,False,TP53-2,1062
3,TP53:::NA__NA__TF1P0PID334,TP53,,control,,,control,other,"HME, MCF7, ZC, MDA, HCC",MDA,16.583800,0.13,,no flags,control,ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTC...,1179,True,False,False,TP53,TP53|1/2|11G10,ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTC...,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,393,False,TP53-1,1179
4,MAX:::GENCPID53245__PACBIOPID40380__TF1P0PID889,MAX,,survival (w/ cell DE),-1.621830,5.370295e-01,other,"principal, brca-dominant","HME, MCF7, ZC, MDA, HCC",MDA,19.774901,0.48,PRINCIPAL:4,no flags,TNBC (w/ cell DE),ATGAGCGATAACGATGACATCGAGGTGGAGAGCGACGAAGAGCAAC...,483,True,True,True,MAX,MAX|1/6|08G02,ATGAGCGATAACGATGACATCGAGGTGGAGAGCGACGAAGAGCAAC...,MSDNDDIEVESDEEQPRFQSAADKRAHHNALERKRRDHIKDSFHSL...,160,False,MAX-1,480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,ZNF511:::GENCPID48453__NA__TF1P0PID827,ZNF511,,bulk BRCA (no DE),-0.000973,7.807209e-01,other,other,none,ZC,3.601710,0.04,ALTERNATIVE:2,no flags,none,ATGCAGTTGCCCCCCGCGCTGTGCGCCCGCCTCGCTGCGGGGCCCG...,789,False,False,True,ZNF511,ZNF511|1/3|02B04,ATGCAGTTGCCCCCCGCGCTGTGCGCCCGCCTCGCTGCGGGGCCCG...,MQLPPALCARLAAGPGAAEPLPVERDPAAGAAPFRFVARPVRFPRE...,262,False,ZNF511-1,786
134,PLAGL1:::GENCPID13783__NA__TF1P0PID179,PLAGL1,,bulk BRCA (no DE),-0.020090,1.592444e-01,other,"principal, brca-dominant",HME,HME,2.661955,0.56,PRINCIPAL:1,no flags,none,ATGGCCACGTTCCCCTGCCAGTTATGTGGCAAGACGTTCCTCACCC...,1392,False,False,True,PLAGL1,PLAGL1|1/2|03H08,ATGGCCACGTTCCCCTGCCAGTTATGTGGCAAGACGTTCCTCACCC...,MATFPCQLCGKTFLTLEKFTIHNYSHSRERPYKCVQPDCGKAFVSR...,463,False,PLAGL1-1,1389
135,ZNF692:::NA__NA__TF1P0PID175,ZNF692,,bulk BRCA (no DE),-0.000698,8.743089e-01,other,other,none,ZC,1.513540,0.03,,no flags,none,ATGGCTTCCTCCCCGGCGGTGGACGTGTCCTGCAGGCGGCGGGAGA...,1554,False,False,True,ZNF692,ZNF692|1/3|06D07,ATGGCTTCCTCCCCGGCGGTGGACGTGTCCTGCAGGCGGCGGGAGA...,MASSPAVDVSCRRREKRRQLDARRSKCRIRLGGHMEQWCLLKERLG...,518,True,ZNF692-1,1554
136,ZNF692:::NA__NA__TF1P0PID174,ZNF692,,bulk BRCA (no DE),0.004773,6.312533e-01,other,other,HME,HME,2.244680,0.10,,no flags,none,ATGGCTTCCTCCCCGGCGGTGGACGTGTCCTGCAGGCGGCGGGAGA...,1464,False,False,True,ZNF692,ZNF692|2/3|06B09,ATGGCTTCCTCCCCGGCGGTGGACGTGTCCTGCAGGCGGCGGGAGA...,MASSPAVDVSCRRREKRRQLDARRSKCRIRLGGHMEQWCLLKERLG...,488,True,ZNF692-2,1464


In [10]:
# Number of distinct genes in the clone table.
iso['gene'].nunique()

310

In [11]:
# Cancer Gene Census table.
# NOTE: this rebinds `df`, which until this cell held the prioritized-isoform
# table; the earlier cells that use `df` cannot be re-run after this one
# without first re-running the loading cell.
df = pd.read_csv('../data/external/cancer_gene_census.csv')
cgc = set(df['Gene Symbol'].unique())
# Census genes that also appear in the clone table.
cgc & set(iso['gene'].unique())

{'CREB1',
 'CTCF',
 'DDIT3',
 'EBF1',
 'ELK4',
 'ETV4',
 'ETV6',
 'FLI1',
 'FOXO3',
 'GATA1',
 'GATA2',
 'GATA3',
 'GLI1',
 'HEY1',
 'HIF1A',
 'HMGA1',
 'HOXA9',
 'KLF4',
 'KLF6',
 'LEF1',
 'MAX',
 'MITF',
 'NFE2L2',
 'PATZ1',
 'PAX5',
 'PAX7',
 'PAX8',
 'PBX1',
 'POU5F1',
 'PPARG',
 'PRDM16',
 'PRRX1',
 'RARA',
 'REL',
 'RUNX1',
 'SIX1',
 'SMAD3',
 'SMAD4',
 'STAT3',
 'TCF12',
 'TCF7L2',
 'TP53',
 'TP63',
 'WT1',
 'ZBTB16'}

In [12]:
# Restrict the census table to genes present in the clone table and to the
# descriptive columns of interest. An explicit copy is taken because new
# columns are assigned to this frame in the following cells; the copy keeps
# those assignments independent of the full census frame.
census_columns = ['Gene Symbol',
                  'Tumour Types(Somatic)',
                  'Tissue Type',
                  'Molecular Genetics', 'Role in Cancer', 'Mutation Types']
gene_is_cloned = df['Gene Symbol'].isin(iso['gene'].unique())
df = df.loc[gene_is_cloned, census_columns].copy()

In [13]:
# (rows, columns) of the census subset: one row per retained census entry.
df.shape

(45, 6)

In [14]:
# Number of rows in the clone table for each gene, looked up by gene symbol.
clones_per_gene = iso.groupby('gene').size()
df['n_cloned_isoforms'] = df['Gene Symbol'].map(clones_per_gene)

In [15]:
# True when at least one clone of the gene is flagged as a novel isoform.
gene_has_novel_isoform = iso.groupby('gene')['is_novel_isoform'].any()
df['has_novel_isoform'] = df['Gene Symbol'].map(gene_has_novel_isoform)

In [16]:
# Number of census genes with two or more cloned isoforms.
df['n_cloned_isoforms'].ge(2).sum()

36