# Setup
Import libraries and load data

In [49]:
import pandas as pd
import ipywidgets as widgets
import IPython as ipy

Load data

In [50]:
donors = pd.read_csv("data/donors_clean.csv")
snps = pd.read_csv("data/snp_clean.csv")
clinvar = pd.read_csv("data/clinvar_variant_summary.csv", sep = "\t")
genes = pd.read_csv("data/genes.csv")
icgc_top_genes = pd.read_csv("data/icgc_top_genes.csv")

# Data Summary


In [51]:
print(pd.DataFrame.describe(donors))
print(donors.head)

       icgc_donor_id donor_sex icgc_specimen_id   specimen_type
count            364       364              364             363
unique            89         2              364               7
top          DO32835      male          SP75993  primary tumour
freq              12       187                1             193
<bound method NDFrame.head of     icgc_donor_id donor_sex icgc_specimen_id           specimen_type
0         DO32817      male          SP69519          primary tumour
1         DO32817      male          SP69517        peripheral blood
2         DO32817      male          SP77315          tumour (other)
3         DO32819      male          SP69521        peripheral blood
4         DO32819      male          SP77323          tumour (other)
..            ...       ...              ...                     ...
359       DO34905      male          SP77203          primary tumour
360       DO34905      male          SP77211          primary tumour
361       DO34961      male  

In [52]:
genes.head

<bound method NDFrame.head of      hgnc_symbol  ensembl_gene_id
0            CFH  ENSG00000000971
1         SEMA3F  ENSG00000001617
2           CFTR  ENSG00000001626
3         ANKIB1  ENSG00000001629
4            BAD  ENSG00000002330
...          ...              ...
7688     HMGB2P1  ENSG00000267736
7689         NaN  ENSG00000267748
7690         NaN  ENSG00000267765
7691         NaN  ENSG00000267784
7692      SMIM22  ENSG00000267795

[7693 rows x 2 columns]>

In [53]:
icgc_top_genes.head

<bound method NDFrame.head of   hgnc_symbol  ensembl_gene_id mutated_from_allele mutated_to_allele  \
0        KRAS  ENSG00000133703                   C                 T   
1        KRAS  ENSG00000133703                   C                 A   
2        KRAS  ENSG00000133703                   C                 G   
3        TP53  ENSG00000141510                   C                 T   
4        TP53  ENSG00000141510                   C                 T   
5        TP53  ENSG00000141510                   T                 C   
6        TP53  ENSG00000141510                   C                 A   
7        TP53  ENSG00000141510                   T                 C   
8        KRAS  ENSG00000133703                   T                 G   
9        TP53  ENSG00000141510                   C                 T   

              mutation_type         consequence_type  chromosome     start  \
0  single base substitution                 missense          12  25398284   
1  single base substi

Find the most common genes in the snps table



In [54]:
snps_with_genes = pd.merge(snps, genes, how = 'left', left_on = 'gene_affected', right_on = 'ensembl_gene_id')
snps_with_genes.groupby(by = 'hgnc_symbol').size().sort_values(ascending = False)

hgnc_symbol
TP53         1000
TTN-AS1       351
KRAS          325
CDKN2A        315
TTN           247
             ... 
RN7SL751P       1
RN7SL731P       1
RN7SL716P       1
RN7SL650P       1
A1BG            1
Length: 6743, dtype: int64

# Create widgets to find hits in the snps table

In [55]:
dropdown_gene = widgets.Dropdown(
    options=['KRAS', 'TP53', 'TTN-AS1', 'CDKN2A', 'TTN'],
    value='KRAS',
    description='Gene:',
    disabled=False,
)
output_gene = widgets.Output()

def filter_by_gene(change):
    output_gene.clear_output()
    with output_gene:
        filtered_data = snps_with_genes[snps_with_genes.hgnc_symbol == change.new]
        display(str(len(filtered_data.index)) + ' Total variants')
        display(str(filtered_data.icgc_donor_id.nunique()) + ' Unique Donors')
        display(filtered_data)
        
dropdown_gene.observe(filter_by_gene, names = 'value')

display(dropdown_gene)

Dropdown(description='Gene:', options=('KRAS', 'TP53', 'TTN-AS1', 'CDKN2A', 'TTN'), value='KRAS')

In [56]:
display(output_gene)

Output()