# Exome Analysis and Primer Design
##### Author: Kim Roggenbuck

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd, sys
import seaborn as sns

In [2]:
#Read the data into a dataframe
df = pd.read_csv("Variants.csv", delimiter=';')
print('There are', len(df.index), 'rows to start with.')

There are 67654 rows to start with.


In [3]:
# STEP 1: filter out genetic changes with a frequency > 1%

df = df.dropna(subset=['DBSNP_FREQ_ALT'])
df2 = df[df['DBSNP_FREQ_ALT'].str.contains(pat ='0,01000')]
df3 = df[df['DBSNP_FREQ_ALT'].str.contains(pat ='0,00')]

df = df2.append(df3)
print('There are', len(df.index), 'rows left at this step.')
df.head()
#df.to_csv("df.csv")

There are 2789 rows left at this step.


Unnamed: 0,TYPE,CHROMOSOME,POS_START,POS_END,LENGTH,NUCL_REF,NUCL_ALT,ZYGOSITY,GENOTYPE,FREQ_ALT_DX32,REFGENE_GENE,REFGENE_TRANSCRIPT,REFGENE_LOC,REFGENE_CPOS,REFGENE_PPOS,DBSNP_NAME,DBSNP_FREQ_ALT,DBNSFP_ENSEMBL_PROTEIN
0,V,chr1,762273,762273,1,G,A,heterozygous ref-alt1,G/A,857142857.0,LINC00115,NR_024321,3'UTR,"NR_024321:c,*630C>T",,rs3115849,27,
1,V,chr1,866422,866422,1,C,T,heterozygous ref-alt1,C/T,,SAMD11,NM_152486,exonic,"NM_152486:c,258C>T",86,rs139210662,53,"SAMD11:p,(S10F);SAMD11:p,(S10F);SAMD11:p,(S10F)"
3,V,chr1,879276,879276,1,A,G,heterozygous ref-alt1,A/G,,SAMD11,NM_152486,intronic,"NM_152486:c,1801-12A>G",,rs115454328,81,
25,V,chr1,906302,906302,1,C,T,heterozygous ref-alt1,C/T,14705882.0,PLEKHN1,NM_032129;NM_001160184,exonic;exonic,"NM_032129:c,528C>T;NM_001160184:c,528C>T",176;176,rs41300090,39,
32,V,chr1,910903,910903,1,T,C,heterozygous ref-alt1,T/C,1.0,C1orf170,NR_027693,3'UTR,"NR_027693:c,*2716A>G",,rs4970429,68,


In [4]:
# STEP 1.5: filter out non autosomal chromosomes (meaning X and Y)
# normall I'd have done this as the first or second step in order to disregard as many non fitting variants as possible, 
# but I figured since our numbers after each step are compared I would leave it until the end.
df = df[df['CHROMOSOME'] != 'chrX']
df = df[df['CHROMOSOME'] != 'chrY']
print('There are', len(df.index), 'rows left at this step.')
df.to_csv("ddd.csv")

There are 2727 rows left at this step.


In [5]:
# STEP 2: filter out rows with wrong location
df2 = df[(df['REFGENE_LOC'].str.contains(pat ='exonic'))]
df4 = df[df['REFGENE_LOC'].str.contains(r'(?:\s|^)intronic-splice(?:\s|$)')]
df5 = df[df['REFGENE_LOC'].str.contains(r'(?:\s|^)exonic-splice(?:\s|$)')]
df = df2.append(df4)
df = df.append(df5)

print('There are', len(df.index), 'rows left at this step.')

There are 1479 rows left at this step.


In [6]:
# STEP 3: filter out rows that do not affect proteins (frameshift (= I and D, meaning insertion and deletion) 
# or don't create new protein (= where last column not empty))
df = df[(df['TYPE'] == 'I') | (df['TYPE'] == 'D') | ((df['TYPE'] == 'V') & (df['DBNSFP_ENSEMBL_PROTEIN'].notna()))]
print('There are', len(df.index), 'rows left at this step.')

There are 854 rows left at this step.


In [7]:
# STEP 4: homozygous vs heterozygous
dfHetero = df[df['ZYGOSITY'] != 'homozygous alt1']
dfHomo = df[df['ZYGOSITY'] == 'homozygous alt1']

dfHetero = dfHetero[dfHetero.duplicated(subset=['REFGENE_GENE'], keep=False)]
df = dfHomo.append(dfHetero)
print('There are', len(df.index), 'rows left at this step.')

# kick out same ones:
df = df.drop_duplicates()
print('There are', len(df.index), 'rows left at this step.')

df.to_csv("HomoHeteroNew2.csv")

There are 181 rows left at this step.
There are 167 rows left at this step.


While the previous steps concerned the variant, the next steps involve the gene. These were done manually.