In [1]:
import csv
import requests
import re
import time
import itertools
import pandas as pd

In [57]:
#Gets a protein sequence in FASTA format given the ensembl transcript ID
def get_protein_sequence(canonical_transcript):
    server = "http://grch37.rest.ensembl.org"
    ext = "/sequence/id/" + canonical_transcript + "?content-type=text/x-fasta;type=protein"
    r = requests.get(server + ext, headers={"Content-Type": "text/x-fasta"})
    time.sleep(3)
    
    if not r.ok:
        try:
            r.raise_for_status()
            return "error"
        except requests.exceptions.HTTPError: #I should catch the error too or print, to find out which specific genes
            pass
    seqlist = r.text.split("\n", 1)
    
    if len(seqlist) == 2: #A lot of these transcripts are noncoding or introns, don't know why
        sequence = seqlist[1]
        sequence = sequence.replace("\n", "")
        return sequence
    #print("Sequence found")

#Adapted from https://github.com/xjenny2/phospho-programs/blob/master/ensembl.py

In [20]:
tsv_file = open("filtered_mapRef.tsv")
mart_tsv = csv.reader(tsv_file, delimiter="\t", quotechar='"')
df = pd.DataFrame(mart_tsv)
dfRef = df[df[6].astype(bool)] #filters tsv file such that it only considers all with a RefSeq Transcript

In [21]:
dfRef.shape

(15543, 7)

In [2]:
tsv_file = open("batchJob.txt")
mart_tsv = csv.reader(tsv_file, delimiter="\t", quotechar='"')

In [3]:
df = pd.DataFrame(mart_tsv)

In [4]:
df.shape

(27867, 106)

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,Input Variant,Errors,Chromosomal Variant,Coding Variant(s),,,,,,,...,,,,,,,,,,
1,NC_000001.11:g.943995C>T,,NC_000001.11:g.943995C>T,NM_015658.3:c.*699G>A,NM_152486.2:c.1888C>T,,,,,,...,,,,,,,,,,
2,NC_000001.11:g.944051T>G,,NC_000001.11:g.944051T>G,NM_015658.3:c.*643A>C,NM_152486.2:c.1944T>G,,,,,,...,,,,,,,,,,
3,NC_000001.11:g.1014143C>T,,NC_000001.11:g.1014143C>T,NM_005101.3:c.163C>T,,,,,,,...,,,,,,,,,,
4,NC_000001.11:g.1014359G>T,,NC_000001.11:g.1014359G>T,NM_005101.3:c.379G>T,,,,,,,...,,,,,,,,,,


In [6]:
df2 = df[[2,3]]

In [7]:
df2.rename(columns=df.iloc[0])

Unnamed: 0,Chromosomal Variant,Coding Variant(s)
0,Chromosomal Variant,Coding Variant(s)
1,NC_000001.11:g.943995C>T,NM_015658.3:c.*699G>A
2,NC_000001.11:g.944051T>G,NM_015658.3:c.*643A>C
3,NC_000001.11:g.1014143C>T,NM_005101.3:c.163C>T
4,NC_000001.11:g.1014359G>T,NM_005101.3:c.379G>T
5,NC_000001.11:g.1041582C>T,XM_006710635.2:c.1057C>T
6,NC_000001.11:g.1042053C>G,XM_006710635.2:c.1275C>G
7,NC_000001.11:g.1045487C>T,XM_006710635.2:c.2500C>T
8,NC_000001.11:g.1049672C>T,XM_006710635.2:c.4621C>T
9,NC_000001.11:g.1232517G>A,NM_080605.3:c.239G>A


In [11]:
df2 = df2[df2[3].str.contains("NM")]

In [13]:
df2 = df2[~df2[3].str.contains("-")]

In [16]:
df2 = df2[~df2[3].str.contains("\*")]

In [17]:
df2.shape

(8429, 2)

In [25]:
df2 = df2.rename(columns=df.iloc[0])

In [26]:
df2['ENST'] = ""

In [28]:
for index, row in df2.iterrows():
    NM = row[1].split('.')[0] #obtains the initial transcript
    for x, y in dfRef.iterrows():
        Match = y[6].split('.')[0] 
        if NM == Match: #checks a map for a transcript
            df2.loc[index, 'ENST'] = y[4]

In [30]:
nan_value = float("NaN")
df2.replace("", nan_value, inplace = True)
df2.dropna(subset = ['ENST'], inplace=True)

In [31]:
df2.shape

(4686, 4)

In [38]:
df2['ENST'] = df2['ENST'].str.replace("NaN ","")

In [39]:
df2

Unnamed: 0,Gene end (bp),Gene stable ID,Unnamed: 3,ENST
3,NC_000001.11:g.1014143C>T,NM_005101.3:c.163C>T,,ENST00000649529
4,NC_000001.11:g.1014359G>T,NM_005101.3:c.379G>T,,ENST00000649529
9,NC_000001.11:g.1232517G>A,NM_080605.3:c.239G>A,,ENST00000379198
10,NC_000001.11:g.1232959C>G,NM_080605.3:c.681C>G,,ENST00000379198
11,NC_000001.11:g.1233041C>T,NM_080605.3:c.763C>T,,ENST00000379198
15,NC_000001.11:g.1518966C>T,NM_001170535.1:c.490C>T,,ENST00000378756
16,NC_000001.11:g.1520284C>T,NM_001170535.1:c.658C>T,,ENST00000378756
17,NC_000001.11:g.1535392G>C,NM_001114748.1:c.489C>G,,ENST00000378733
19,NC_000001.11:g.2229502C>T,NM_003036.3:c.736C>T,,ENST00000378536
20,NC_000001.11:g.2306202C>A,NM_003036.3:c.1950C>A,,ENST00000378536


In [43]:
df2.dtypes

Gene end (bp)      object
Gene stable ID     object
                  float64
ENST               object
dtype: object

In [48]:
dfFinal = df2[['Gene end (bp)','Gene stable ID', 'ENST']]

In [53]:
dfFinal = dfFinal.rename(columns = {'Gene end (bp)':'Chromosomal Variant', 'Gene stable ID':'Coding Variant', 'ENST':'ENST'})

In [54]:
dfFinal

Unnamed: 0,Chromosomal Variant,Coding Variant,ENST
3,NC_000001.11:g.1014143C>T,NM_005101.3:c.163C>T,ENST00000649529
4,NC_000001.11:g.1014359G>T,NM_005101.3:c.379G>T,ENST00000649529
9,NC_000001.11:g.1232517G>A,NM_080605.3:c.239G>A,ENST00000379198
10,NC_000001.11:g.1232959C>G,NM_080605.3:c.681C>G,ENST00000379198
11,NC_000001.11:g.1233041C>T,NM_080605.3:c.763C>T,ENST00000379198
15,NC_000001.11:g.1518966C>T,NM_001170535.1:c.490C>T,ENST00000378756
16,NC_000001.11:g.1520284C>T,NM_001170535.1:c.658C>T,ENST00000378756
17,NC_000001.11:g.1535392G>C,NM_001114748.1:c.489C>G,ENST00000378733
19,NC_000001.11:g.2229502C>T,NM_003036.3:c.736C>T,ENST00000378536
20,NC_000001.11:g.2306202C>A,NM_003036.3:c.1950C>A,ENST00000378536


In [55]:
dfFinal['ProteinSeq'] = ""

In [56]:
dfFinal.shape

(4686, 4)

In [58]:
for index, row in dfFinal.iterrows():
    dfFinal.loc[index, 'ProteinSeq'] = get_protein_sequence(row[2])

In [59]:
dfFinal.dropna(subset = ['ProteinSeq'])

Unnamed: 0,Chromosomal Variant,Coding Variant,ENST,ProteinSeq
9,NC_000001.11:g.1232517G>A,NM_080605.3:c.239G>A,ENST00000379198,MKLLRRAWRRRAALGLGTLALCGAALLYLARCAAEPGDPRAMSGRS...
10,NC_000001.11:g.1232959C>G,NM_080605.3:c.681C>G,ENST00000379198,MKLLRRAWRRRAALGLGTLALCGAALLYLARCAAEPGDPRAMSGRS...
11,NC_000001.11:g.1233041C>T,NM_080605.3:c.763C>T,ENST00000379198,MKLLRRAWRRRAALGLGTLALCGAALLYLARCAAEPGDPRAMSGRS...
15,NC_000001.11:g.1518966C>T,NM_001170535.1:c.490C>T,ENST00000378756,MSWLFGINKGPKGEGAGPPPPLPPAQPGAEGGGDRGLGDRPAPKDK...
16,NC_000001.11:g.1520284C>T,NM_001170535.1:c.658C>T,ENST00000378756,MSWLFGINKGPKGEGAGPPPPLPPAQPGAEGGGDRGLGDRPAPKDK...
17,NC_000001.11:g.1535392G>C,NM_001114748.1:c.489C>G,ENST00000378733,MSMSANTMIFMILGASVVMAIACLMDMNALLDRFHNYILPHLRGED...
19,NC_000001.11:g.2229502C>T,NM_003036.3:c.736C>T,ENST00000378536,MEAAAGGRGCFQPHPGLQKTLEQFHLSSMSSLGGPAAFSARWAQEA...
20,NC_000001.11:g.2306202C>A,NM_003036.3:c.1950C>A,ENST00000378536,MEAAAGGRGCFQPHPGLQKTLEQFHLSSMSSLGGPAAFSARWAQEA...
21,NC_000001.11:g.2406561C>A,NM_002617.3:c.835G>T,ENST00000447513,MAPAAASPPEVIRAAQKDEYYRGGLRSAAGGALHSLAGARKWLEWR...
22,NC_000001.11:g.2406606C>A,NM_002617.3:c.790G>T,ENST00000447513,MAPAAASPPEVIRAAQKDEYYRGGLRSAAGGALHSLAGARKWLEWR...


In [60]:
dfFinal.to_csv('temp_final.csv')

In [61]:
import hgvs.parser
import hgvs.dataproviders.uta
import hgvs.assemblymapper
hdp = hgvs.dataproviders.uta.connect()
am = hgvs.assemblymapper.AssemblyMapper(hdp,assembly_name = 'GRCh38', alt_aln_method = "splign", replace_reference = True)

In [62]:
dfFinal['Protein Variant'] = ""

In [67]:
dfFinal = dfFinal[dfFinal['ENST'].astype(bool)]

In [72]:
dfFinal.to_csv('temp_final_3.csv')

In [79]:
df = dfFinal[dfFinal['ProteinSeq'].astype(str).str.startswith('M')]

In [85]:
df['Protein Variant'] = ""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [93]:
for index, row in df.iloc[2768:-1].iterrows():
    var_c = hp.parse_hgvs_variant(row[1])
    var_p = am.c_to_p(var_c) #converts Transcript to Protein

    df.loc[index, 'Protein Variant'] = str(var_p)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000257.2&rettype=fasta&seq_start=5072&seq_stop=5072&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000257.2&rettype=fasta&seq_start=4712&seq_stop=4712&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000257.2&rettype=fasta&seq_start=4517&seq_stop=4517&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000257.2&rettype=fast

Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001252024.1&rettype=fasta&seq_start=435&seq_stop=435&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001013703.3&rettype=fasta&seq_start=3838&seq_stop=3838&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_144508.4&rettype=fasta&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_019074.3&rettype=fasta&seq_start=1996&seq_stop=1996&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_174916.2&re

Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001271.3&rettype=fasta&seq_start=3984&seq_stop=3984&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001271.3&rettype=fasta&seq_start=4357&seq_stop=4357&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001271.3&rettype=fasta&seq_start=4578&seq_stop=4578&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001271.3&rettype=fasta&seq_start=5610&seq_stop=5610&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/

Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000747.2&rettype=fasta&seq_start=289&seq_stop=289&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000180.3&rettype=fasta&seq_start=139&seq_stop=139&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000180.3&rettype=fasta&seq_start=1417&seq_stop=1417&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000180.3&rettype=fasta&seq_start=1656&seq_stop=1656&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entr

Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_005450.4&rettype=fasta&seq_start=911&seq_stop=911&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_003647.2&rettype=fasta&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000789.3&rettype=fasta&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000789.3&rettype=fasta&seq_start=3026&seq_stop=3026&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000334.4&retty

Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000528.3&rettype=fasta&seq_start=2963&seq_stop=2963&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000528.3&rettype=fasta&seq_start=2745&seq_stop=2745&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000528.3&rettype=fasta&seq_start=2005&seq_stop=2005&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000528.3&rettype=fasta&seq_start=1763&seq_stop=1763&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/

Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001282933.1&rettype=fasta&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000557.4&rettype=fasta&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000557.4&rettype=fasta&seq_start=890&seq_stop=890&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001080472.2&rettype=fasta&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_022095.3&rettype=fasta&tool=bioutils&

Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000444.5&rettype=fasta&seq_start=2541&seq_stop=2541&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000444.5&rettype=fasta&seq_start=2657&seq_stop=2657&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_000444.5&rettype=fasta&seq_start=2801&seq_stop=2801&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_139058.2&rettype=fasta&seq_start=1790&seq_stop=1790&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/

Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001256789.2&rettype=fasta&seq_start=2934&seq_stop=2934&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001256789.2&rettype=fasta&seq_start=2100&seq_stop=2100&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001256789.2&rettype=fasta&seq_start=846&seq_stop=846&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_001127898.3&rettype=fasta&seq_start=726&seq_stop=726&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcg

Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_153252.4&rettype=fasta&seq_start=5352&seq_stop=5352&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_153252.4&rettype=fasta&seq_start=4278&seq_stop=4278&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_182541.2&rettype=fasta&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failure 1/3; retry in 2 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_012282.2&rettype=fasta&tool=bioutils&email=biocommons-dev@googlegroups.com
Failure 0/3; retry in 1 seconds
Failure 1/3; retry in 1 seconds
Failed to fetch https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?

In [94]:
df.dropna(subset = ['Protein Variant'])

Unnamed: 0,Chromosomal Variant,Coding Variant,ENST,ProteinSeq,Protein Variant
9,NC_000001.11:g.1232517G>A,NM_080605.3:c.239G>A,ENST00000379198,MKLLRRAWRRRAALGLGTLALCGAALLYLARCAAEPGDPRAMSGRS...,NP_542172.2:p.(Trp80Ter)
10,NC_000001.11:g.1232959C>G,NM_080605.3:c.681C>G,ENST00000379198,MKLLRRAWRRRAALGLGTLALCGAALLYLARCAAEPGDPRAMSGRS...,NP_542172.2:p.(Tyr227Ter)
11,NC_000001.11:g.1233041C>T,NM_080605.3:c.763C>T,ENST00000379198,MKLLRRAWRRRAALGLGTLALCGAALLYLARCAAEPGDPRAMSGRS...,NP_542172.2:p.(Gln255Ter)
15,NC_000001.11:g.1518966C>T,NM_001170535.1:c.490C>T,ENST00000378756,MSWLFGINKGPKGEGAGPPPPLPPAQPGAEGGGDRGLGDRPAPKDK...,NP_060658.3:p.(Gln212Ter)
16,NC_000001.11:g.1520284C>T,NM_001170535.1:c.658C>T,ENST00000378756,MSWLFGINKGPKGEGAGPPPPLPPAQPGAEGGGDRGLGDRPAPKDK...,NP_060658.3:p.(Gln268Ter)
17,NC_000001.11:g.1535392G>C,NM_001114748.1:c.489C>G,ENST00000378733,MSMSANTMIFMILGASVVMAIACLMDMNALLDRFHNYILPHLRGED...,NP_001108220.1:p.(Tyr163Ter)
19,NC_000001.11:g.2229502C>T,NM_003036.3:c.736C>T,ENST00000378536,MEAAAGGRGCFQPHPGLQKTLEQFHLSSMSSLGGPAAFSARWAQEA...,NP_003027.1:p.(Gln246Ter)
20,NC_000001.11:g.2306202C>A,NM_003036.3:c.1950C>A,ENST00000378536,MEAAAGGRGCFQPHPGLQKTLEQFHLSSMSSLGGPAAFSARWAQEA...,NP_003027.1:p.(Cys650Ter)
21,NC_000001.11:g.2406561C>A,NM_002617.3:c.835G>T,ENST00000447513,MAPAAASPPEVIRAAQKDEYYRGGLRSAAGGALHSLAGARKWLEWR...,NP_722540.1:p.(Glu299Ter)
22,NC_000001.11:g.2406606C>A,NM_002617.3:c.790G>T,ENST00000447513,MAPAAASPPEVIRAAQKDEYYRGGLRSAAGGALHSLAGARKWLEWR...,NP_722540.1:p.(Glu284Ter)


In [95]:
df.shape

(4139, 5)

In [98]:
df = df[~df['Protein Variant'].str.contains('\?')]

In [96]:
df.to_csv('final.csv')

In [100]:
df.shape

(4127, 5)