In [None]:
from cyvcf2 import VCF
from tqdm.notebook import tqdm

In [None]:
# Lookup from three-letter amino-acid codes to their one-letter codes
# (the 20 entries listed here; anything else, e.g. "Ter", is absent).
aaTable = dict(
    Ala="A", Arg="R", Asn="N", Asp="D",
    Cys="C", Gln="Q", Glu="E", Gly="G",
    His="H", Ile="I", Leu="L", Lys="K",
    Met="M", Phe="F", Pro="P", Ser="S",
    Thr="T", Trp="W", Tyr="Y", Val="V",
)

In [None]:
# Scan the VCF and keep field 11 of the "vep" INFO annotation for every
# record whose field 1 mentions "missense".
# NOTE(review): fields 1 and 11 are presumably Consequence and HGVSp in the
# gnomAD VEP layout -- confirm against the VCF header. Because the value is
# split on "|" only, just the first annotation of a record is inspected.
missenseCount = 0
hgvs = []
for variant in tqdm(VCF("/data/projects/processBio/gnomad/gnomad.exomes.r2.1.1.sites.vcf")):
    vep = variant.INFO.get("vep")
    # INFO.get yields None when the record has no "vep" entry; len(None) would
    # raise TypeError, so treat missing and empty annotations the same way.
    if not vep:
        continue
    vepFields = vep.split("|")
    # Skip annotations too short to carry field 11 instead of raising IndexError.
    if len(vepFields) > 11 and "missense" in vepFields[1]:
        missenseCount += 1
        hgvs.append(vepFields[11])

In [None]:
# Show how many records were counted as missense by the VCF scan.
missenseCount

In [None]:
# Show how many annotation strings were collected (one is appended per counted record).
len(hgvs)

In [None]:
# Display the collected strings (field 11 of each matching "vep" annotation).
hgvs

In [None]:
# Unique identifiers: the part of each collected string before the first ":".
ensemblIDS = {entry.partition(":")[0] for entry in hgvs}

In [None]:
# Show the number of distinct identifiers extracted from hgvs.
len(ensemblIDS)

In [None]:
from io import StringIO

import urllib.parse
import urllib.request

# NOTE(review): this is UniProt's legacy "uploadlists" ID-mapping endpoint; it
# appears to have been superseded by https://rest.uniprot.org/idmapping --
# confirm it still responds before relying on this cell.
url = 'https://www.uniprot.org/uploadlists/'

# Drop the version suffix (everything from the first "."). partition() leaves
# unversioned IDs intact, whereas i[:i.find(".")] chops their last character
# because find() returns -1. Empty IDs are discarded; sorting only makes the
# request deterministic.
enspIDs = sorted({i.partition(".")[0] for i in ensemblIDS} - {""})

params = {
'from': 'ENSEMBL_PRO_ID',
'to': 'ACC',
'format': 'tab',
'query': ' '.join(enspIDs)
}

data = urllib.parse.urlencode(params)
data = data.encode('utf-8')
req = urllib.request.Request(url, data)
with urllib.request.urlopen(req) as f:
    response = f.read()

# Build {queried ID: mapped accession} from the tab-separated reply. The first
# line is skipped (presumably the header row). splitlines() copes with a
# missing trailing newline, and rows without two fields are ignored rather
# than crashing dict construction.
idmapping = {}
for row in response.decode('utf-8').splitlines()[1:]:
    fields = row.split("\t")
    if len(fields) >= 2:
        idmapping[fields[0]] = fields[1]

In [None]:
# Show how many queried IDs received a mapping in the response.
len(idmapping)

In [None]:
def get_uniprot_sequences(uniprot_ids):
    """
    Retrieve UniProt sequences for a collection of UniProt identifiers.

    For large lists it is recommended to perform batch retrieval.

    Documentation of the available columns:
    https://www.uniprot.org/help/uniprotkb%5Fcolumn%5Fnames

    This function is based on
    https://www.biostars.org/p/67822/

    Parameters:
        uniprot_ids: iterable of str, UniProt identifiers to look up
            (a list, a dict view or a generator all work)

    Returns:
        pd.DataFrame with the columns "Entry", "Sequence" and "Query". Rows
        whose "Query" cell lists several comma-separated identifiers are split
        into one row per identifier. An empty frame with the same columns is
        returned, without contacting the server, when no identifiers are given.
    """
    # Import the submodules explicitly: a bare `import urllib` does not
    # guarantee that urllib.parse / urllib.request are loaded, and StringIO
    # should not depend on an import made in some other cell.
    import urllib.parse
    import urllib.request
    from io import StringIO

    import pandas as pd

    columns = ["Entry", "Sequence", "Query"]

    # Materialise once so that one-shot iterables can be both tested and joined.
    ids = list(uniprot_ids)
    if not ids:
        return pd.DataFrame(columns=columns)

    # NOTE(review): legacy "uploadlists" endpoint; it appears to have been
    # superseded by https://rest.uniprot.org/idmapping -- confirm it still works.
    url = 'https://www.uniprot.org/uploadlists/'  # This is the webserver to retrieve the Uniprot data
    params = {
        'from': "ACC",
        'to': 'ACC',
        'format': 'tab',
        'query': " ".join(ids),
        'columns': 'id,sequence'}

    data = urllib.parse.urlencode(params).encode('ascii')
    request = urllib.request.Request(url, data)
    with urllib.request.urlopen(request) as response:
        res = response.read()

    df_fasta = pd.read_csv(StringIO(res.decode("utf-8")), sep="\t")
    # Assumes the reply has exactly three columns in this order -- TODO confirm.
    df_fasta.columns = columns
    # it might happen that 2 different ids for a single query id are returned, split these rows
    return df_fasta.assign(Query=df_fasta['Query'].str.split(',')).explode('Query')

In [None]:
# Fetch the sequence table for every accession the ID mapping returned.
uniprot = get_uniprot_sequences(idmapping.values())

In [None]:
# Display the DataFrame returned by get_uniprot_sequences.
uniprot

In [None]:
# Build the accession -> sequence lookup once; filtering the DataFrame for
# every variant is a full scan per iteration and yields a Series, not a string.
sequenceByAccession = dict(zip(uniprot.Entry, uniprot.Sequence))

variantSeqs = []
for hgvsp in tqdm(hgvs):
    # The parsing below expects "<id>[.<version>]:p.<Ref3><position><Alt3>",
    # e.g. "ENSP00000123456.1:p.Ala123Val"; anything else is skipped.
    id_, _, change = hgvsp.partition(":")
    ensp = id_.partition(".")[0]  # drop the version suffix without truncating unversioned IDs
    if ensp not in idmapping:
        continue
    seq = sequenceByAccession.get(idmapping[ensp])
    if not isinstance(seq, str):
        continue  # accession absent from the table, or its sequence is missing (NaN)
    og, var = change[2:5], change[-3:]
    if og not in aaTable or var not in aaTable:
        continue
    try:
        loc = int(change[5:-3]) - 1  # HGVS positions are 1-based
    except ValueError:
        continue  # not a simple single-residue substitution
    # Only substitute when the position exists and the sequence really carries
    # the reference residue there; otherwise the retrieved sequence does not
    # correspond to the annotated protein at this site.
    if not 0 <= loc < len(seq) or seq[loc] != aaTable[og]:
        continue
    # Insert the one-letter code, not the three-letter name.
    variantSeqs.append(seq[:loc] + aaTable[var] + seq[loc + 1:])

In [None]:
# Display the variant sequences assembled by the loop above.
variantSeqs