# Overview

> Overview of Steps for Dataset Development

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

## List of Genes
Acquire a list of genes for which we will be collecting variants

In [None]:
import pandas as pd

### Download Dependency File
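Download [this](http://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.chr.gtf.gz) file and place it in `data/ensembl/ensembl/pub/release-107/gtf/homo_sapiens/`. Note that the loading cell below reads the uncompressed file `Homo_sapiens.GRCh38.107.gtf` from that directory, so decompress the download and make sure its file name matches the path used there.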

In [None]:
# Load the annotation as a header-less, tab-separated table; lines starting
# with '#' are treated as comments and skipped.
gtf = pd.read_csv(
    "data/ensembl/ensembl/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf",
    sep="\t",
    comment="#",
    header=None,
)

In [None]:
# The file was read with header=None, so label the nine columns by position.
gtf.columns = [
    "chr",
    "source",
    "type",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "info",
]

In [None]:
# Preview the first rows to check that the column labels line up with the data
gtf.head()

In [None]:
def parseinfo(cell):
    """
    Parse one `;`-separated attribute string into a dict of key -> value

    Each entry is split on double quotes, so `gene_id "X";` yields the pair
    ('gene_id ', 'X'). Keys keep the whitespace that sits between the name and
    the opening quote; strip them downstream if clean names are needed.
    """
    entries = cell.split(";")
    entries.pop()  # whatever follows the final ';' is not an attribute
    pairs = []
    for entry in entries:
        pieces = entry.strip().split('"')
        # discard the text after the last quote (empty for `key "value"` entries)
        pairs.append(pieces[:-1])
    return dict(pairs)

In [None]:
# Parse the attribute column of every "gene" row into one dict per gene.
gene_records = gtf.loc[gtf["type"] == "gene", "info"].apply(parseinfo)

# Hand from_records a plain list of dicts rather than the Series itself: the
# Series still carries the (filtered, non-contiguous) row labels of `gtf`, and a
# list keeps the construction independent of which labels happen to be present.
genes = pd.DataFrame.from_records(gene_records.tolist())

# parseinfo leaves trailing whitespace on its keys (e.g. 'gene_id '), so tidy
# the resulting column names.
genes.columns = [g.strip() for g in genes.columns]

In [None]:
# Keep only the genes whose biotype is protein_coding
coding_genes = genes.loc[genes["gene_biotype"].eq("protein_coding")]

In [None]:
# Display the protein-coding subset selected from `genes`
coding_genes

## Query Variants for Each Gene
> We will use [myvariant.info](https://myvariant.info) to query databases for variants of interest.

In [None]:
import requests

In [None]:
import json

In [None]:
from easydict import EasyDict

Below is an example query to [myvariant.info](https://myvariant.info) for all missense variants of the gene `ATAD3B`.

In [None]:
#| export
def queryGene(ensg,missenseOnly=True,timeout=None):
    """
    Query myvariant.info for all results relevant to the given ensembl gene id

    The query is issued with `fetch_all=TRUE`; every response that carries a
    `_scroll_id` is followed by another request for that scroll id until a
    response arrives without one or reports `success == False`.

    Required Arguments:
    - ensg : Ensembl Gene ID (e.g. ENSG00000160072)

    Optional Arguments:
    - missenseOnly : default True : whether to query only missense variants
    - timeout : default None : seconds to wait for each HTTP request, passed
      straight to `requests.get` (None means no timeout)

    Returns:
    - list of EasyDict, one per successful response, in retrieval order

    Raises:
    - ValueError : if any request returns an HTTP status other than 200
    """
    endpoint = "https://myvariant.info/v1/query"

    def fetchPage(params):
        # Let requests build the query string so the gene id, the boolean
        # operator and the scroll id are percent-encoded correctly instead of
        # being pasted into the URL verbatim.
        r = requests.get(endpoint, params=params, timeout=timeout)
        if r.status_code != 200:
            raise ValueError(
                f"myvariant.info query for {ensg} failed with HTTP status {r.status_code}: {r.url}"
            )
        return EasyDict(json.loads(r.text))

    # A response counts as successful unless it explicitly reports success == False
    querySucceeded = lambda page: ("success" not in page or page.success)

    query = f"cadd.gene.gene_id:{ensg}"
    if missenseOnly:
        query += " AND cadd.consdetail:missense"

    queries = []
    queryResult = fetchPage({"q": query, "fetch_all": "TRUE"})
    while "_scroll_id" in queryResult and querySucceeded(queryResult):
        # Fetch the following page first; a failure here raises and discards
        # everything collected so far, exactly as before.
        nextResult = fetchPage({"scroll_id": queryResult._scroll_id})
        queries.append(queryResult)
        queryResult = nextResult
    # The response that ended the loop is kept only if it did not report failure
    if querySucceeded(queryResult):
        queries.append(queryResult)
    return queries

In [None]:
# Select the first protein-coding gene by position: the filter that built
# `coding_genes` keeps the original row labels, so label 0 is only present when
# the very first gene in the annotation happens to be protein coding.
results = queryGene(coding_genes["gene_id"].iloc[0])

In [None]:
# Number of responses (pages) collected by queryGene
len(results)

In [None]:
# Total number of hits across all collected pages
sum(len(page.hits) for page in results)

Below is an example of the minimum information we need to construct a variant sequence. We can query Ensembl for the reference sequence and then apply the change indicated by the variant.

In [None]:
# Show the `dbnsfp.aa` and `dbnsfp.ensembl.transcriptid` fields of the first hit on the last page
results[-1].hits[0].dbnsfp.aa, results[-1].hits[0].dbnsfp.ensembl.transcriptid