In [None]:
# default_exp data.gene

In [None]:
# export
import pandas as pd
from easydict import EasyDict
import numpy as np
import ensembl_rest
from tqdm.notebook import tqdm
def getSequence(gene):
    """Get the protein sequence coded by the given gene.

    Looks `gene` up as a human gene symbol through the Ensembl REST client,
    picks the transcript flagged ``is_canonical`` (``np.argmax`` falls back to
    the first transcript when none is flagged) and fetches the sequence
    registered under that transcript's translation id.

    Parameters
    ----------
    gene : str
        Gene symbol passed to ``ensembl_rest.symbol_lookup``.

    Returns
    -------
    str
        The ``"seq"`` field returned for the translation, or ``""`` when the
        lookup result has no usable transcript/translation or the sequence
        request fails.

    Raises
    ------
    Exception
        Anything raised by ``ensembl_rest.symbol_lookup`` itself; that call is
        not wrapped, so callers are expected to handle it.
    """
    q = ensembl_rest.symbol_lookup(
        species='homo sapiens',
        symbol=gene, params={'expand': True})
    try:
        is_canonical = [t["is_canonical"] for t in q["Transcript"]]
        t = q["Transcript"][np.argmax(is_canonical)]
        return ensembl_rest.sequence_id(t["Translation"]["id"])["seq"]
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit still propagate and a long fetch loop can be stopped.
        return ""

In [None]:
neutral_df = pd.read_excel("data/Neutral_dbSNP_build_131_mapped.xlsx")

In [None]:
pathogenic_df = pd.read_excel("data/Pathogenic_SNP_mapped.xlsx")

In [None]:
neutral_df

In [None]:
# One row per distinct gene symbol appearing in the neutral variant table.
gene_symbols = pd.DataFrame({"symbol":neutral_df["HGNC Gene Symbol"].unique()}).set_index("symbol")

# Placeholder columns are created with object dtype: they are filled later with
# strings (sequences) and bytes (pickled PSSMs), and a float64 NaN column would
# force pandas to upcast on the first assignment. `pd.isna` still reports the
# unfilled entries as missing.
gene_symbols["seq"] = np.full(gene_symbols.shape[0], np.nan, dtype=object)
gene_symbols["pssm"] = np.full(gene_symbols.shape[0], np.nan, dtype=object)

In [None]:
# Fetch a sequence for every gene that does not have one yet. The `pd.isna`
# guard makes the cell resumable: re-running it only retries unfilled rows.
for symbol in tqdm(gene_symbols.index,total=gene_symbols.shape[0]):
    if pd.isna(gene_symbols.loc[symbol,"seq"]):
        try:
            gene_symbols.loc[symbol,"seq"] = getSequence(symbol)
        except Exception as err:
            # Best effort: keep going, but say which symbol failed and why
            # instead of silently leaving the row empty.
            print("skipping ", symbol, type(err).__name__)

In [None]:
def checkVariant(idx, df, gene_df):
    """Check that a variant's reference residue matches the gene's sequence.

    Parameters
    ----------
    idx : hashable
        Row label of the variant in `df`.
    df : pandas.DataFrame
        Variant table; must contain the columns
        ``"RefProtResidue(One-letter)"``, ``"AminoAcidPosition(0 based)"`` and
        ``"HGNC Gene Symbol"``.
    gene_df : pandas.DataFrame
        Gene table indexed by gene symbol with a ``"seq"`` column.

    Returns
    -------
    bool
        True when the sequence stored for the variant's gene has the reference
        residue at the given 0-based position. False when the sequence is
        missing (not a string), the position is missing, non-integral,
        negative or past the end of the sequence, or the residue differs.

    Raises
    ------
    KeyError
        If `idx`, one of the columns, or the gene symbol cannot be found.
    """
    ref = df.loc[idx, "RefProtResidue(One-letter)"]
    loc = df.loc[idx, "AminoAcidPosition(0 based)"]
    symbol = df.loc[idx, "HGNC Gene Symbol"]
    seq = gene_df.loc[symbol, "seq"]

    # A gene whose sequence could not be fetched holds NaN here.
    if not isinstance(seq, str):
        return False

    # The position column becomes float64 as soon as it contains one missing
    # value, and a float cannot index a string; normalise it to an int.
    try:
        position = float(loc)
    except (TypeError, ValueError):
        return False
    # NaN and inf are not integral; a negative index would silently wrap
    # around and compare against a residue counted from the end.
    if not position.is_integer() or position < 0:
        return False
    position = int(position)

    return bool(position < len(seq) and seq[position] == ref)

In [None]:
# Validate every neutral variant and store the flags in one assignment.
# Iterating over the frame's own index (instead of range(n)) avoids assuming a
# 0..n-1 index, and assigning the full list at once yields a proper bool column
# rather than growing an object column one cell at a time.
neutral_df["valid_ref_allele"] = [
    checkVariant(row, neutral_df, gene_symbols)
    for row in tqdm(neutral_df.index, total=neutral_df.shape[0])
]

In [None]:
neutral_df["valid_ref_allele"].value_counts()

In [None]:
valid_neutral = neutral_df[neutral_df["valid_ref_allele"]]

In [None]:
import matlab.engine
import matlab
# Start a MATLAB session; `eng` is reused by the later cells that call
# `eng.get_pssm`.
eng = matlab.engine.start_matlab()
# Add the directory of MATLAB functions to the MATLAB search path.
# NOTE(review): this is a machine-specific home-directory path; presumably it
# is where `get_pssm` lives — confirm and consider making it configurable.
path = eng.addpath("~/Documents/research/mutpred2/all_functions/")

In [None]:
import pickle

In [None]:
# Compute a PSSM for every neutral gene via the MATLAB `get_pssm` function and
# store it pickled (bytes) in the "pssm" column.
for symbol in tqdm(gene_symbols.index,total=gene_symbols.shape[0]):
    try:
        pssm_matrix = eng.get_pssm(gene_symbols.loc[symbol,"seq"],
                                   "~/Documents/research/mutpred2/")
        gene_symbols.loc[symbol,"pssm"] = pickle.dumps(np.array(pssm_matrix))
    except Exception as err:
        # Best effort: leave the entry unset, but report the failure rather
        # than swallowing it (a bare `except:` also trapped Ctrl-C).
        print("cannot find PSSM for ", symbol, type(err).__name__)

In [None]:
# Treat an all-zero PSSM as "no PSSM": replace it with NaN.
for p in gene_symbols.index:
    # Genes whose PSSM computation failed hold NaN, which cannot be
    # unpickled; skip them (same guard as the pathogenic table uses).
    if not pd.isna(gene_symbols.loc[p,"pssm"]):
        mat = pickle.loads(gene_symbols.loc[p,"pssm"])
        if (mat == 0).all():
            gene_symbols.loc[p,"pssm"] = np.nan

In [None]:
# Persist the neutral gene table (sequences and pickled PSSMs) and the
# annotated neutral variant table; both are read back in the summary section.
gene_symbols.to_pickle("data/neutral_gene_symbol_df.pkl")

neutral_df.to_pickle("data/neutral_df.pkl")

# Repeat for pathogenic variants

In [None]:
# One row per distinct gene symbol in the pathogenic variant table.
# The symbols are whitespace-stripped before de-duplication, so any lookup into
# this table must strip the symbol the same way or it will miss.
pathogenic_genes = pd.DataFrame({"symbol":pathogenic_df["HGNC Gene Symbol"].str.strip().drop_duplicates()}).set_index("symbol")

In [None]:
# Placeholder columns use object dtype because they are later filled with
# strings (sequences) and bytes (pickled PSSMs); a float64 NaN column would
# have to be upcast on the first assignment. `pd.isna` still flags unfilled rows.
pathogenic_genes["seq"] = np.full(pathogenic_genes.shape[0], np.nan, dtype=object)
pathogenic_genes["pssm"] = np.full(pathogenic_genes.shape[0], np.nan, dtype=object)

In [None]:
pathogenic_genes

In [None]:
# Look up the sequence of each pathogenic gene; a symbol whose lookup raises
# is reported and left without a sequence.
for gene_symbol in tqdm(pathogenic_genes.index, total=pathogenic_genes.shape[0]):
    try:
        fetched_sequence = getSequence(gene_symbol)
        pathogenic_genes.loc[gene_symbol, "seq"] = fetched_sequence
    except Exception:
        print("skipping ", gene_symbol)

In [None]:
# Compute a PSSM for every pathogenic gene that has a sequence and store it
# pickled (bytes) in the "pssm" column.
for symbol in tqdm(pathogenic_genes.index,total=pathogenic_genes.shape[0]):
    sequence = pathogenic_genes.loc[symbol,"seq"]
    # `getSequence` returns "" when it finds no translation, so an empty
    # string means "no sequence" just like NaN does.
    if not isinstance(sequence, str) or not sequence:
        print("no sequence found for ",symbol)
        continue
    try:
        pssm_matrix = eng.get_pssm(sequence, "~/Documents/research/mutpred2/")
        pathogenic_genes.loc[symbol,"pssm"] = pickle.dumps(np.array(pssm_matrix))
    except Exception:
        # The original message referenced an undefined name `seq`, so the
        # handler itself raised NameError and aborted the whole loop.
        print("cannot find PSSM for ",symbol,sequence)

In [None]:
# Replace all-zero PSSMs with NaN; entries that are already missing are skipped.
for gene in pathogenic_genes.index:
    if pd.isna(pathogenic_genes.loc[gene, "pssm"]):
        continue
    matrix = pickle.loads(pathogenic_genes.loc[gene, "pssm"])
    if (matrix == 0).all():
        pathogenic_genes.loc[gene, "pssm"] = np.nan

In [None]:
pathogenic_genes["pssm"].isna().value_counts()

In [None]:
# Flag each pathogenic variant whose HGVS reference residue matches the
# residue found at that position in the gene's sequence.
valid_flags = []
for row in tqdm(pathogenic_df.index, total=pathogenic_df.shape[0]):
    is_valid = False
    try:
        # Parsing happens inside the try so that one malformed or missing
        # HGVS string marks the row invalid instead of aborting the loop.
        variant = str(pathogenic_df.loc[row,"Variation- HGVS format"]).replace("p.","")
        ref = variant[0]
        position = int(variant[1:-1]) - 1  # HGVS is 1-based; string index is 0-based
        # The gene table is indexed by stripped symbols, so strip here too;
        # otherwise padded symbols never match and are silently marked False.
        symbol = pathogenic_df.loc[row, "HGNC Gene Symbol"].strip()
        sequence = pathogenic_genes.loc[symbol,"seq"]
        # A position of 0 would become index -1 and wrap to the last residue.
        if position >= 0:
            is_valid = bool(sequence[position] == ref)
    except KeyError:
        # Gene not present in the gene table: leave the variant invalid.
        pass
    except Exception as e:
        print(type(e))
    valid_flags.append(is_valid)

pathogenic_df["valid_ref_allele"] = valid_flags

In [None]:
pathogenic_df["HGNC Gene Symbol"].isin(pathogenic_genes.index).value_counts()

In [None]:
pathogenic_genes.to_pickle("data/pathogenic_gene_symbol_df.pkl")

pathogenic_df.to_pickle("data/pathogenic_df.pkl")

# Summarize

In [None]:
# Reload the four tables written by the earlier sections so the summary can be
# run without repeating the sequence/PSSM computation.
# NOTE(review): `read_pickle` executes pickled code; only load files produced
# by this notebook.
neutral_df = pd.read_pickle("data/neutral_df.pkl")
neutral_genes = pd.read_pickle("data/neutral_gene_symbol_df.pkl")

pathogenic_genes = pd.read_pickle("data/pathogenic_gene_symbol_df.pkl")
pathogenic_df = pd.read_pickle("data/pathogenic_df.pkl")

Total Variant Counts

In [None]:
neutral_df["valid_ref_allele"].value_counts()

In [None]:
# The column is written as "valid_ref_allele" (underscores) when the variants
# are validated; the spaced spelling does not exist and raised KeyError.
pathogenic_df["valid_ref_allele"].value_counts()

Gene Type Counts

In [None]:
mixedGenes = set(neutral_genes.index).intersection(set(pathogenic_genes.index))

In [None]:
len(mixedGenes)

In [None]:
len(set(pathogenic_genes.index).difference(mixedGenes)),pathogenic_genes.shape[0]

In [None]:
len(set(neutral_genes.index).difference(mixedGenes)),neutral_genes.shape[0]

In [None]:
# Union of both gene tables with one row per gene symbol.
# `drop_duplicates()` compares column values, not the index: it collapsed
# different genes that share the same (e.g. empty/missing) seq and pssm, and
# kept two rows for a gene whose entries differ between the tables. De-duplicate
# on the symbol index instead, keeping the first (neutral) entry.
genes = pd.concat((neutral_genes,pathogenic_genes))
genes = genes[~genes.index.duplicated(keep="first")]

In [None]:
genes

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.hist(neutral_df[neutral_df["valid_ref_allele"]].groupby("HGNC Gene Symbol").size().values, bins=25)

In [None]:
# Distribution of valid pathogenic variants per gene. The flag column is named
# "valid_ref_allele"; the spaced spelling used before raised KeyError.
plt.hist(pathogenic_df[pathogenic_df["valid_ref_allele"]].groupby("HGNC Gene Symbol").size().values, bins=25)

In [None]:
pathogenic_df["pathogenic"] = True

In [None]:
neutral_df["pathogenic"] = False

In [None]:
pathogenic_df.head()

In [None]:
neutral_df.head()

In [None]:
neutral_df[["MissenseResidue(One_Letter)", "AminoAcidPosition(0 based)","RefProtResidue(One-letter)"]]

In [None]:
# Split each HGVS protein change (e.g. "p.A123T") into its three parts.
# Splitting on the captured digit run yields [reference, position, missense]
# in that order, so the columns must be labelled accordingly (the previous
# labels had "missense" and "reference" swapped).
hgvs_parts = (
    pathogenic_df["Variation- HGVS format"]
    .str.replace("p.", "", regex=False)  # literal "p.", not the regex "p + any char"
    .str.split(pat=r"(\d+)")
    .to_list()
)
pathogenic_df = pd.concat(
    (pathogenic_df,
     # Reuse the frame's index so the column-wise concat aligns row for row.
     pd.DataFrame(hgvs_parts,
                  columns=["reference", "position", "missense"],
                  index=pathogenic_df.index)),
    axis=1)

In [None]:
# The parsed position is a string; cast it to int and subtract one so it is
# 0-based like the neutral table's "AminoAcidPosition(0 based)" column.
# Not idempotent: re-running this cell subtracts one again.
pathogenic_df["position"] = pathogenic_df["position"].astype(int) -1

In [None]:
# Give the neutral table the same short column names as the pathogenic table.
neutral_column_aliases = {
    "MissenseResidue(One_Letter)": "missense",
    "AminoAcidPosition(0 based)": "position",
    "RefProtResidue(One-letter)": "reference",
}
neutral_df = neutral_df.rename(columns=neutral_column_aliases)

In [None]:
# Stack the neutral and pathogenic variants, keeping only the columns the two
# tables have in common.
shared_variant_columns = [
    "valid_ref_allele",
    "pathogenic",
    "HGNC Gene Symbol",
    "missense",
    "position",
    "reference",
]
variant_df = pd.concat(
    [neutral_df[shared_variant_columns], pathogenic_df[shared_variant_columns]]
)

In [None]:
variant_df

In [None]:
plt.hist(variant_df[variant_df["valid_ref_allele"]].groupby("HGNC Gene Symbol").size().values,bins=np.arange(0,50,2))

In [None]:
variant_df

In [None]:
genes

In [None]:
variant_df.to_pickle("data/variant_df_mixed.pkl")

In [None]:
genes.to_pickle("data/gene_df_mixed.pkl")

In [None]:
pickle.loads(genes.loc["A2M","pssm"]).shape

In [None]:
q = ensembl_rest.variation_id(id="rs903331232",species="homo_sapiens",fields=["hgvsc"],params={"expand":True,
                                                                                               "fields":["hgvsc"]})

In [None]:
q

In [None]:
getSequence("SAMD11")

In [None]:
import requests, sys

# Query the Ensembl REST variant_recoder endpoint for one rsID.
server = "https://rest.ensembl.org"
ext = "/variant_recoder/human/rs903331232?"

# Without a timeout a stalled connection would block the kernel indefinitely.
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}, timeout=30)

# raise_for_status() raises exactly when `r.ok` is False, so the `sys.exit()`
# that used to follow it could never run; raising is also the better failure
# mode inside a notebook.
r.raise_for_status()

decoded = r.json()
# print(repr(decoded))

In [None]:
ensembl_rest.lookup(id="NM_152486")