In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import requests
import json
import logging
log = logging.getLogger("mutations")
# getLogger() returns the same logger object for a given name, so re-running
# this cell would otherwise attach one more StreamHandler each time and every
# message would be printed repeatedly.
if not log.handlers:
    log.addHandler(logging.StreamHandler())
log.setLevel(logging.DEBUG)

In [None]:
# The case identifiers are the values of the "doc" column of the level-0
# topic-distribution table.
case_set = pd.read_csv(
    "../keywordTCGA/brca/tetrasbm/trisbm/trisbm_level_0_topic-dist.csv"
).loc[:, "doc"].values

## Search the GDC API for somatic mutations (SSM)

In [None]:
# Conjunction ("and") of "in" clauses, one per (field, accepted values) pair.
# A further clause restricting the query to specific cases was left disabled:
#   field "case.submitter_id", value list(case_set[:100])
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ['TCGA-BRCA']),
            ("cases.project.program.name", ["TCGA"]),
        )
    ],
}

In [None]:
def get_gene_from_ssm_id(ssm, timeout=30):
    """Return the first token of the first ``gene_aa_change`` entry of an SSM.

    Queries the GDC ``ssms`` endpoint for ``ssm`` and reads
    ``data.gene_aa_change`` from the JSON reply. Each entry is split on a
    single space and the leading token is returned (presumably the gene
    symbol -- confirm against the GDC response format).

    Parameters
    ----------
    ssm : str
        SSM identifier interpolated into the request URL.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot block the notebook indefinitely.

    Returns
    -------
    str or None
        The leading token, or ``None`` when the reply carries no
        ``gene_aa_change`` entries.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    response = requests.get(
        f'https://api.gdc.cancer.gov/ssms/{ssm}?pretty=true&expand=consequence.gene',
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error payload.
    response.raise_for_status()
    payload = response.json()
    # Tolerate a missing "data" / "gene_aa_change" key: treat it as "no entry".
    gene_aa_change = (payload.get("data") or {}).get("gene_aa_change") or []
    if len(gene_aa_change) < 1:
        return None
    return gene_aa_change[0].split(" ")[0]

In [None]:
def get_filters(cases):
    """Build the filter dict that selects the given case submitter ids.

    ``cases`` is any iterable of ids; it is copied into a fresh list. The
    result is an "and" node wrapping a single "in" clause on the
    ``case.submitter_id`` field.
    """
    submitter_clause = {
        "op": "in",
        "content": {
            "field": "case.submitter_id",
            "value": list(cases),
        },
    }
    return {"op": "and", "content": [submitter_clause]}

In [None]:
def append_case(case_series):
    """Outer-join one case's column onto the module-level ``df_ssm`` table.

    A ``None`` argument is ignored. Otherwise the series name is logged at
    debug level and ``df_ssm`` is rebound to the joined frame.
    """
    global df_ssm
    if case_series is None:
        return
    log.debug(case_series.name)
    df_ssm = df_ssm.join(case_series, how="outer")
        
def get_case_series(case, timeout=60):
    """Fetch the SSM occurrences of one case as an indicator column.

    Requests the GDC ``ssm_occurrences`` endpoint in TSV format, filtered to
    ``case`` via ``get_filters``, and turns the ``ssm.ssm_id`` column into the
    index of a Series filled with 1.

    Parameters
    ----------
    case : str
        Case submitter id; also used as the name of the returned Series.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.Series or None
        Series named ``case``, indexed by the distinct SSM ids, all values 1.
        ``None`` if the request or the parsing failed; the reason is logged
        at warning level.
    """
    # Local import keeps this definition self-contained.
    import io

    try:
        params = {
            "filters": json.dumps(get_filters([case])),
            "format": "TSV",
            "size": "50000"
            }
        response = requests.get(
            "https://api.gdc.cancer.gov/ssm_occurrences?expand=ssm",
            headers={"Content-Type": "application/json"},
            params=params,
            timeout=timeout,
        )
        # An error page must not be parsed as if it were a table.
        response.raise_for_status()
        # Let pandas parse the TSV: this copes with "\r\n" line endings and
        # does not lose the last record when the body has no trailing
        # newline (slicing rows with [1:-1] did).
        df_case_ssm_occurrences = pd.read_csv(
            io.StringIO(response.content.decode("utf-8")),
            sep="\t",
            dtype=str,
        ).dropna(how="all", axis=0)
        # Every value is 1, so repeated or missing ids add no information;
        # dropping them keeps the index unique for later alignment.
        ssm_ids = df_case_ssm_occurrences["ssm.ssm_id"].dropna().drop_duplicates()
        # The index holds raw SSM ids; they could be mapped to genes with
        # get_gene_from_ssm_id at the cost of one extra request per id.
        return pd.Series(name=case, index=ssm_ids, data=1)
    except Exception as err:
        # Best effort: a failing case must not abort the batch, but the
        # reason is recorded. KeyboardInterrupt/SystemExit still propagate.
        log.warning("could not fetch SSM occurrences for %s: %s", case, err)
        return None

In [None]:
# Smoke test: fetch the first case only and display the result
# (get_case_series returns None when the request or parsing fails).
get_case_series(case_set[0])

In [None]:
# Start from an empty table, join the first case through append_case
# (which rebinds the global df_ssm) and display the resulting frame.
df_ssm = pd.DataFrame()
append_case(get_case_series(case_set[0]))
df_ssm

In [None]:
import multiprocessing as mp
from time import time

# Reset the table so re-running this cell does not build on a previous result.
df_ssm = pd.DataFrame()

start = time()

poolssm = mp.Pool(12)
try:
    # One task per case. Worker-side failures are reported by error_callback.
    # No per-result callback: re-joining the growing frame once per case is
    # quadratic, and a repeated case name would make the join raise inside
    # the pool's result-handling thread.
    wssm = [
        poolssm.apply_async(get_case_series, args=(case,), error_callback=log.error)
        for case in case_set]
    poolssm.close()
    poolssm.join()
finally:
    # No-op after a clean join; reaps the workers if submission was interrupted.
    poolssm.terminate()

# Gather in submission order so the column order is reproducible, skipping
# tasks that raised and cases for which no series was returned.
case_columns = [res.get() for res in wssm if res.successful()]
case_columns = [
    col[~col.index.duplicated()]  # concat needs unique labels per column
    for col in case_columns
    if col is not None
]
log.debug("fetched %d of %d cases", len(case_columns), len(case_set))

# Align all cases on the union of mutation ids in one pass.
if case_columns:
    df_ssm = pd.concat(case_columns, axis=1, join="outer")

# Drop rows without an id, order rows by id, and leave the index unnamed
# (assumed to match what the incremental outer joins produced -- confirm
# if a consumer relies on the CSV header).
df_ssm = df_ssm[~df_ssm.index.isna()].sort_index().rename_axis(index=None)
# Keep mutations seen in more than one case; sum() skips NaN like np.nansum.
df_ssm = df_ssm[df_ssm.sum(axis=1) > 1].fillna(0).astype(int)

time()-start

In [None]:
# Persist the filtered table built above; the index is written as the first column.
df_ssm.to_csv("../keywordTCGA/mainTable_ssm.csv")

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
# Row mean of the 0/1 table = fraction of cases carrying each mutation.
ssm_frequency = df_ssm.mean(axis=1).sort_values(ascending=False)
# Plot against explicit 1-based ranks: a string-indexed Series is drawn at
# positions 0..n-1, and x=0 cannot be shown on a logarithmic axis.
ranks = np.arange(1, len(ssm_frequency) + 1)
ax.plot(ranks, ssm_frequency.to_numpy(), label="observed")
# Reference power law y = 0.06 * x**-0.55 drawn between rank 1 and rank 1000.
ax.plot([1,1000], [0.06,6e-2*(1000**-0.55)], label="0.06 * rank^-0.55")
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("mutation rank")
ax.set_ylabel("fraction of cases")
ax.legend();