We need to unify the way the `diseaseCellLines` object is generated: there should be a single way to read the cell line data and, based on the values available, return the parsed `diseaseCellLines` object.

In [1]:
from functools import reduce
import requests

import pandas as pd
from pyspark.sql.functions import (
    col, udf, struct, lit, split, expr, collect_set, struct, 
    regexp_replace, min as pyspark_min, explode,
    array_contains, count, when
)
from pyspark.sql.types import FloatType, ArrayType, StructType, StructField, BooleanType, StringType
from pyspark.sql import SparkSession
from collections import defaultdict
from pyspark import SparkFiles

# Create (or reuse) the Spark session for this notebook, using a local master.
spark = SparkSession.builder.master('local[*]').getOrCreate()



In [44]:
def lookup_uberon(tissue_label: str) -> 'str | None':
    """Look up the UBERON identifier of a tissue label via the EBI OLS search API.

    The label is lower-cased and searched as an exact match on the `label`
    field, restricted to the `uberon` ontology.

    Args:
        tissue_label: Tissue name to look up.

    Returns:
        The `short_form` of the first search hit, or None when the search
        finds nothing or when no usable label was given.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within the timeout.
    """
    # Missing annotations (None/NaN) and blank labels cannot be looked up.
    if not isinstance(tissue_label, str) or not tissue_label.strip():
        return None

    # Passing the query via `params` lets requests URL-encode labels that
    # contain spaces or reserved characters such as '&' or '#'.
    response = requests.get(
        'https://www.ebi.ac.uk/ols/api/search',
        params={
            'q': tissue_label.lower(),
            'queryFields': 'label',
            'ontology': 'uberon',
            'exact': 'true',
        },
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    response.raise_for_status()
    result = response.json()['response']

    if result['numFound'] == 0:
        return None
    return result['docs'][0]['short_form']

def generate_diseaseCellLines(cellPassPortFile):
    """Build the cell line lookup table carrying the `diseaseCellLines` struct.

    Reads the cell passport model list, maps every distinct tissue label to a
    tissue identifier with `lookup_uberon`, and joins the identifiers back to
    the cell lines.

    Args:
        cellPassPortFile: Location of the comma-separated model list. The
            columns `model_name`, `model_id` and `tissue` are read from it.

    Returns:
        A persisted Spark DataFrame with the columns:
        - `name`: cell line name with dashes removed,
        - `id`: cell line (model) identifier,
        - `diseaseCellLines`: struct of `tissue`, `name`, `id` and `tissueId`;
          the `name` inside the struct keeps the original, dashed spelling.
    """
    # Loading cell line annotation data from Sanger.
    # The path comes from the function argument; the previous version read an
    # undefined name (`cellPassportUrl`) and ignored the parameter.
    cell_df = (
        spark.read
        .option("multiline", True)
        .csv(cellPassPortFile, header=True, sep=',', quote='"')
        .select(
            col('model_name').alias('name'),
            col('model_id').alias('id'),
            col('tissue')
        )
        # Reused twice below: for the distinct tissues and for the join.
        .persist()
    )

    # Map tissue labels to tissue identifiers. Only the distinct labels are
    # collected to pandas, so each label is looked up once.
    tissue_lookup = (
        cell_df
        .select('tissue')
        .distinct()
        .toPandas()
        .assign(tissueId=lambda df: df.tissue.apply(lookup_uberon))
    )

    # An explicit schema keeps the conversion working even when no label
    # could be mapped (an all-null column cannot be type-inferred).
    tissues = (
        spark.createDataFrame(tissue_lookup, schema='tissue string, tissueId string')
        .persist()
    )

    # Joining with cell lines:
    return (
        cell_df
        .join(tissues, on='tissue', how='left')

        # Generating the diseaseCellLines object (built before the name is
        # cleaned, so the struct keeps the original cell line name):
        .select('name', 'id', struct(['tissue', 'name', 'id', 'tissueId']).alias('diseaseCellLines'))

        # Cleaning up cell line name from dashes:
        .withColumn('name', regexp_replace(col('name'), '-', ''))

        .persist()
    )


# Local copy of the cell passport model list (the file name suggests an
# export dated 2022-01-24).
cellPassportFile = '/Users/dsuveges/project_data/encore/ENCORE_FILES_DEC_2021/model_list_20220124.csv'

# Build the cell line lookup table (name, id, diseaseCellLines) from that file.
cellpassportMap = generate_diseaseCellLines(cellPassportFile)

In [45]:
# Print a preview of the generated cell line lookup table.
cellpassportMap.show()

+----------------+---------+--------------------+
|            name|       id|    diseaseCellLines|
+----------------+---------+--------------------+
|            RH18|SIDM00454|{Soft Tissue, RH-...|
|              RD|SIDM00847|{Soft Tissue, RD,...|
|         SCCH196|SIDM00031|{Soft Tissue, SCC...|
|          TE125T|SIDM01763|{Soft Tissue, TE-...|
|          MFHino|SIDM00299|{Soft Tissue, MFH...|
|             RKN|SIDM00353|{Soft Tissue, RKN...|
|            KYM1|SIDM00552|{Soft Tissue, KYM...|
|           PEAZ1|SIDM00830|{Soft Tissue, PEA...|
|          Hs633T|SIDM00667|{Soft Tissue, Hs-...|
|              HX|SIDM01213|{Soft Tissue, HX,...|
|            A204|SIDM00798|{Soft Tissue, A20...|
|            G402|SIDM00855|{Soft Tissue, G-4...|
|          HT1080|SIDM00828|{Soft Tissue, HT-...|
|             GCT|SIDM00853|{Soft Tissue, GCT...|
|          SKLMS1|SIDM01109|{Soft Tissue, SK-...|
|CCLF_PEDS_0008_T|SIDM01982|{Soft Tissue, CCL...|
|          RH18DM|SIDM01984|{Soft Tissue, RH1...|


In [11]:
# Re-read the cell passport model list, this time also turning the
# `msi_status` column into a `biomarkers` array via `parse_msi_status`.
cell_df = (
    spark.read
    .option("multiline", True)
    .csv(
        '/Users/dsuveges/project_data/encore/ENCORE_FILES_DEC_2021/model_list_20220124.csv',
        header=True,
        sep=',',
        quote='"',
    )
    .select(
        col('model_name').alias('name'),
        col('model_id').alias('id'),
        col('tissue'),
        parse_msi_status(col('msi_status')).alias('biomarkers'),
    )
    .persist()
)

# Print a preview of the parsed table.
cell_df.show()

+-----------------+---------+--------------------+--------------------+
|             name|       id|              tissue|          biomarkers|
+-----------------+---------+--------------------+--------------------+
|            PK-59|SIDM01774|            Pancreas|                null|
|         SNU-1033|SIDM00192|     Large Intestine|                null|
|          SNU-466|SIDM01447|Central Nervous S...|                null|
|        IST-MES-2|SIDM01554|                Lung|                null|
|           MUTZ-5|SIDM01689|Haematopoietic an...|                null|
|            TM-31|SIDM01460|Central Nervous S...|                null|
|          SNU-503|SIDM00497|     Large Intestine|                null|
|          SNU-878|SIDM01435|               Liver|                null|
|           JMSU-1|SIDM01587|             Bladder|                null|
|          HEC-265|SIDM01612|         Endometrium|                null|
|            BL-70|SIDM00074|Haematopoietic an...|              

In [6]:
# Count how many cell lines carry each `msi_status` value.
# NOTE(review): requires a `cell_df` that still has the `msi_status` column.
cell_df.select('msi_status').groupBy('msi_status').count().show()

+----------+-----+
|msi_status|count|
+----------+-----+
|       MSS|  981|
|      null|  963|
|       MSI|   63|
+----------+-----+



In [10]:
import requests

@udf(
    ArrayType(
        StructType([
            StructField('name', StringType()),
            StructField('description', StringType()),
        ])
    )
)
def parse_msi_status(status: str) -> 'list[dict] | None':
    """Translate a microsatellite status code into a biomarker list.

    Args:
        status: Value of the `msi_status` column; 'MSI' and 'MSS' are
            recognised.

    Returns:
        A single-element list holding a `name`/`description` record for a
        recognised status, or None for any other value (including null).
    """
    descriptions = {
        'MSI': 'Microsatellite instable',
        'MSS': 'Microsatellite stable',
    }

    description = descriptions.get(status)
    if description is None:
        return None

    return [{'name': status, 'description': description}]
    
    

In [35]:
# Notebook display of the tissue label -> tissue identifier table.
# NOTE(review): in this file `tissues` is only defined inside
# generate_diseaseCellLines; this cell presumably relies on a variable left
# over from an earlier interactive run - confirm before re-running.
tissues

Unnamed: 0,tissue,tissueId
0,Soft Tissue,
1,Skin,
2,Adrenal Gland,UBERON_0002369
3,Head and Neck,
4,Cervix,
5,Bone,
6,Peripheral Nervous System,UBERON_0000010
7,Central Nervous System,UBERON_0001017
8,Unknown,
9,Endometrium,UBERON_0001295


In [12]:
# Print the schema of the cancer biomarkers evidence file for reference.
spark.read.json('/Users/dsuveges/project_data/cancer_biomarkers/cancer_biomarkers-2022-01-24.json.gz').printSchema()

root
 |-- biomarkerName: string (nullable = true)
 |-- biomarkers: struct (nullable = true)
 |    |-- geneExpression: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |-- variant: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- functionalConsequenceId: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |-- confidence: string (nullable = true)
 |-- datasourceId: string (nullable = true)
 |-- datatypeId: string (nullable = true)
 |-- diseaseFromSource: string (nullable = true)
 |-- diseaseFromSourceMappedId: string (nullable = true)
 |-- drugFromSource: string (nullable = true)
 |-- drugResponse: string (nullable = true)
 |-- literature: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- targetFromSourceId: s