In [8]:
# Using exclusively pyspark:
import pandas as pd
import json
import requests
from functools import reduce

from pyspark.conf import SparkConf
from pyspark.sql.types import ArrayType, StringType, IntegerType, StructType
from pyspark.sql.dataframe import DataFrame 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, lit, when, udf, array, expr

# Settings for the local Spark session used throughout this notebook.
# Driver and executor both get 15g of memory; the remaining entries adjust
# result-size, debug-string and Arrow batch limits.
sparkConf = SparkConf().setAll([
    ('spark.driver.memory', '15g'),
    ('spark.executor.memory', '15g'),
    ('spark.driver.maxResultSize', '0'),
    ('spark.debug.maxToStringFields', '2000'),
    ('spark.sql.execution.arrow.maxRecordsPerBatch', '500000'),
])

# Create the session (or pick up an already running one) on all local cores:
spark = SparkSession.builder.config(conf=sparkConf).master('local[*]').getOrCreate()

# This map provides the recipe used to generate the biomarker objects.
# Keys are biomarker column names; each value may define:
#   'description': human readable description of the biomarker
#   'prefix':      string prepended to the raw value (optional)
# Entries without a 'prefix' leave the raw value unchanged.
biomarkerMaps = {

    'MS_status': {
        'description': 'Micro-satellite stability',
    },
    'CRIS_subtype': {
        'description': 'Colorectal cancer intrinsic subtypes (CRIS) defined by distinctive molecular, functional and phenotypic peculiarities',
        'prefix': 'CRIS-'
    },
    'KRAS_status': {
        'description': 'KRAS mutation status',
        'prefix': 'KRAS-'
    },
    'TP53_status': {
        'description': 'TP53 mutation status',
        'prefix': 'TP53-'
    },
    'APC_status': {
        'description': 'APC mutation status',
        # The prefix carries the gene symbol, like the KRAS and TP53 entries:
        'prefix': 'APC-'
    }
}


@udf
def get_biomarker(columnName, biomarker):
    '''Format a raw biomarker value using the recipe registered for its column.

    Args:
        columnName: name of the biomarker column, used as key into `biomarkerMaps`.
        biomarker: raw value read from that column.

    Returns:
        None when the value is missing (null or '?'); the value prefixed with
        the column's 'prefix' when the recipe defines one; otherwise the value
        unchanged.
    '''
    # '?' marks an unknown value. A null value is handled the same way, as it
    # would otherwise fail in the string concatenation below.
    if biomarker is None or biomarker == '?':
        return None

    # Columns without a recipe, with an empty recipe, or with a recipe lacking
    # a prefix all pass the value through as it is:
    prefix = biomarkerMaps.get(columnName, {}).get('prefix')
    if prefix is None:
        return biomarker

    return prefix + biomarker



In [10]:
# Run configuration: one entry per experiment, plus the settings shared by all of them.
parameters = {
  "experiments": [
    {
      "diseaseFromSourceMappedId": "EFO_0005842",
      "diseaseFromSource": "colorectal cancer",
      "confidenceCutoff": 38,
      "cellLineFile": "COlines.txt",
      "experimentData": "CTG_CO_Partner-Preview-Matrix_v6a.txt",
      "contrast": "Loss of cell viability vs control",
      # Assay name; presumably what the "CTG" prefix of the data file stands for.
      "studyOverview": "CellTiter-Glo measurement",
      "projectId": "OTAR015",
      "projectDescription": "CRISPR Cas9 Target ID"
    }
  ],
  # NOTE: the key spelling is kept as it is, because it is looked up by this
  # exact name when the cell passport file is loaded.
  "sharedParemeters": {
    "cellPassportFile": "model_list_20211124.csv",
    "datasourceId": "ot_crispr_validation",
    "datatypeId": "ot_validation_lab"
  }
}

def get_cell_passport_data(spark: SparkSession, cell_passport_file: str) -> DataFrame:
    '''Load the cell line annotation table and harmonise the model names.

    Args:
        spark: active Spark session used to read the file.
        cell_passport_file: path to the comma separated cell passport file.

    Returns:
        Persisted dataframe with the columns `name`, `id` and `tissue`.
    '''
    # Cell line annotation data from Sanger, read as a multi-line aware csv:
    passport_reader = spark.read.option("multiline", True)
    raw_models = passport_reader.csv(cell_passport_file, header=True, sep=',', quote='"')

    models = raw_models.select(
        col('model_name').alias('name'),
        col('model_id').alias('id'),
        col('tissue'),
    )

    # Model names that differ from the spelling used in the Validation lab dataset:
    renamed_models = (
        ('HT29', 'HT-29'),
        ('HCT116', 'HCT-116'),
        ('LS180', 'LS-180'),
    )

    # Build the conditional rename; names not listed above are kept unchanged.
    model_name = col('name')
    (first_old, first_new), *remaining = renamed_models
    harmonised_name = when(model_name == first_old, first_new)
    for old_name, new_name in remaining:
        harmonised_name = harmonised_name.when(model_name == old_name, new_name)
    harmonised_name = harmonised_name.otherwise(model_name)

    return models.withColumn('name', harmonised_name).persist()


# Load the cell line annotation shared by all experiments and preview it:
diseaseCellLines_model = get_cell_passport_data(
    spark=spark,
    cell_passport_file=parameters['sharedParemeters']['cellPassportFile'],
)
diseaseCellLines_model.show()

+-----------------+---------+--------------------+
|             name|       id|              tissue|
+-----------------+---------+--------------------+
|HCM-SANG-0314-C15|SIDM01274|           Esophagus|
|HCM-SANG-0272-C20|SIDM01276|     Large Intestine|
|HCM-SANG-0310-C15|SIDM01280|           Esophagus|
|              HLF|SIDM00065|               Liver|
|             TE-7|SIDM00066|           Esophagus|
|HCM-SANG-0278-C20|SIDM01282|     Large Intestine|
|HCM-SANG-0277-C18|SIDM01284|     Large Intestine|
|            TE-13|SIDM00076|           Esophagus|
|      NCI-ADR-RES|SIDM00089|               Ovary|
|        ONCO-DG-1|SIDM00093|               Ovary|
|         NCI-H125|SIDM00106|                Lung|
|            SNB19|SIDM00109|Central Nervous S...|
|            B2-17|SIDM00110|Central Nervous S...|
|HCM-SANG-0276-C18|SIDM01278|     Large Intestine|
|         COLO-320|SIDM00071|     Large Intestine|
|             DiFi|SIDM00049|     Large Intestine|
|            HCE-4|SIDM00052|  