# Valiadation lab data

In this release the following actions were decided:

* We need to have a biomarker description. Now the data looks like this:
```json
{
    "biomarkers": [
        {
            "name": "MSI",
            "description": "Microsatellite stability"
        }
    ]
}
```
* `confidence` and `expectedConfidence` can have two values: `significant` or `not significant`.

This is something expected:

```json
{
  "datasourceId": "ot_crispr_validation",
  "dataTypeId": "ot_validation_lab",
  "projectId": "OTAR015",
  "projectDescription": "CRISPR Cas9 Target ID",
  "targetFromSource": "ARHGEF7",
  "targetId": "ENSG00000102606",
  "diseaseFromSourceMappedId": "EFO_0005842",
  "diseaseId": "EFO_0005842",
  "resourceScore": 59.9,
  "confidence": "significant",
  "expectedConfidence": "significant",
  "diseaseCellLines": [
    {
      "name": "SW626",
      "id": "SIDM01168",
      "tissue": "Large Intestine",
      "tissueId": "UBERON_0000059"
    }
  ],
  "biomarkers": [
    {
      "name": "MSS"
    },
    {
      "name": "KRAS-mut"
    },
    {
      "name": "TP53-mut"
    },
    {
      "name": "ACP-mut"
    }
  ],
  "statisticalTestTail": "upper tail",
  "contrast": "Loss of cell viability vs control",
  "studyOverview": "CellTitreGio measurement",
  "validationHypotheses": [
    {
      "hypothesis": "MSI",
      "description": "Microsatellite stability."
    }
  ]
}
```

In [69]:
# Using exclusively pyspark:
import pandas as pd
import json
import requests
from functools import reduce

from pyspark.conf import SparkConf
from pyspark.sql.types import ArrayType, StringType, IntegerType, StructType, StructField
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, lit, when, udf, array, expr

sparkConf = (
    SparkConf()
    .set('spark.driver.memory', '15g')
    .set('spark.executor.memory', '15g')
    .set('spark.driver.maxResultSize', '0')
    .set('spark.debug.maxToStringFields', '2000')
    .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
)
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .master('local[*]')
    .getOrCreate()
)


cell_lines_file = '/Users/dsuveges/project_data/validation_lab/COlines.txt'
cell_lines_annotation_file = '/Users/dsuveges/project_data/validation_lab/model_list_20211124.csv'
cell_cancer_driver_mutation = '/Users/dsuveges/project_data/validation_lab/mutations_summary_20211124.csv'
validation_file = '/Users/dsuveges/project_data/validation_lab/CTG_CO_Partner-Preview-Matrix_v6a.txt'


# This is a map that provides recipie to generate the biomarker objects
# If a value cannot be found in the map, the value will be returned.
biomarkerMaps = {
    
    'MS_status': {
        'description': 'Micro-satellite stability',
    },
    'CRIS_subtype': {
        'description': 'Colorectal cancer intrinsic subtypes (CRIS) defined by distinctive molecular, functional and phenotypic peculiarities',
        'prefix': 'CRIS-'
    },
    'KRAS_status': {
        'description': 'KRAS mutation status',
        'prefix': 'KRAS-'
    },
    'TP53_status': {
        'description': 'TP53 mutation status',
        'prefix': 'TP53-'
    },
    'APC_status': {
        'description': 'APC mutation status',
        'prefix': 'ACP-'
    }
}


@udf
def get_biomarker(columnName, biomarker):
    '''This function returns with a '''
    if biomarker == '?':
        return None
    
    # If no data is provided, we'll return the value as it is:
    if biomarkerMaps[columnName] == {}:
        return biomarker
    
    # If no prefix is defined, just return the value:
    if not 'prefix' in biomarkerMaps[columnName]:
        return biomarker
    
    return biomarkerMaps[columnName]['prefix'] + biomarker



In [57]:
# loading cell line annotation data from Sanger:
diseaseCellLines_model = (
    spark.read
    .option("multiline",True)
    .csv(cell_lines_annotation_file, header=True, sep=',', quote='"')
    .select(
        col('cat_number').alias('name'),
        col('model_id').alias('id'),
        col('tissue')
    )
    .persist()
)

# Reading cell metadata from validation lab:
validation_lab_cell_lines = (
    spark.read.csv(cell_lines_file, sep='\t', header=True)
    
    # Renaming columns:
    .withColumnRenamed('CO_line', 'name')
    
    # Updating some of the cell lines' name:
#     .withColumn('name',
#          when(col('name') == 'HT29', 'HT-29')
#         .when(col('name') == 'HCT116', 'HCT-116')
#         .when(col('name') == 'LS180', 'LS-180')
#         .otherwise(col('name'))
#     )  
    
    # Joining dataset with cell model data read downloaded from Sanger website:
    .join(diseaseCellLines_model, on='name', how='left')
    
    # Adding UBERON code to tissues (it's constant colon)
    .withColumn('tissueID', lit('UBERON_0000059'))
    
    # generating disease cell lines object:
    .withColumn(
        'diseaseCellLines',
        array(struct(col('name'), col('id'), col('tissue'), col('tissueId')))
    )
    .drop(*['id', 'tissue', 'tissueId'])
    .persist()
)


validation_lab_cell_lines.show()

+------+---------+------------+-----------+-----------+----------+--------------------+
|  name|MS_status|CRIS_subtype|KRAS_status|TP53_status|APC_status|    diseaseCellLines|
+------+---------+------------+-----------+-----------+----------+--------------------+
| SW626|      MSS|           ?|        mut|        mut|       mut|[{SW626, null, nu...|
|  HT29|      MSS|           B|         wt|        mut|       mut|[{HT29, null, nul...|
| SW837|      MSS|           B|        mut|        mut|       mut|[{SW837, null, nu...|
| MDST8|      MSS|           D|         wt|         wt|       mut|[{MDST8, null, nu...|
|HCT116|      MSI|           D|        mut|         wt|        wt|[{HCT116, null, n...|
|  KM12|      MSI|           A|         wt|        mut|        wt|[{KM12, null, nul...|
|   RKO|      MSI|           ?|         wt|         wt|        wt|[{RKO, null, null...|
| LS180|      MSI|           A|        mut|         wt|       mut|[{LS180, null, nu...|
+------+---------+------------+-

In [48]:
# Defining how to process biomarkers:
# 1. Looping through all possible biomarker - from biomarkerMaps.keys()
# 2. The biomakers are then looked up in the map and process based on how the map defines.
# 3. Description is also added read from the map.
expressions =  map(
    # Function to process biomarker:
    lambda biomarker: (
        biomarker, 
        struct( 
            get_biomarker(lit(biomarker), col(biomarker)).alias('name'), 
            lit(biomarkerMaps[biomarker]['description']).alias('description'))
    ),
    # Iterator to apply the function over:
    biomarkerMaps.keys()
)

# Applying the full map on the dataframe one-by-one:
biomarkers = reduce(lambda DF,value: DF.withColumn(*value), expressions, validation_lab_cell_lines)

# Pooling together all the biomarker structures into one single array:
biomarkers = (
    biomarkers
    .select('name',array(*biomarkerMaps.keys()).alias('biomarkers'))
)


# Joining biomarkers with cell line data:
validation_lab_cell_lines = (
    validation_lab_cell_lines
    .join(biomarkers, on='name', how='inner')
    
    # Dropping biomarker columns:
    .drop(*list(biomarkerMaps.keys()))
    .persist()
)


validation_lab_cell_lines.show()

+-------+--------------------+--------------------+
|   name|    diseaseCellLines|          biomarkers|
+-------+--------------------+--------------------+
|  SW626|[{SW626, SIDM0116...|[{MSS, Micro-sate...|
|  HT-29|[{HT-29, SIDM0013...|[{MSS, Micro-sate...|
|  SW837|[{SW837, SIDM0083...|[{MSS, Micro-sate...|
|  MDST8|[{MDST8, SIDM0052...|[{MSS, Micro-sate...|
|HCT-116|[{HCT-116, SIDM00...|[{MSI, Micro-sate...|
|   KM12|[{KM12, SIDM00150...|[{MSI, Micro-sate...|
|    RKO|[{RKO, SIDM01090,...|[{MSI, Micro-sate...|
| LS-180|[{LS-180, SIDM006...|[{MSI, Micro-sate...|
+-------+--------------------+--------------------+



In [49]:
# evidence = (
(
    # Reading evidence:
    spark.read.csv(validation_file, sep='\t', header=True)
    
    # Rename existing columns need to be updated:
    .withColumnRenamed('gene', 'targetFromSource')
    .withColumnRenamed('cell-line', 'name')
    
    # Parsing resource score:
    .withColumn('resourceScore', col('effect-size').cast("double"))
    
    # Generate the binary confidence calls:
    .withColumn(
        'confidence',
        when(col('resourceScore') >= 38, lit('significant'))
        .otherwise(lit('not significant'))
    )
    .withColumn(
        'expectedConfidence',
        when(col('expected-to-pass') == 'TRUE', lit('significant'))
        .otherwise(lit('not significant'))
    )
    
    # Adding constants:
    .withColumn('statisticalTestTail', lit('upper tail'))
    .withColumn('contrast', lit('Loss of cell viability vs control'))
    .withColumn('studyOverview', lit('CellTitreGio measurement'))
    
    # This column is specific for this dataset:
    .withColumn('datasourceId', lit('ot_crispr_validation'))
    .withColumn('datatypeId', lit('ot_validation_lab'))
    .withColumn("diseaseFromSourceMappedId", lit("EFO_0005842"))
    
    # This should be added to the crispr dataset as well:
    .withColumn('projectId', lit('OTAR015'))
    .withColumn('projectDescription', lit('CRISPR Cas9 Target ID'))
    
    # This column is specific for genes, will be updated later:
    .withColumn('validationHypotheses', 
        struct(
            lit('MSI').alias('hypothesis'), 
            lit('This description will be provided by the validation lab').alias('description')
        )
    )

    # Joining cell line data:
    .join(validation_lab_cell_lines, on='name', how='left')
    
    # Drop unused columns:
    .drop(*['name', 'pass-fail', 'expected-to-pass', 'effect-size'])
    
    # Save data:
    .write.format('json').mode('overwrite').option('compression', 'gzip').save('validation_v3.json.gz')
#     .show(1, vertical=True, truncate=False)
)


In [55]:
%%bash


gzcat validation_v3.json.gz/*json.gz | head -n5 | tail -n1| jq



{
  "targetFromSource": "ARHGEF7",
  "resourceScore": 36.31,
  "confidence": "not significant",
  "expectedConfidence": "not significant",
  "statisticalTestTail": "upper tail",
  "contrast": "Loss of cell viability vs control",
  "studyOverview": "CellTitreGio measurement",
  "datasourceId": "ot_crispr_validation",
  "datatypeId": "ot_validation_lab",
  "diseaseFromSourceMappedId": "EFO_0005842",
  "projectId": "OTAR015",
  "projectDescription": "CRISPR Cas9 Target ID",
  "validationHypotheses": {
    "hypothesis": "MSI",
    "description": "This description will be provided by the validation lab"
  }
}


In [56]:
diseaseCellLines_model = (
    spark.read
    .option("multiline",True)
    .csv(cell_lines_annotation_file, header=True, sep=',', quote='"')
    .filter(col('model_name') == 'HCT-116')
    .show(1, vertical=True, truncate=False)
)

-RECORD 0------------------------------------------
 model_id                 | SIDM00783              
 model_name               | HCT-116                
 synonyms                 | null                   
 model_type               | Cell Line              
 growth_properties        | Adherent               
 doi                      | null                   
 pmed                     | null                   
 model_treatment          | null                   
 model_comments           | null                   
 msi_status               | MSI                    
 mutational_burden        | 70.63333333            
 ploidy                   | 1.97598793             
 parent_id                | null                   
 crispr_ko_data           | True                   
 sample_id                | SIDS00044              
 tissue                   | Large Intestine        
 tissue_status            | Tumour                 
 cancer_type              | Colorectal Carcinoma   
 cancer_type

In [74]:
@udf(StructType([
    StructField("name", StringType(), False),
    StructField("say", StringType(), False)
]))
def u(s):
    return None


(
    spark.createDataFrame([{'a': 'kutya', 'b': 123},{'a': 'bagoly', 'b': 13}])
    .withColumn('c', u(col('a')))
    .show()
)

+------+---+----+
|     a|  b|   c|
+------+---+----+
| kutya|123|null|
|bagoly| 13|null|
+------+---+----+

