# Valiadation lab data

This is in the readme:

```
Description for the first set of Open Targets Validation Lab Oncology screen results.

This release contains only Cell Titre Glow (CTG) results for Colorectal cancer (CO) target genes.

The file CTG_CO_Partner-Preview-Matrix_v6a.txt is a tab delimited file with rows corresponding to targeted genes and the following columns:

gene, cell-line, effect-size, pass-fail, expected-to-pass

pass-fail is based on the effect-size score with, in this case a threshold of >= 38 being a pass. Effect-size may be used to order the entries as a proxy for "significance", but the pass-fail column should be used to determine Status (in future releases, pass-fail may not be determine solely using the effect size).

Expected-to-pass corresponds to whether or not this gene & cell-line combination was expected to validate based on the original selection criteria.

Bio-marker/meta-data annotation can be found in the file COlines.txt (the CO prefix indicating these are the annotations for Colorectal cell lines). Rows in this file correspond to cell-lines and each subsequent column corresponds to a biomarker/meta-data annotation which we would like to be presented (somehow!) for each of the rows in the CTG_CO_Partner-Preview-Matrix_v6a.txt file.
```

In [84]:
import pandas as pd

colo_cells = pd.read_csv('/Users/dsuveges/project_data/validation_lab/COlines.txt', sep='\t')

colo_cells

Unnamed: 0,CO_line,MS_status,CRIS_subtype,KRAS_status,TP53_status,APC_status
0,SW626,MSS,?,mut,mut,mut
1,HT29,MSS,B,wt,mut,mut
2,SW837,MSS,B,mut,mut,mut
3,MDST8,MSS,D,wt,wt,mut
4,HCT116,MSI,D,mut,wt,wt
5,KM12,MSI,A,wt,mut,wt
6,RKO,MSI,?,wt,wt,wt
7,LS180,MSI,A,mut,wt,mut


In [9]:
result = pd.read_csv('/Users/dsuveges/project_data/validation_lab/CTG_CO_Partner-Preview-Matrix_v6a.txt', sep='\t')
result.head()

Unnamed: 0,gene,cell-line,effect-size,pass-fail,expected-to-pass
0,ARHGEF7,SW626,59.9,True,True
1,ARHGEF7,HT29,1.79,False,False
2,ARHGEF7,SW837,17.31,False,False
3,ARHGEF7,MDST8,-6.49,False,True
4,ARHGEF7,HCT116,36.31,False,False


In [23]:
print(f'Unique genes: {len(result.gene.unique())}')
print(f'Lengther of results dataset: {len(result)}')
print(f"Number of experiments reaching significance: {len(result.loc[result['effect-size'] >= 38])}")
print(f"Number of genes reaching significance: {len(result.loc[result['effect-size'] >= 38].gene.unique())}")
print('Gene with no replication: ' + result.loc[result['effect-size'] < 38].gene.value_counts().loc[lambda x: x == 8].index[0])

Unique genes: 25
Lengther of results dataset: 200
Number of experiments reaching significance: 95
Number of genes reaching significance: 24
Gene with no replication: HSP90B1


In [30]:
# Using exclusively pyspark:
import pandas as pd
import json
import requests

from pyspark.conf import SparkConf
from pyspark.sql.types import ArrayType, StringType, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, lit, when, udf, array

sparkConf = (
    SparkConf()
    .set('spark.driver.memory', '15g')
    .set('spark.executor.memory', '15g')
    .set('spark.driver.maxResultSize', '0')
    .set('spark.debug.maxToStringFields', '2000')
    .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
)
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .master('local[*]')
    .getOrCreate()
)


cell_lines_file = '/Users/dsuveges/project_data/validation_lab/COlines.txt'
cell_lines_annotation_file = '/Users/dsuveges/project_data/validation_lab/model_list_20211124.csv'
cell_cancer_driver_mutation = '/Users/dsuveges/project_data/validation_lab/mutations_summary_20211124.csv'
validation_file = '/Users/dsuveges/project_data/validation_lab/CTG_CO_Partner-Preview-Matrix_v6a.txt'

In [26]:
annot = (
    spark.read
    .option("multiline",True)
    .csv(cell_lines_annotation_file, header=True, sep=',', quote='"')
    .persist()
)

print(annot.columns)
annot.show(1, vertical=True)

['model_id', 'model_name', 'synonyms', 'model_type', 'growth_properties', 'doi', 'pmed', 'model_treatment', 'model_comments', 'msi_status', 'mutational_burden', 'ploidy', 'parent_id', 'crispr_ko_data', 'sample_id', 'tissue', 'tissue_status', 'cancer_type', 'cancer_type_detail', 'cancer_type_ncit_id', 'age_at_sampling', 'sampling_day', 'sampling_month', 'sampling_year', 'sample_treatment', 'sample_treatment_details', 'sample_site', 'tnm_t', 'tnm_n', 'tnm_m', 'tnm_integrated', 'tumour_grade', 'patient_id', 'species', 'gender', 'ethnicity', 'smoking_status', 'model_relations_comment', 'COSMIC_ID', 'BROAD_ID', 'CCLE_ID', 'RRID', 'HCMI', 'suppliers', 'supplier', 'cat_number']
-RECORD 0----------------------------------------
 model_id                 | SIDM01274            
 model_name               | HCM-SANG-0314-C15    
 synonyms                 | OESO_167             
 model_type               | Organoid             
 growth_properties        | NA                   
 doi                

In [27]:
# diseaseCellLines_model = (
#     annot
#     .withColumn(
#         'diseaseCellLines',
#         struct(
#             col('model_name').alias('name'), 
#             col('model_id').alias('id'), 
#             col('tissue')
#         )
#     )
#     .select(
#         col('model_name').alias('cellLine'),
#         'diseaseCellLines')
#     .persist()
# )

diseaseCellLines_model = (
    annot
    .select(
        col('model_name').alias('name'), 
        col('model_id').alias('id'), 
        col('tissue')
    )
)


diseaseCellLines_model.show()

+-----------------+---------+--------------------+
|             name|       id|              tissue|
+-----------------+---------+--------------------+
|HCM-SANG-0314-C15|SIDM01274|           Esophagus|
|HCM-SANG-0272-C20|SIDM01276|     Large Intestine|
|HCM-SANG-0310-C15|SIDM01280|           Esophagus|
|              HLF|SIDM00065|               Liver|
|             TE-7|SIDM00066|           Esophagus|
|HCM-SANG-0278-C20|SIDM01282|     Large Intestine|
|HCM-SANG-0277-C18|SIDM01284|     Large Intestine|
|            TE-13|SIDM00076|           Esophagus|
|      NCI-ADR-RES|SIDM00089|               Ovary|
|        ONCO-DG-1|SIDM00093|               Ovary|
|         NCI-H125|SIDM00106|                Lung|
|            SNB19|SIDM00109|Central Nervous S...|
|            B2-17|SIDM00110|Central Nervous S...|
|HCM-SANG-0276-C18|SIDM01278|     Large Intestine|
|         COLO-320|SIDM00071|     Large Intestine|
|             DiFi|SIDM00049|     Large Intestine|
|            HCE-4|SIDM00052|  

## Aggregating data:

In [44]:
validation_lab_cell_lines = (
    # Reading cell metadata from validation lab:
    spark.read.csv(cell_lines_file, sep='\t', header=True)
    
    # Renaming columns:
    .withColumnRenamed('CO_line', 'name')
    .withColumnRenamed('MS_status', 'microsatelliteStabilityStatus')
    
    # Updating some of the cell lines' name:
    .withColumn('name',
         when(col('name') == 'HT29', 'HT-29')
        .when(col('name') == 'HCT116', 'HCT-116')
        .when(col('name') == 'LS180', 'LS-180')
        .otherwise(col('name'))
    )
    
    # Updating some of the subtype:
    .withColumn('diseaseSubtype',
        when(col('CRIS_subtype') == '?', None)
        .otherwise(col('CRIS_subtype'))
    )
    
    # Joining dataset with cell model data read downloaded from Sanger website:
    .join(diseaseCellLines_model, on='name', how='left')
    
    # Adding UBERON code to tissues (it's constant colon)
    .withColumn('tissueID', lit('UBERON_0000059'))
    
    # generating disease cell lines object:
    .withColumn(
        'diseaseCellLines',
        array(struct(col('name'), col('id'), col('tissue'), col('tissueId')))
    )
    .drop(*['id', 'tissue', 'tissueId', 'CRIS_subtype'])
    .persist()
)


validation_lab_cell_lines.show()

+-------+-----------------------------+-----------+-----------+----------+--------------+--------------------+
|   name|microsatelliteStabilityStatus|KRAS_status|TP53_status|APC_status|diseaseSubtype|    diseaseCellLines|
+-------+-----------------------------+-----------+-----------+----------+--------------+--------------------+
|  SW626|                          MSS|        mut|        mut|       mut|          null|[{SW626, SIDM0116...|
|  HT-29|                          MSS|         wt|        mut|       mut|             B|[{HT-29, SIDM0013...|
|  SW837|                          MSS|        mut|        mut|       mut|             B|[{SW837, SIDM0083...|
|  MDST8|                          MSS|         wt|         wt|       mut|             D|[{MDST8, SIDM0052...|
|HCT-116|                          MSI|        mut|         wt|        wt|             D|[{HCT-116, SIDM00...|
|   KM12|                          MSI|         wt|        mut|        wt|             A|[{KM12, SIDM00150...|
|

In [63]:
consequences_map = {
    'ess_splice': 'SO_0001629',
    'frameshift': 'SO_0000865',
    'missense': 'SO_0001583',
    'inframe': 'SO_0001817',
    'stop_lost': 'SO_0001578',
    'nonsense': 'SO:0001587'
}

map_udf = udf(lambda x: consequences_map.get(x), StringType())

generate_name_udf = udf(lambda gene, variant: f'{gene}:{variant.split(".")[1]}' if variant else None, StringType())


# Parsing cancer driver mutations:
cancer_drivers = (
    spark
    .read.csv(cell_cancer_driver_mutation, sep=',', header=True)
    .select(
        col('model_name'),
        col('gene_symbol'), col('protein_mutation'), col('effect')
    )
    
    # Filtering for relevant genes:
    .filter(
        col('gene_symbol').isNotNull() &
        col('gene_symbol').isin(['KRAS','TP53', 'APC'])
    )
    
    # Map functional consequences to SO terms:
    .withColumn('functionalConsequenceId', map_udf(col('effect')))
    
    # Generate mutaion names:
    .withColumn(
        'variant', 
        struct(
            generate_name_udf(col('gene_symbol'), col('protein_mutation')).alias('name'),
            col('functionalConsequenceId')
        ))
    
    # Grouping by model name and gene_symbol:
    .groupby(['model_name'])
    .agg(
        collect_set(col('variant')).alias('variants')
    )
    .withColumnRenamed('model_name', 'name')
)

In [65]:
validation_lab_cell_lines = (
    validation_lab_cell_lines
    
    # Joining cancer driver mutations with cell lines:
    .join(cancer_drivers, on='name', how='left')
    
    # Dropping unused columns:
    .drop(*['KRAS_status', 'TP53_status', 'APC_status'])
)

+-------+-----------------------------+--------------+--------------------+--------------------+
|   name|microsatelliteStabilityStatus|diseaseSubtype|    diseaseCellLines|            variants|
+-------+-----------------------------+--------------+--------------------+--------------------+
|  SW626|                          MSS|          null|[{SW626, SIDM0116...|[{TP53:G262V, SO_...|
|  HT-29|                          MSS|             B|[{HT-29, SIDM0013...|[{TP53:R273H, SO_...|
|  SW837|                          MSS|             B|[{SW837, SIDM0083...|[{APC:R213*, SO:0...|
|  MDST8|                          MSS|             D|[{MDST8, SIDM0052...|[{APC:T1556fs*3, ...|
|HCT-116|                          MSI|             D|[{HCT-116, SIDM00...|[{KRAS:G13D, SO_0...|
|   KM12|                          MSI|             A|[{KM12, SIDM00150...|[{TP53:H179R, SO_...|
|    RKO|                          MSI|          null|[{RKO, SIDM01090,...|                null|
| LS-180|                     

In [77]:
evidence = (
    spark.read.csv(validation_file, sep='\t', header=True)
    .withColumnRenamed('gene', 'targetFromSource')
    .withColumnRenamed('cell-line', 'name')
    .withColumnRenamed('effect-size', 'resourceScore')
    .withColumn(
        'confidence',
        when(col('resourceScore') >= 38, lit('Passed validation'))
        .otherwise(lit('Failed validation'))
    )
    .join(validation_lab_cell_lines, on='name', how='left')
    .drop(*['name', 'pass-fail', 'expected-to-pass'])
)

In [81]:
evidence.coalesce(1).write.format('json').mode('overwrite').option('compression', 'gzip').save('validation.json')

In [83]:
%%bash

gzcat validation.json/*gz | jq


{
  "targetFromSource": "ARHGEF7",
  "resourceScore": "6.95",
  "confidence": "Failed validation",
  "microsatelliteStabilityStatus": "MSI",
  "diseaseSubtype": "A",
  "diseaseCellLines": [
    {
      "name": "KM12",
      "id": "SIDM00150",
      "tissue": "Large Intestine",
      "tissueId": "UBERON_0000059"
    }
  ],
  "variants": [
    {
      "name": "TP53:H179R",
      "functionalConsequenceId": "SO_0001583"
    },
    {
      "name": "APC:N1818fs*2",
      "functionalConsequenceId": "SO_0000865"
    },
    {
      "name": "TP53:V73fs*50",
      "functionalConsequenceId": "SO_0000865"
    }
  ]
}
{
  "targetFromSource": "BRCA2",
  "resourceScore": "59.68",
  "confidence": "Passed validation",
  "microsatelliteStabilityStatus": "MSI",
  "diseaseSubtype": "A",
  "diseaseCellLines": [
    {
      "name": "KM12",
      "id": "SIDM00150",
      "tissue": "Large Intestine",
      "tissueId": "UBERON_0000059"
    }
  ],
  "variants": [
    {
      "name": "TP53:H179R",
      "function

```json
{
  "targetFromSource": "UBE2C",
  "resourceScore": 4.02,
  "confidence": "Failed validation",
  "microsatelliteStabilityStatus": "MSI",
  "diseaseSubtype": "A",
  "diseaseCellLines": [
    {
      "name": "KM12",
      "id": "SIDM00150",
      "tissue": "Large Intestine",
      "tissueId": "UBERON_0000059"
    }
  ],
  "biomarkers": [
    {
      "name": "TP53mut",
      "variants": [
        {
          "name": "TP53:H179R",
          "functionalConsequenceId": "SO_0001583"
        },
        {
          "name": "TP53:V73fs*50",
          "functionalConsequenceId": "SO_0000865"
        }
      ]
    },
    {
      "name": "APCwt",
      "variants": [
        {
          "name": "APC:N1818fs*2",
          "functionalConsequenceId": "SO_0000865"
        }
      ]
    },
    {
      "name": "KRASwt"
    }
  ]
}
```