See ticket for details: [#1826](https://github.com/opentargets/platform/issues/1826)


## Update cell annotation

There is a file in bucket: `gs://gs://otar000-evidence_input/CRISPR/data_files` with a very slim annotation around cell lines. This needs to be enriched with uberon codes and sanger cell line identifiers

| tissue | tissueId | uberonLabel |
|-----------------------------|----------------|---------------------------|
| Skin | UBERON_0002097 | skin of body |
| Prostate | UBERON_0002367 | prostate gland |
| Head and Neck | UBERON_0000033 | head |
| Head and Neck | UBERON_0000974 | neck |
| Bone | UBERON_0002481 | bone tissue |
| Biliary Tract | UBERON_0001173 | biliary tree |
| Haematopoietic and Lymphoid | UBERON_0002390 | hematopoietic system |
| Haematopoietic and Lymphoid | UBERON_0001744 | lymphoid tissue |
| Soft Tissue | UBERON_0002385 | muscle tissue |
| Soft Tissue | UBERON_0000043 | tendon |
| Soft Tissue | UBERON_0000211 | ligament |
| Soft Tissue | UBERON_0001013 | adipose tissue |
| Soft Tissue | UBERON_0011824 | fibrous connective tissue |
| Soft Tissue | UBERON_0002391 | lymph |
| Soft Tissue | UBERON_0001981 | blood vessel |
| Soft Tissue | UBERON_0008982 | fascia |

In [1]:
import requests
import pandas as pd

# | tissue                      |
# |:----------------------------|
# | Skin                        |
# | Prostate                    |
# | Head and Neck               |
# | Bone                        |
# | Biliary Tract               |
# | Haematopoietic and Lymphoid |
# | Soft Tissue                 |
UBERON_CUSTOM_CURATION = {
  "Skin": "UBERON_0002097",
  "Prostate": "UBERON_0002367",
  "Head and Neck": "UBERON_0000033",
  "Head and Neck": "UBERON_0000974",
  "Bone": "UBERON_0002481",
  "Biliary Tract": "UBERON_0001173",
  "Haematopoietic and Lymphoid": "UBERON_0002390",
  "Haematopoietic and Lymphoid": "UBERON_0001744",
  "Soft Tissue": "UBERON_0002385",
  "Soft Tissue": "UBERON_0000043",
  "Soft Tissue": "UBERON_0000211",
  "Soft Tissue": "UBERON_0001013",
  "Soft Tissue": "UBERON_0011824",
  "Soft Tissue": "UBERON_0002391",
  "Soft Tissue": "UBERON_0001981",
  "Soft Tissue": "UBERON_0008982"
}

def uberon_lookup(label: str)-> str:
    '''Retrieving uberon identifier of a label from OLS, assuming perfect match.'''
    
    if not label:
        return None

    label = label.lower()
    url = f'https://www.ebi.ac.uk/ols/api/search?q={label}&queryFields=label&ontology=uberon&exact=true'
    
    # Parsing:
    try:
        data = requests.get(url).json()
        uberon_id = data['response']['docs'][0]['short_form']
        return uberon_id
    except IndexError:
        return None
    except KeyError:
        return None
    except ConnectionError:
        return None
    
# Loading cell description from project score:
crispr_cell_description = (
    pd.read_csv('/Users/dsuveges/project_data/crispr_data/crispr_cell_lines.tsv', sep='\t')
    .rename(columns={
        'Name': 'name',
        'Tissue': 'tissue',
        'Cancer Type': 'diseaseFromSource'
    })
)

print(f'Number of cell lines in the crispr cell lines: {len(crispr_cell_description)}')
print(f'Number of cell lines with no tissue: {len(crispr_cell_description.loc[crispr_cell_description.tissue.isna()])}')


# Extract unique list of tissues and map to uberon:
annotated_tissues = (
    crispr_cell_description
    [['tissue']]
    .drop_duplicates()
    .assign(tissueId = lambda df: df.tissue.apply(uberon_lookup))
)

print(f'Number of unique tissues: {len(annotated_tissues)}')
print(f'Number of tissues with no uberon mapping: {len(annotated_tissues.loc[annotated_tissues.tissueId.isna()])}')


# Fetching cell model data from Sanger:
cell_models =(
    pd.read_csv('https://cog.sanger.ac.uk/cmp/download/model_list_20210719.csv')
    [['model_id', 'model_name']]
    .rename(columns={
        'model_id': 'id',
        'model_name': 'name'
    })
    .drop_duplicates()
)
print(f'Number of cell models: {len(cell_models)}')


# Finalising and saving data:
crispr_cell_description = (
    crispr_cell_description
    
    # Merging uberon annotation and cell line identifiers with cell lines:
    .merge(annotated_tissues, on='tissue', how='left')

    # Merging cell line identifiers with cell lines:
    .merge(cell_models, on='name', how='left')

)

print(f'Number of cell lines at the end: {len(crispr_cell_description)}')
print(f'Number of cell lines with no identifier: {len(crispr_cell_description.loc[crispr_cell_description.id.isna()])}')
print(f'Number of cell lines with no uberon: {len(crispr_cell_description.loc[crispr_cell_description.tissueId.isna()])}')

# Saving enriched data file:
(
    crispr_cell_description
    .to_csv('crispr_cell_lines_enriched_2021-10-22.tsv', sep='\t', index=False)
)

Number of cell lines in the crispr cell lines: 336
Number of cell lines with no tissue: 0
Number of unique tissues: 19
Number of tissues with no uberon mapping: 7
Number of cell models: 2007
Number of cell lines at the end: 336
Number of cell lines with no identifier: 0
Number of cell lines with no uberon: 69


In [4]:
print(crispr_cell_description.loc[crispr_cell_description.tissueId.isna()].tissue.drop_duplicates().to_markdown(index=False))


| tissue                      |
|:----------------------------|
| Skin                        |
| Prostate                    |
| Head and Neck               |
| Bone                        |
| Biliary Tract               |
| Haematopoietic and Lymphoid |
| Soft Tissue                 |


## Evidence generation:

Utilizing the above generated dataset, with pyspark, join and process the data and save evidence:

In [80]:
# import argparse
# import logging
# import sys
import requests

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, FloatType
from pyspark.sql.functions import col, udf, collect_set, split, element_at, struct, trim, lit

sparkConf = (
    SparkConf()
    .set('spark.driver.memory', '15g')
    .set('spark.executor.memory', '15g')
    .set('spark.driver.maxResultSize', '0')
    .set('spark.debug.maxToStringFields', '2000')
    .set('spark.sql.execution.arrow.maxRecordsPerBatch', '500000')
)
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .master('local[*]')
    .getOrCreate()
)

# input files: 
evid_file = '/Users/dsuveges/project_data/crispr_data/crispr_evidence.tsv'
desc_file = '/Users/dsuveges/project_data/crispr_data/crispr_descriptions.tsv'
cell_file = 'crispr_cell_lines_enriched_2021-10-22.tsv'

In [28]:
evidence

+--------+--------------------+---------------+--------------------+-------------+-------+
|    pmid|       gene_set_name|      target_id|          disease_id| disease_name|  score|
+--------+--------------------+---------------+--------------------+-------------+-------+
|30971826|Project Score: Pr...|ENSG00000110092|http://www.ebi.ac...|Ewing sarcoma|79.7813|
|30971826|Project Score: Pr...|ENSG00000130725|http://www.ebi.ac...|Ewing sarcoma|   58.5|
|30971826|Project Score: Pr...|ENSG00000111142|http://www.ebi.ac...|Ewing sarcoma|57.9375|
|30971826|Project Score: Pr...|ENSG00000152234|http://www.ebi.ac...|Ewing sarcoma| 54.125|
|30971826|Project Score: Pr...|ENSG00000165501|http://www.ebi.ac...|Ewing sarcoma|   52.5|
|30971826|Project Score: Pr...|ENSG00000143774|http://www.ebi.ac...|Ewing sarcoma| 52.375|
|30971826|Project Score: Pr...|ENSG00000147162|http://www.ebi.ac...|Ewing sarcoma| 52.375|
|30971826|Project Score: Pr...|ENSG00000109971|http://www.ebi.ac...|Ewing sarcoma|  52.25|

In [5]:
# cell_models =(
#     pd.read_csv('https://cog.sanger.ac.uk/cmp/download/model_list_20210719.csv')
#     .rename(columns={
#         'model_id': 'id',
#         'model_name': 'name'
#     })
#     .drop_duplicates()
# )

cell_models.loc[cell_models.]

Unnamed: 0,id,name,synonyms,model_type,growth_properties,doi,pmed,model_treatment,model_comments,msi_status,...,smoking_status,model_relations_comment,COSMIC_ID,BROAD_ID,CCLE_ID,RRID,HCMI,suppliers,supplier,cat_number
0,SIDM01274,HCM-SANG-0314-C15,OESO_167,Organoid,,,,,,MSS,...,Ex-Smoker,,,,,,,,,
1,SIDM01276,HCM-SANG-0272-C20,COLO_025,Organoid,Unknown,,,,,MSS,...,Ex-Smoker,,,,,,,,,
2,SIDM01280,HCM-SANG-0310-C15,OESO_148,Organoid,Unknown,,,,,MSS,...,Ex-Smoker,,,,,,,,,
3,SIDM00065,HLF,,Cell Line,Unknown,,,,,,...,Unknown,HLE and HLF were derived from the same patient...,998181,ACH-000393,HLF_LIVER,CVCL_2947,,JCRB:JCRB0405,JCRB,JCRB0405
4,SIDM00066,TE-7,,Cell Line,Unknown,,,,,,...,Unknown,"TE-2, TE-3, TE-7, TE-12 & TE-13 share common a...",,,,CVCL_9972,,unknown:unknown,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002,SIDM01953,HCM-SANG-0519-C20,COLO_214,Organoid,Unknown,,,Unknown,,,...,Unknown,,,,,,,,,
2003,SIDM01955,HCM-SANG-0534-C18,COLO_163,Organoid,Unknown,,,Unknown,,,...,Unknown,,,,,,,,,
2004,SIDM01947,HCM-SANG-0547-C25,PANC_050,Organoid,Unknown,,,Unknown,,,...,Unknown,,,,,,,,,
2005,SIDM01952,HCM-SANG-0548-C25,PANC_052,Organoid,Unknown,,,Unknown,,,...,Unknown,,,,,,,,,


In [30]:
(
    spark.read.csv(cell_file, sep='\t', header=True)
    .show()
    
)

+--------+--------------------+--------------------+--------------+---------+
|    name|              tissue|   diseaseFromSource|      tissueId|       id|
+--------+--------------------+--------------------+--------------+---------+
|    A375|                Skin|            Melanoma|          null|SIDM00795|
|  HCT-15|     Large Intestine|Colorectal Carcinoma|UBERON_0000059|SIDM00789|
|   HT-29|     Large Intestine|Colorectal Carcinoma|UBERON_0000059|SIDM00136|
|  HCC-78|                Lung| Lung Adenocarcinoma|UBERON_0002048|SIDM01068|
|   SW620|     Large Intestine|Colorectal Carcinoma|UBERON_0000059|SIDM00841|
|   22RV1|            Prostate|  Prostate Carcinoma|          null|SIDM00499|
|23132-87|             Stomach|   Gastric Carcinoma|UBERON_0000945|SIDM00980|
|   769-P|              Kidney|    Kidney Carcinoma|UBERON_0002113|SIDM00803|
|   A2058|                Skin|            Melanoma|          null|SIDM00797|
|    A253|       Head and Neck|Head and Neck Car...|          nu

In [49]:
# Reading datasets:
evidence_df = (
    spark.read.csv(evid_file, sep='\t', header=True)
    .drop('pmid', 'gene_set_name', 'disease_name')
)
cell_lines_df = spark.read.csv(cell_file, sep='\t', header=True)
description_df = spark.read.csv(desc_file, sep='\t', header=True)

# Combining description and cell lines based on tissue:
tissue_desc = (
    description_df
    .withColumnRenamed('tissue_or_cancer_type', 'tissue')
    .join(cell_lines_df, on='tissue', how='inner')
)

# Combining description and cells based on disease
cell_desc = (
    description_df
    .withColumnRenamed('tissue_or_cancer_type', 'diseaseFromSource')
    .join(cell_lines_df, on='diseaseFromSource', how='inner')
)

# Concatenating dataframes:
merged_annotation = (
    cell_desc
    .union(tissue_desc)
    
    # Aggregating by diease and method:
    .groupBy('diseaseFromSource', 'efo_id', 'method')
    
    # The aggregated cell object is a list of struct:
    .agg(collect_set(struct(
        col('name'), col('id'), col('tissue'), col('tissueId')
    )).alias('diseaseCellLines'))
    .drop('method')
)

In [37]:
(
    cell_desc
    .union(tissue_desc)
    .show()
)

+--------------------+--------------------+--------------------+--------+--------------------+--------------+---------+
|   diseaseFromSource|              efo_id|              method|    name|              tissue|      tissueId|       id|
+--------------------+--------------------+--------------------+--------+--------------------+--------------+---------+
|Colorectal Carcinoma|http://www.ebi.ac...|147 targets were ...|  HCT-15|     Large Intestine|UBERON_0000059|SIDM00789|
|Colorectal Carcinoma|http://www.ebi.ac...|147 targets were ...|   HT-29|     Large Intestine|UBERON_0000059|SIDM00136|
| Lung Adenocarcinoma|http://www.ebi.ac...|85 targets were p...|  HCC-78|                Lung|UBERON_0002048|SIDM01068|
|Colorectal Carcinoma|http://www.ebi.ac...|147 targets were ...|   SW620|     Large Intestine|UBERON_0000059|SIDM00841|
|   Gastric Carcinoma|http://www.ebi.ac...|168 targets were ...|23132-87|             Stomach|UBERON_0000945|SIDM00980|
|Head and Neck Car...|http://purl.oboli.

In [81]:
CRISPR_SYMBOL_MAPPING = {
    'CASC5': 'ENSG00000137812',
    'CIRH1A': 'ENSG00000141076',
    'EFTUD1': 'ENSG00000140598',
    'ENSG00000163660': 'ENSG00000163660',
    'KIAA0947': 'ENSG00000164151',
    'KIAA1432': 'ENSG00000107036',
    'NDNL2': 'ENSG00000185115',
    'SRPR': 'ENSG00000182934',
    'ZNF259': 'ENSG00000109917'
}


pooled_evidence_df = (
    evidence_df
    .select(
        col('target_id').alias('targetFromSourceId'), 
        col('disease_id').alias('efo_id'),
        col('score').alias('resourceScore').cast(FloatType()),
    )
    
    # Some of the target identifier are not Ensembl Gene id - replace them:
    .replace(to_replace=CRISPR_SYMBOL_MAPPING, subset=['target_id'])
    
    # Merging with descriptions:
    .join(merged_annotation, on='efo_id', how='outer')
    
    # From EFO uri, generate EFO id:
    .withColumn('diseaseFromSourceMappedId', element_at(split(col('efo_id'), '/'), -1).alias('diseaseFromSourceMappedId'))
    .drop('efo_id')
    
    # Adding constants:
    .withColumn('datasourceId', lit('crispr'))
    .withColumn('datatypeId', lit('affected_pathway'))
    .persist()
)

pooled_evidence_df.show()

+------------------+-------------+-----------------+--------------------+-------------------------+------------+----------------+
|targetFromSourceId|resourceScore|diseaseFromSource|    diseaseCellLines|diseaseFromSourceMappedId|datasourceId|      datatypeId|
+------------------+-------------+-----------------+--------------------+-------------------------+------------+----------------+
|   ENSG00000121879|        73.75| Breast Carcinoma|[{OCUB-M, SIDM002...|              EFO_0000305|      crispr|affected_pathway|
|   ENSG00000141736|        71.25| Breast Carcinoma|[{OCUB-M, SIDM002...|              EFO_0000305|      crispr|affected_pathway|
|   ENSG00000065361|         70.5| Breast Carcinoma|[{OCUB-M, SIDM002...|              EFO_0000305|      crispr|affected_pathway|
|   ENSG00000129514|      64.8333| Breast Carcinoma|[{OCUB-M, SIDM002...|              EFO_0000305|      crispr|affected_pathway|
|   ENSG00000107485|      62.8125| Breast Carcinoma|[{OCUB-M, SIDM002...|              EFO

In [82]:
(
    pooled_evidence_df
    .coalesce(1)
    .write.format('json').mode('overwrite').option('compression', 'gzip').save('crispr-2021-10-22')
)

## Comparing old vs new evidence

In [83]:
evidence_files = {
    'old (21.09)' : '/Users/dsuveges/project_data/crispr_data/crispr-2021-09-07.json.gz',
    'new (21.11)' : 'crispr-2021-10-22'
}

collected_data = dict()

for name, file in evidence_files.items():

    # load data into spark:
    df = spark.read.json(file).persist()
    
    print(df.printSchema())
    collected_data[name] = {
        'file': file.split('/')[-1],
        'evidence': df.count(),
        'target' : df.select('targetFromSourceId').distinct().count(),
        'disease' : df.select('diseaseFromSourceMappedId').distinct().count(),
        'association': df.select('diseaseFromSourceMappedId', 'targetFromSourceId').distinct().count()
    }
    
print(pd.DataFrame(collected_data).to_markdown())

root
 |-- datasourceId: string (nullable = true)
 |-- datatypeId: string (nullable = true)
 |-- diseaseCellLines: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- diseaseFromSource: string (nullable = true)
 |-- diseaseFromSourceMappedId: string (nullable = true)
 |-- resourceScore: double (nullable = true)
 |-- targetFromSourceId: string (nullable = true)

None
root
 |-- datasourceId: string (nullable = true)
 |-- datatypeId: string (nullable = true)
 |-- diseaseCellLines: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- tissue: string (nullable = true)
 |    |    |-- tissueId: string (nullable = true)
 |-- diseaseFromSource: string (nullable = true)
 |-- diseaseFromSourceMappedId: string (nullable = true)
 |-- resourceScore: double (nullable = true)
 |-- targetFromSourceId: string (nullable = true)

None
|             | old (21.09

In [79]:
%%bash

gzcat crispr-2021-10-22/*gz | head -n1 | jq

{
  "targetFromSourceId": "ENSG00000121879",
  "resourceScore": "73.75",
  "diseaseFromSource": "Breast Carcinoma",
  "diseaseCellLines": [
    {
      "name": "OCUB-M",
      "id": "SIDM00241",
      "tissue": "Breast",
      "tissueId": "UBERON_0000310"
    },
    {
      "name": "HCC1395",
      "id": "SIDM00884",
      "tissue": "Breast",
      "tissueId": "UBERON_0000310"
    },
    {
      "name": "HCC1143",
      "id": "SIDM00866",
      "tissue": "Breast",
      "tissueId": "UBERON_0000310"
    },
    {
      "name": "MDA-MB-468",
      "id": "SIDM00628",
      "tissue": "Breast",
      "tissueId": "UBERON_0000310"
    },
    {
      "name": "COLO-824",
      "id": "SIDM00954",
      "tissue": "Breast",
      "tissueId": "UBERON_0000310"
    },
    {
      "name": "MDA-MB-436",
      "id": "SIDM00629",
      "tissue": "Breast",
      "tissueId": "UBERON_0000310"
    },
    {
      "name": "MDA-MB-415",
      "id": "SIDM00630",
      "tissue": "Breast",
      "tissueId": "UBERON

In [94]:
import pandas as pd
lista = [['BP', 12], ['DE', 11], ['BP', 16], ['DE', 14], ['BP', 26]]
(
    pd.DataFrame(lista, columns= ['varos', 'homerseklet'])
    .groupby('varos')
    .agg(list)
)


Unnamed: 0_level_0,homerseklet
varos,Unnamed: 1_level_1
BP,"[12, 16, 26]"
DE,"[11, 14]"
