# Writing script

The previously designed steps need to be captured in a single standalone script.

**Tasks:**
1. Identifying source dataset from the `~/project_data/encore/ENCORE_FILES_DEC_2021`
2. Identifying steps to process
3. Write up annotation of input files in a `config.json` file
4. Break down task into functions.


### Files to be used

Suggestions extracted from older documentation:
```
Use EXACT matches not the MISMATCH1 result.
Use _ALL report files as not all data got averaged in the _AVERAGE dataset! Analysis is still going.
```

Filtering GEMINI dataset:
`we should not use FDR for Gemini calls and apply a p-value threshold of p <= 0.05`

### Comments on data processing

By default we are using all the cell lines in the datasets. There's a problem though: some cell lines are represented by multiple replicates. How should these cases be handled?

* New field: experiment id containing the cell and replicate id. 
* When pairing lfc data with gemini/bliss data, these need to be tracked.

### Updates

Any updates need to be noted under [#1902](https://github.com/opentargets/platform/issues/1902), as the backend team needs to be notified as soon as possible.


In [228]:
import json

# Parameters driving the evidence generation: shared filtering cutoffs plus one
# entry per ENCORE dataset with its disease annotation and input files.
parameters = {
    "sharedMetadata": {
        # Log fold change filtering: p <= 0.05 and FDR <= 0.25 
        "logFoldChangeCutoffPVal": 0.05,
        "logFoldChangeCutoffFDR": 0.25,

        # Genetic interaction filtering: p <= 0.05 (also used for bliss)
        "interactionCutoffPVal": 0.05
        
    },
    # File paths are relative to the input folder; None is used where no file
    # is listed for a dataset.
    "experiments": [
        {
            "dataset": "COLO1",
            "diseaseFromSourceMappedId": "EFO_1001951",
            "diseaseFromSource": "colorectal carcinoma",
            "logFoldChangeFile": "ANALYSIS/GENERAL_STATS/ENCORE_COLO1/SCALED_EXACT/RUNMERGED_SAMPLES/AVERAGE_CELL_LINE/All.gene.stats.annotated.txt",
            "geminiFile": "ANALYSIS/GEMINI_STATS/ENCORE_COLO1_FINAL/EXACT_SCALED/STRONG_ALL.annotated.txt",
            "blissFile": "ANALYSIS/BLISS_STATS/ENCORE_COLO1_FINAL/COLO1_EXACT_SCALED_ZSCORE.txt"
        },
        {
            "dataset": "COLO2",
            "diseaseFromSourceMappedId": "EFO_1001951",
            "diseaseFromSource": "colorectal carcinoma",
            "logFoldChangeFile": "ANALYSIS/GENERAL_STATS/ENCORE_COLO2/SCALED_EXACT/RUNMERGED_SAMPLES/AVERAGE_CELL_LINE/All.gene.stats.annotated.txt",
            "geminiFile": None,
            "blissFile": None
        },
        {
            "dataset": "COLO3",
            "diseaseFromSourceMappedId": "EFO_1001951",
            "diseaseFromSource": "colorectal carcinoma",
            "logFoldChangeFile": "ANALYSIS/GENERAL_STATS/ENCORE_COLO3/SCALED_EXACT/RUNMERGED_SAMPLES/AVERAGE_CELL_LINE/All.gene.stats.annotated.txt",
            "geminiFile": None,
            "blissFile": None
        },
        {
            "dataset": "BRCA1",
            "diseaseFromSourceMappedId": "EFO_0000305",
            "diseaseFromSource": "breast carcinoma",
            "logFoldChangeFile": "ANALYSIS/GENERAL_STATS/ENCORE_BRCA1/SCALED_EXACT/RUNMERGED_SAMPLES/AVERAGE_CELL_LINE/All.gene.stats.annotated.txt",
            "geminiFile": "ANALYSIS/GEMINI_STATS/ENCORE_BRCA1_FINAL/EXACT_SCALED/STRONG_ALL.annotated.txt",
            "blissFile": "ANALYSIS/BLISS_STATS/ENCORE_BRCA1_FINAL/BRCA1_EXACT_SCALED_ZSCORE.txt"
        },
        {
            "dataset": "BRCA2",
            "diseaseFromSourceMappedId": "EFO_0000305",
            "diseaseFromSource": "breast carcinoma",
            "logFoldChangeFile": None,
            "geminiFile": None,
            "blissFile": None
        },
        {
            "dataset": "BRCA3",
            "diseaseFromSourceMappedId": "EFO_0000305",
            "diseaseFromSource": "breast carcinoma",
            "logFoldChangeFile": None,
            "geminiFile": None,
            "blissFile": None
        },
    ]
}


# Exporting parameter set (None values are written as JSON null):
with open('encore_parameters.json', 'wt') as f:
    json.dump(parameters, f, indent=4)
    
    
# input is read from this folder:
input_folder = '/Users/dsuveges/project_data/encore/ENCORE_FILES_DEC_2021'

In [2]:
import requests
from functools import reduce
import pandas as pd
from pyspark.sql.functions import (
    col, udf, struct, lit, split, expr, collect_set, struct, 
    regexp_replace, min as pyspark_min, explode,
    array_contains, count, first, element_at
)
from pyspark.sql.types import FloatType, ArrayType, StructType, StructField, BooleanType, StringType
from pyspark.sql import SparkSession
from collections import defaultdict

# Start (or reuse) a Spark session with the 'local[*]' master:
spark = SparkSession.builder.master('local[*]').getOrCreate()

def lookup_uberon(tissue_label: str) -> str:
    """Look up the UBERON identifier of a tissue label via the EBI OLS search API.

    The label is lower-cased and searched as an exact match against the
    `label` field of the `uberon` ontology.

    Args:
        tissue_label: tissue name to look up. Missing values (None/NaN) and
            blank strings are tolerated.
    Returns:
        The `short_form` of the first hit, or None when the label is missing
        or no term is found.
    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    # Cell lines without tissue annotation reach this function as None/NaN:
    if not isinstance(tissue_label, str) or not tissue_label.strip():
        return None

    response = requests.get(
        'https://www.ebi.ac.uk/ols/api/search',
        # Passing the query as parameters lets requests URL-encode the label
        # (labels may contain spaces or other reserved characters):
        params={
            'q': tissue_label.lower(),
            'queryFields': 'label',
            'ontology': 'uberon',
            'exact': 'true',
        },
        # Without a timeout a stalled connection would block the run forever:
        timeout=30,
    )
    # Fail with a clear HTTP error rather than a confusing JSON parsing error:
    response.raise_for_status()
    result = response.json()['response']

    if result['numFound'] == 0:
        return None
    return result['docs'][0]['short_form']

def generate_diseaseCellLines(cellPassPortFile):
    """Build the cell line annotation table from the Sanger cell line file.

    Args:
        cellPassPortFile: path of the comma separated cell line annotation file.
    Returns:
        Persisted Spark dataframe with the columns `name` (dashes removed),
        `id`, `biomarkerList` and the `diseaseCellLines` struct
        (tissue, name, id, tissueId).
    """
    # Loading cell line annotation data from Sanger; multiline parsing is enabled:
    passport_reader = spark.read.option("multiline", True)
    raw_cells = passport_reader.csv(cellPassPortFile, header=True, sep=',', quote='"')
    cell_df = (
        raw_cells
        .withColumn('biomarkerList', parse_msi_status(col('msi_status')))
        .select(
            col('model_name').alias('name'),
            col('model_id').alias('id'),
            col('tissue'),
            col('biomarkerList'),
        )
        .persist()
    )

    # Map tissue labels to tissue identifiers, one lookup per distinct label;
    # the lookup runs on the driver through pandas:
    tissue_labels = cell_df.select('tissue').distinct().toPandas()
    tissue_labels = tissue_labels.assign(
        tissueId=tissue_labels.tissue.apply(lookup_uberon)
    )
    tissues = spark.createDataFrame(tissue_labels).persist()

    # Joining the tissue identifiers with the cell lines:
    annotated_cells = cell_df.join(tissues, on='tissue', how='left')

    # Generating the diseaseCellLines object. It is built before the name
    # clean-up below, so the struct keeps the original cell line name:
    with_cell_line_struct = annotated_cells.select(
        'name', 'id', 'biomarkerList',
        struct(['tissue', 'name', 'id', 'tissueId']).alias('diseaseCellLines'),
    )

    # Cleaning up the top level cell line name from dashes:
    cleaned_name = regexp_replace(col('name'), '-', '')
    return with_cell_line_struct.withColumn('name', cleaned_name).persist()

@udf(
    ArrayType(
        StructType([
            StructField('name', StringType()),
            StructField('description', StringType())
        ])
    )
)
def parse_msi_status(status: str) -> list:
    """Translate a microsatellite status code into a biomarker list.

    Args:
        status: status code; only 'MSI' and 'MSS' are recognised.
    Returns:
        A single-element list holding the biomarker name and description,
        or None for any other value (including a missing status).
    """
    biomarkers = {
        'MSI': {"name": "MSI", "description": "Microsatellite instable"},
        'MSS': {"name": "MSS", "description": "Microsatellite stable"},
    }
    biomarker = biomarkers.get(status)
    return None if biomarker is None else [biomarker]
    


@udf(BooleanType())
def is_significant(lfc_pval, lfc_fdr, gemini_pval):
    """Tell whether a row passes all three significance cutoffs.

    The cutoffs are read from the module level variables
    `logFoldChangeCutoffFDR`, `logFoldChangeCutoffPVal` and `geminiCutoffPVal`,
    which must be defined before the UDF is evaluated.

    Args:
        lfc_pval: p-value of the log fold change.
        lfc_fdr: FDR of the log fold change.
        gemini_pval: p-value of the gemini interaction call.
    Returns:
        True if every value is at or below its cutoff, False otherwise.
        Rows with a missing statistic are reported as not significant.
    """
    # Nulls arrive as None; comparing None with a number raises TypeError:
    if lfc_pval is None or lfc_fdr is None or gemini_pval is None:
        return False

    return bool(
        lfc_fdr <= logFoldChangeCutoffFDR
        and lfc_pval <= logFoldChangeCutoffPVal
        and gemini_pval <= geminiCutoffPVal
    )


@udf(ArrayType(StructType([
    StructField("targetFromSourceId", StringType(), False),
    StructField("targetRole", StringType(), False),
    StructField("interactingTargetFromSourceId", StringType(), False),
    StructField("interactingTargetRole", StringType(), False),
])))
def parse_targets(gene_pair: str, gene_role: str) -> list:
    """Expand a gene pair into two target/interacting-target records.

    Args:
        gene_pair: two gene names joined by '~'.
        gene_role: role label of the first gene; 'Combinations' is removed
            and the rest is lower-cased. The second gene always gets the
            'anchor' role.
    Returns:
        Two dictionaries, one per gene, each naming the gene as target and
        the other gene as interacting target.
    Raises:
        ValueError: if `gene_pair` does not contain exactly two genes.
    """
    genes = gene_pair.split('~')

    # Explicit check instead of `assert`: assertions are stripped under -O,
    # after which a malformed pair would be silently mis-parsed.
    if len(genes) != 2:
        raise ValueError(f'Expected two genes separated by "~", got: {gene_pair}')

    first_gene, second_gene = genes
    first_role = gene_role.replace('Combinations', '').lower()
    second_role = 'anchor'

    return [
        {
            'targetFromSourceId': first_gene,
            'targetRole': first_role,
            'interactingTargetFromSourceId': second_gene,
            'interactingTargetRole': second_role,
        },
        {
            'targetFromSourceId': second_gene,
            'targetRole': second_role,
            'interactingTargetFromSourceId': first_gene,
            'interactingTargetRole': first_role,
        },
    ]


def get_lfc_data(lfc_file):
    """Read a log fold change file and reshape it to one row per id and cell line.

    Args:
        lfc_file: path of the space separated file with a header row. Columns
            after the first four are read as `<cell line>_<statistic>`.
    Returns:
        Spark dataframe with `id`, `cellLineName`, `Note1`, `Note2` and the
        three `phenotypicConsequence*` statistics cast to float.
    """
    # Fixed statistical field names:
    stats_fields = ['p-value', 'fdr', 'lfc']

    # Reading the data into a single dataframe:
    lfc_df = spark.read.csv(lfc_file, sep=' ', header=True)

    # Cell line names are the column names without their last '_' separated token:
    cell_lines = {column.rpartition('_')[0] for column in lfc_df.columns[4:]}

    # One struct column per cell line, e.g.
    # SIDM00049_CSID1053_p-value, SIDM00049_CSID1053_fdr, SIDM00049_CSID1053_lfc
    # become SIDM00049_CSID1053: struct(p-value, fdr, lfc)
    res_df = lfc_df
    for cell in cell_lines:
        cell_stats = [
            col(f'{cell}_{field}').cast(FloatType()).alias(field)
            for field in stats_fields
        ]
        res_df = res_df.withColumn(cell, struct(cell_stats))

    # Stack the struct columns into (cellLineName, cellLineData) rows:
    stack_arguments = ", ".join([f"'{x}', {x}" for x in cell_lines])
    unpivot_expression = f'''stack({len(cell_lines)}, {stack_arguments} ) as (cellLineName, cellLineData)'''

    unpivoted = res_df.select('id', 'Note1', 'Note2', expr(unpivot_expression))

    # Flatten the struct into the final statistic columns:
    return unpivoted.select(
        'id', 'cellLineName', 'Note1', 'Note2',
        col('cellLineData.lfc').alias('phenotypicConsequenceLogFoldChange'),
        col('cellLineData.p-value').alias('phenotypicConsequencePValue'),
        col('cellLineData.fdr').alias('phenotypicConsequenceFDR'),
    )

def get_bliss_data(blissFile):
    """Read a bliss statistics file and reshape it to one row per gene pair and cell line.

    Cell lines lacking any of the expected statistic columns are reported on
    stdout and left out.

    Args:
        blissFile: path of the tab separated file with a header row.
    Returns:
        Spark dataframe with `id`, `cellLineName`, `geneticInteractionScore`,
        `geneticInteractionPValue` and `geneticInteractionMethod` ('bliss').
    """
    # Fixed statistical field names:
    stats_fields = ['zscore', 'pval']
    gene_column = 'Gene_Pair'

    # Read data:
    bliss_df = spark.read.csv(blissFile, sep='\t', header=True)
    available_columns = bliss_df.columns

    # Cell lines are the first two '_' separated tokens of the 'SID...' columns
    # found after the first four columns of the bliss file header:
    cell_lines = {
        '_'.join(column.split('_')[0:2])
        for column in available_columns[4:]
        if column.startswith('SID')
    }

    # Checking for missing columns for cell lines:
    missing_columns = []
    for cell in cell_lines:
        for stat in stats_fields:
            expected_column = f'{cell}_{stat}'
            if expected_column not in available_columns:
                missing_columns.append(expected_column)
    cells_to_drop = {column.rpartition('_')[0] for column in missing_columns}

    if missing_columns:
        print(f'Missing columns: {", ".join(missing_columns)}')
        print(f'Dropping cell_lines: {", ".join(cells_to_drop)}')

        # Removing incomplete cell lines:
        cell_lines = [x for x in cell_lines if x not in cells_to_drop]

    # One struct column per cell line holding its statistics:
    res_df = bliss_df
    for cell in cell_lines:
        cell_stats = [col(f'{cell}_{field}').alias(field) for field in stats_fields]
        res_df = res_df.withColumn(cell, struct(cell_stats))

    # Stack the struct columns into (cellLineName, cellLineData) rows:
    stack_arguments = ", ".join([f"'{x}', {x}" for x in cell_lines])
    unpivot_expression = f'''stack({len(cell_lines)}, {stack_arguments} ) as (cellLineName, cellLineData)'''

    unpivoted = (
        res_df
        # Create a consistent id column ('~' separated gene pair):
        .withColumn('id', regexp_replace(col(gene_column), ';', '~'))
        .select('id', expr(unpivot_expression))
    )

    # Flatten the struct, cast the statistics and label the method:
    flattened = unpivoted.select(
        'id', regexp_replace('cellLineName', '_strong', '').alias('cellLineName'),
        col('cellLineData.zscore').cast(FloatType()).alias('geneticInteractionScore'),
        col('cellLineData.pval').cast(FloatType()).alias('geneticInteractionPValue'),
    )
    return flattened.withColumn('geneticInteractionMethod', lit('bliss'))

def get_gemini_data(gemini_file):
    """Read a Gemini statistics file and reshape it to one row per gene pair and cell line.

    Cell lines lacking any of the expected statistic columns are reported on
    stdout and left out.

    Args:
        gemini_file: path of the space separated file with a header row.
    Returns:
        Spark dataframe with `id`, `cellLineName`, `geneticInteractionScore`,
        `geneticInteractionPValue`, `geneticInteractionFDR` and
        `geneticInteractionMethod` ('gemini').
    Raises:
        ValueError: if neither a `Gene_Pair` nor a `Gene_Pair0` column is present.
    """
    # Fixed statistical field names:
    stats_fields = ['score', 'pval', 'FDR']

    # Reading the data into a single dataframe:
    gemini_df = spark.read.csv(gemini_file, sep=' ', header=True)
    available_columns = gemini_df.columns

    # Cell lines are the 'SID...' column names (after the first four columns)
    # without their last '_' separated token:
    cell_lines = {
        column.rpartition('_')[0]
        for column in available_columns[4:]
        if column.startswith('SID')
    }

    # The gene pair column is not named consistently across files:
    gene_column = next(
        (name for name in ('Gene_Pair', 'Gene_Pair0') if name in available_columns),
        None,
    )
    if gene_column is None:
        raise ValueError( f'No Gene_Pair column in Gemini data: {",".join(available_columns)}')

    # Some statistic columns can be absent from the data file; cell lines
    # affected by this are dropped entirely:
    missing_columns = []
    for cell in cell_lines:
        for stat in stats_fields:
            expected_column = f'{cell}_{stat}'
            if expected_column not in available_columns:
                missing_columns.append(expected_column)
    cells_to_drop = {column.rpartition('_')[0] for column in missing_columns}

    if missing_columns:
        print(f'Missing columns: {", ".join(missing_columns)}')
        print(f'Dropping cell_lines: {", ".join(cells_to_drop)}')

        # Removing incomplete cell lines:
        cell_lines = [x for x in cell_lines if x not in cells_to_drop]

    # One struct column per cell line holding its statistics:
    res_df = gemini_df
    for cell in cell_lines:
        cell_stats = [col(f'{cell}_{field}').alias(field) for field in stats_fields]
        res_df = res_df.withColumn(cell, struct(cell_stats))

    # Stack the struct columns into (cellLineName, cellLineData) rows:
    stack_arguments = ", ".join([f"'{x}', {x}" for x in cell_lines])
    unpivot_expression = f'''stack({len(cell_lines)}, {stack_arguments} ) as (cellLineName, cellLineData)'''

    unpivoted = (
        res_df
        # Create a consistent id column ('~' separated gene pair):
        .withColumn('id', regexp_replace(col(gene_column), ';', '~'))
        .select('id', expr(unpivot_expression))
    )

    # Flatten the struct, cast the statistics and label the method:
    flattened = unpivoted.select(
        'id', regexp_replace('cellLineName', '_strong', '').alias('cellLineName'),
        col('cellLineData.score').cast(FloatType()).alias('geneticInteractionScore'),
        col('cellLineData.pval').cast(FloatType()).alias('geneticInteractionPValue'),
        col('cellLineData.FDR').cast(FloatType()).alias('geneticInteractionFDR'),
    )
    return flattened.withColumn('geneticInteractionMethod', lit('gemini'))




In [222]:

# Extracting shared parameters:
logFoldChangeCutoffPVal = parameters['sharedMetadata']['logFoldChangeCutoffPVal']
logFoldChangeCutoffFDR = parameters['sharedMetadata']['logFoldChangeCutoffFDR']
# The interaction p-value cutoff is stored under `interactionCutoffPVal` in the
# parameter set; there is no `geminiCutoffPVal` key:
geminiCutoffPVal = parameters['sharedMetadata']['interactionCutoffPVal']

# experiment = COLO1 (first entry of the parameter set):
dataFolder = '/Users/dsuveges/project_data/encore/ENCORE_FILES_DEC_2021'
experiment = parameters['experiments'][0]
diseaseFromSourceMappedId = experiment['diseaseFromSourceMappedId']
diseaseFromSource = experiment['diseaseFromSource']

# Generate cell line table:
cellPassportFile = '/Users/dsuveges/project_data/encore/ENCORE_FILES_DEC_2021/model_list_20220124.csv'
cellpassportMap = generate_diseaseCellLines(cellPassportFile)

## Reading lfc data:
lfc_file = f'{dataFolder}/{experiment["logFoldChangeFile"]}'
lfc_df = get_lfc_data(lfc_file)

## Reading gemini data:
gemini_file = f'{dataFolder}/{experiment["geminiFile"]}'
gemini_df = get_gemini_data(gemini_file)

## Reading bliss data:
bliss_file = f'{dataFolder}/{experiment["blissFile"]}'
bliss_df = get_bliss_data(bliss_file)

## Merging lfc + gemini (bliss is read above but not merged here):
merged_dataset = (
    lfc_df

    # Data is joined by the gene-pair and cell line:
    .join(gemini_df, how='inner', on=['id', 'cellLineName'])

    # Applying filters on logFoldChange + gemini p-value:
    .filter(
        (col('phenotypicConsequencePValue') <= logFoldChangeCutoffPVal) &
        (col('phenotypicConsequenceFDR') <= logFoldChangeCutoffFDR) &
        (col('geneticInteractionPValue') <= geminiCutoffPVal)
    )

    # The cell line identifier is the first '_' separated token of the name:
    .withColumn('cellId', split(col('cellLineName'), '_').getItem(0))

    # Adding cell line and biomarker info:
    .join(cellpassportMap.select(col('id').alias('cellId'), 'diseaseCellLines', 'biomarkerList'), on='cellId')

)


merged_dataset.show(1, vertical=True, truncate=False)

Missing columns: SIDM00776_CSID1064_strong_FDR, SIDM00957_CSID1061_strong_pval
Dropping cell_lines: SIDM00776_CSID1064_strong, SIDM00957_CSID1061_strong
-RECORD 0----------------------------------------------------------------------------------
 cellId                             | SIDM00193                                            
 id                                 | KIF14~PRMT1                                          
 cellLineName                       | SIDM00193_CSID1035                                   
 Note1                              | LibraryCombinations                                  
 Note2                              | LibraryCombinations                                  
 phenotypicConsequenceLogFoldChange | -1.1626                                              
 phenotypicConsequencePValue        | 0.030703                                             
 phenotypicConsequenceFDR           | 0.227511                                             
 geneticInteraction

### Adding the decoration for the evidence

In [159]:
# Preview the cell line annotation table:
cellpassportMap.show()
    

+----------------+---------+--------------------+--------------------+
|            name|       id|       biomarkerList|    diseaseCellLines|
+----------------+---------+--------------------+--------------------+
|            RH18|SIDM00454|[{MSS, Microsatel...|{Soft Tissue, RH-...|
|              RD|SIDM00847|[{MSS, Microsatel...|{Soft Tissue, RD,...|
|         SCCH196|SIDM00031|                null|{Soft Tissue, SCC...|
|          TE125T|SIDM01763|                null|{Soft Tissue, TE-...|
|          MFHino|SIDM00299|[{MSS, Microsatel...|{Soft Tissue, MFH...|
|             RKN|SIDM00353|[{MSS, Microsatel...|{Soft Tissue, RKN...|
|            KYM1|SIDM00552|[{MSS, Microsatel...|{Soft Tissue, KYM...|
|           PEAZ1|SIDM00830|                null|{Soft Tissue, PEA...|
|          Hs633T|SIDM00667|[{MSS, Microsatel...|{Soft Tissue, Hs-...|
|              HX|SIDM01213|                null|{Soft Tissue, HX,...|
|            A204|SIDM00798|[{MSS, Microsatel...|{Soft Tissue, A20...|
|     

In [223]:
evidence_df = (
    merged_dataset

    # Cell line identifier: first '_' separated token of the cell line name.
    .withColumn('cellId', split(col('cellLineName'), '_')[0])

    # Each gene pair is parsed into two target records, then exploded so every
    # record gets its own row with the record fields as top level columns:
    .withColumn('id', parse_targets(col('id'), col('Note1')))
    .withColumn('genes', explode(col('id')))
    .select('*', 'genes.*')

    # Literals specific for this type of evidence:
    .withColumn('datatypeId', lit('ot_partner'))
    .withColumn('datasourceId', lit('encore'))
    .withColumn('projectId', lit('OTAR2062'))
    .withColumn('projectDescription', lit('Encore project'))  # TODO - fix this!
    .withColumn('geneInteractionType', lit('cooperative'))

    # Disease annotation of the experiment:
    .withColumn('diseaseFromSourceMappedId', lit(diseaseFromSourceMappedId))
    .withColumn('diseaseFromSource', lit(diseaseFromSource))

    # Dropping the helper columns, then removing duplicated rows:
    .drop('cellLineName', 'cellId', 'Note1', 'Note2', 'hit', 'id', 'genes')
    .distinct()

    .persist()
)

evidence_df.show(1, vertical=True, truncate=False)

-RECORD 0----------------------------------------------------------------------------------
 phenotypicConsequenceLogFoldChange | -1.1626                                              
 phenotypicConsequencePValue        | 0.030703                                             
 phenotypicConsequenceFDR           | 0.227511                                             
 geneticInteractionScore            | 1.0196909                                            
 geneticInteractionPValue           | 0.04757625                                           
 geneticInteractionFDR              | 0.42128566                                           
 geneticInteractionMethod           | gemini                                               
 diseaseCellLines                   | {Large Intestine, SNU-81, SIDM00193, UBERON_0000059} 
 biomarkerList                      | [{MSS, Microsatellite stable}]                       
 targetFromSourceId                 | PRMT1                                     

In [229]:
# Inspect the schema of the generated evidence:
evidence_df.printSchema()

root
 |-- phenotypicConsequenceLogFoldChange: float (nullable = true)
 |-- phenotypicConsequencePValue: float (nullable = true)
 |-- phenotypicConsequenceFDR: float (nullable = true)
 |-- geneticInteractionScore: float (nullable = true)
 |-- geneticInteractionPValue: float (nullable = true)
 |-- geneticInteractionFDR: float (nullable = true)
 |-- geneticInteractionMethod: string (nullable = false)
 |-- diseaseCellLines: struct (nullable = false)
 |    |-- tissue: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- tissueId: string (nullable = true)
 |-- biomarkerList: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- description: string (nullable = true)
 |-- targetFromSourceId: string (nullable = true)
 |-- targetRole: string (nullable = true)
 |-- interactingTargetFromSourceId: string (nullable = true)
 |-- interactingTargetRole: string (nu

In [162]:
# List the column names of the generated evidence:
evidence_df.columns

['phenotypicConsequenceLogFoldChange',
 'phenotypicConsequencePValue',
 'phenotypicConsequenceFDR',
 'geneticInteractionScore',
 'geneticInteractionPValue',
 'geneticInteractionFDR',
 'geneticInteractionMethod',
 'diseaseCellLines',
 'biomarkerList',
 'targetFromSourceId',
 'targetRole',
 'interactingTargetFromSourceId',
 'interactingTargetRole',
 'datatypeId',
 'datasourceId',
 'projectId',
 'projectDescription',
 'diseaseFromSourceMappedId',
 'diseaseFromSource']

In [225]:
import tempfile
import os


def write_evidence_strings(evidence, output_file):
    """Export the table to a single gzip compressed JSON file of evidence strings.

    The dataframe is coalesced to one partition and written into a temporary
    directory; the single resulting `.json.gz` part file is then moved to
    `output_file`.

    Args:
        evidence: Spark dataframe to export.
        output_file: destination path of the compressed JSON file.
    Raises:
        AssertionError: if the temporary directory does not contain exactly one
            `.json.gz` file after writing.
    """
    # Local import: shutil is only needed here.
    import shutil

    with tempfile.TemporaryDirectory() as tmp_dir_name:
        (
            evidence.coalesce(1).write.format('json').mode('overwrite')
            .option('compression', 'org.apache.hadoop.io.compress.GzipCodec').save(tmp_dir_name)
        )
        json_chunks = [name for name in os.listdir(tmp_dir_name) if name.endswith('.json.gz')]
        assert len(json_chunks) == 1, f'Expected one JSON file, but found {len(json_chunks)}.'

        # shutil.move falls back to copy + delete when the temporary directory
        # and the destination are on different filesystems, where os.rename fails:
        shutil.move(os.path.join(tmp_dir_name, json_chunks[0]), output_file)

        
# Export the evidence to a gzipped JSON file in the working directory:
write_evidence_strings(evidence_df, 'encore-2022-02-10.json.gz')


In [227]:
%%bash


# Decompress the export, shuffle the lines, pretty-print the first one with jq:
gzcat encore-2022-02-10.json.gz | sort -R | head -n1 | jq

{
  "phenotypicConsequenceLogFoldChange": -1.4731,
  "phenotypicConsequencePValue": 0.026617,
  "phenotypicConsequenceFDR": 0.211739,
  "geneticInteractionScore": 1.2418852,
  "geneticInteractionPValue": 0.04065454,
  "geneticInteractionFDR": 0.29073796,
  "geneticInteractionMethod": "gemini",
  "diseaseCellLines": {
    "tissue": "Large Intestine",
    "name": "SW1463",
    "id": "SIDM00834",
    "tissueId": "UBERON_0000059"
  },
  "biomarkerList": [
    {
      "name": "MSS",
      "description": "Microsatellite stable"
    }
  ],
  "targetFromSourceId": "PRMT1",
  "targetRole": "anchor",
  "interactingTargetFromSourceId": "MAPK7",
  "interactingTargetRole": "library",
  "datatypeId": "ot_partner",
  "datasourceId": "encore",
  "projectId": "OTAR2062",
  "projectDescription": "Encore project",
  "geneInteractionType": "cooperative",
  "diseaseFromSourceMappedId": "EFO_1001951",
  "diseaseFromSource": "colorectal carcinoma"
}


In [176]:
# Summary statistics of the log fold change values:
lfc_df.select('phenotypicConsequenceLogFoldChange').describe().show()

+-------+----------------------------------+
|summary|phenotypicConsequenceLogFoldChange|
+-------+----------------------------------+
|  count|                            671022|
|   mean|               -0.3865246497843473|
| stddev|                0.6880722851652828|
|    min|                           -10.047|
|    max|                            3.9859|
+-------+----------------------------------+



In [198]:
# Bliss file of the first experiment in the parameter set:
blissFile = '/'.join([dataFolder, parameters['experiments'][0]['blissFile']])

bliss_df = get_bliss_data(blissFile)

# Preview of the rows with a p-value of at most 0.05:
bliss_df.where(col('geneticInteractionPValue') <= 0.05).show()

+------------+------------------+-----------------------+------------------------+------------------------+
|          id|      cellLineName|geneticInteractionScore|geneticInteractionPValue|geneticInteractionMethod|
+------------+------------------+-----------------------+------------------------+------------------------+
|  ABCB1~AKT1|SIDM00193_CPID1065|              -2.078329|              0.03767907|                   bliss|
|  ABCB1~AKT1|SIDM00194_CPID1205|             -2.4500997|             0.014281668|                   bliss|
|  ABCB1~AKT1|SIDM00776_CPID1286|               -2.17611|              0.02954704|                   bliss|
|   ABCB1~APC|SIDM00359_CPID1199|             -2.2621548|             0.023687847|                   bliss|
|   ABCB1~APC|SIDM00778_CPID1280|              -2.589407|             0.009614142|                   bliss|
|ABCB1~ARID1A|SIDM00833_CPID1259|             -2.4585474|             0.013950037|                   bliss|
|ABCB1~ARID1A|SIDM00778_CPID

In [219]:
# Experiments (second token of the cell line name) per cell line (first token)
# in the lfc data:
lfc_experiments = (
    lfc_df.select('cellLineName').distinct()
    .withColumn('cellLine', element_at(split(col('cellLineName'), '_'), 1))
    .withColumn('experiment', element_at(split(col('cellLineName'), '_'), 2))
    .groupBy('cellLine')
    .agg(collect_set(col('experiment')).alias('experiments_lfc'))
)

# The same summary for the gemini data:
gemini_experiments = (
    gemini_df.select('cellLineName').distinct()
    .withColumn('cellLine', element_at(split(col('cellLineName'), '_'), 1))
    .withColumn('experiment', element_at(split(col('cellLineName'), '_'), 2))
    .groupBy('cellLine')
    .agg(collect_set(col('experiment')).alias('experiments_gemini'))
)

# Outer join, so cell lines present in only one of the datasets are shown as well:
lfc_experiments.join(gemini_experiments, on='cellLine', how='outer').show(truncate=False)

+---------+------------------------------------+------------------+
|cellLine |experiments_lfc                     |experiments_gemini|
+---------+------------------------------------+------------------+
|SIDM00832|[CSID1047]                          |null              |
|SIDM00193|[CSID1035]                          |[CSID1035]        |
|SIDM00835|[CSID1103]                          |[CSID1103]        |
|SIDM00681|[CSID1066, CSID1042]                |[CSID1066]        |
|SIDM00957|[CSID1061]                          |null              |
|SIDM00783|[CSID1027]                          |[CSID1027]        |
|SIDM00359|[CSID1057]                          |[CSID1057]        |
|SIDM00537|[CSID1025]                          |[CSID1025]        |
|SIDM00782|[CSID1039, CSID1062]                |null              |
|SIDM00680|[CSID1032]                          |[CSID1032]        |
|SIDM00136|[CSID1058, CSID1029, CSID1106, GIv1]|[CSID1106]        |
|SIDM00841|[CSID1037]                          |

In [220]:
# Experiments (second token of the cell line name) per cell line (first token)
# in the lfc data:
lfc_experiments = (
    lfc_df.select('cellLineName').distinct()
    .withColumn('cellLine', element_at(split(col('cellLineName'), '_'), 1))
    .withColumn('experiment', element_at(split(col('cellLineName'), '_'), 2))
    .groupBy('cellLine')
    .agg(collect_set(col('experiment')).alias('experiments_lfc'))
)

# The same summary for the bliss data:
bliss_experiments = (
    bliss_df.select('cellLineName').distinct()
    .withColumn('cellLine', element_at(split(col('cellLineName'), '_'), 1))
    .withColumn('experiment', element_at(split(col('cellLineName'), '_'), 2))
    .groupBy('cellLine')
    .agg(collect_set(col('experiment')).alias('experiments_bliss'))
)

# Outer join, so cell lines present in only one of the datasets are shown as well:
lfc_experiments.join(bliss_experiments, on='cellLine', how='outer').show(truncate=False)

+---------+------------------------------------+------------------------------+
|cellLine |experiments_lfc                     |experiments_bliss             |
+---------+------------------------------------+------------------------------+
|SIDM00832|[CSID1047]                          |null                          |
|SIDM00193|[CSID1035]                          |[CPID1068, CPID1071, CPID1065]|
|SIDM00835|[CSID1103]                          |[CPID1334, CPID1331, CPID1328]|
|SIDM00681|[CSID1066, CSID1042]                |[CPID1304, CPID1301, CPID1307]|
|SIDM00957|[CSID1061]                          |[CPID1223, CPID1226, CPID1220]|
|SIDM00783|[CSID1027]                          |[CPID1083, CPID1086, CPID1089]|
|SIDM00359|[CSID1057]                          |[CPID1193, CPID1199, CPID1196]|
|SIDM00537|[CSID1025]                          |[CPID1062, CPID1056, CPID1059]|
|SIDM00782|[CSID1039, CSID1062]                |null                          |
|SIDM00680|[CSID1032]                   

In [230]:
# Inspect the schema of the cell line annotation table:
cellpassportMap.printSchema()

root
 |-- name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- biomarkerList: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- description: string (nullable = true)
 |-- diseaseCellLines: struct (nullable = false)
 |    |-- tissue: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- tissueId: string (nullable = true)



In [235]:
# Scratch experiment: this only builds a lazy filter object. The predicate is
# truthy for every element of this list (None is mapped to 'cica'), so
# consuming the iterator would drop nothing.
filter(lambda x: x if x is not None else 'cica', [1, 3, None, 'cica'])

<filter at 0x161c31880>

In [8]:
# Validation lab evidence lacking validation hypotheses:
validation_lab_evidence = spark.read.json(
    '/Users/dsuveges/project_data/validation_lab/2022.02/validationLab_2022-02-18.json.gz'
)
missing_hypotheses = validation_lab_evidence.where(col('validationHypotheses').isNull())
missing_hypotheses.select(
    'targetFromSourceId', 'diseaseCellLines', 'expectedConfidence', 'validationHypotheses'
).show()

+------------------+--------------------+------------------+--------------------+
|targetFromSourceId|    diseaseCellLines|expectedConfidence|validationHypotheses|
+------------------+--------------------+------------------+--------------------+
|             ERBB2|[{SIDM01168, SW62...|   not significant|                null|
|             ERBB2|[{SIDM00136, HT29...|   not significant|                null|
|             ERBB2|[{SIDM00833, SW83...|   not significant|                null|
|             ERBB2|[{SIDM00527, MDST...|   not significant|                null|
|             ERBB2|[{SIDM00783, HCT1...|   not significant|                null|
|             ERBB2|[{SIDM00150, KM12...|   not significant|                null|
|             ERBB2|[{SIDM01090, RKO,...|   not significant|                null|
|             ERBB2|[{SIDM00680, LS18...|   not significant|                null|
+------------------+--------------------+------------------+--------------------+



In [10]:
# Molecule names with their identifiers, shown as MOLECULE_CHEMBL_ID:
molecules = spark.read.parquet('/Users/dsuveges/project_data/molecule/')
molecules.select('name', col('id').alias('MOLECULE_CHEMBL_ID')).show(truncate=False)

+---------------------------------+------------------+
|name                             |MOLECULE_CHEMBL_ID|
+---------------------------------+------------------+
|ERLOTINIB HYDROCHLORIDE          |CHEMBL1079742     |
|AMIODARONE HYDROCHLORIDE         |CHEMBL1083993     |
|MIGALASTAT                       |CHEMBL110458      |
|(S)-Fluoxetine                   |CHEMBL1169388     |
|RUCAPARIB                        |CHEMBL1173055     |
|MERETHOXYLLINE PROCAINE          |CHEMBL1200443     |
|SULFISOXAZOLE ACETYL             |CHEMBL1200910     |
|BETAZOLE HYDROCHLORIDE           |CHEMBL1200949     |
|DEXPANTHENOL                     |CHEMBL1200979     |
|CISATRACURIUM                    |CHEMBL1201248     |
|ESTROGENS, ESTERIFIED            |CHEMBL1201468     |
|HYALURONIDASE (HUMAN RECOMBINANT)|CHEMBL1201718     |
|PRASUGREL                        |CHEMBL1201772     |
|CHEMBL1231606                    |CHEMBL1231606     |
|CHEMBL1232381                    |CHEMBL1232381     |
|PF-046915

In [11]:
# ChEMBL identifiers excluded from the molecule to structure mapping:
dropped_molecules = [
    'CHEMBL692', 'CHEMBL1236970', 'CHEMBL457299', 'CHEMBL113178'
]

# NOTE: `unichem_molecule_struct_spark_df` has to be defined by an earlier cell.
# A single `isin` based filter replaces the chained `!=` comparisons, which
# referred to the unimported alias `f`; the Column method is `isin`, not `isIn`.
unichem_molecule_struct_spark_df = (
    unichem_molecule_struct_spark_df
    .filter(~col('MOLECULE_CHEMBL_ID').isin(dropped_molecules))
    .select('MOLECULE_CHEMBL_ID', 'MOLECULE_PDB_ID', 'STRUCTURE_ID')
)

NameError: name 'unichem_molecule_struct_spark_df' is not defined

In [17]:
# UniChem mapping file, read as tab separated with a header row:
unichem = '/Users/dsuveges/Downloads/src1src3.txt'

unichem_mapping = spark.read.csv(unichem, sep=r'\t', header=True)

# Give the two source columns descriptive names:
unichem_mapping = (
    unichem_mapping
    .withColumnRenamed("From src:'1'", 'CHEMBL_MOLECULE_ID')
    .withColumnRenamed("To src:'3'", 'PDB_COMPOUND_ID')
)

# Preview without the excluded molecule:
unichem_mapping.where(~col('CHEMBL_MOLECULE_ID').isin(['CHEMBL313405'])).show()

+------------------+---------------+
|CHEMBL_MOLECULE_ID|PDB_COMPOUND_ID|
+------------------+---------------+
|     CHEMBL1399676|            MQN|
|     CHEMBL2347208|            1HL|
|      CHEMBL334167|            SG2|
|     CHEMBL3342582|            8CF|
|      CHEMBL328910|            DOB|
|        CHEMBL1364|            UEG|
|     CHEMBL2219861|            UBU|
|     CHEMBL1087397|            831|
|     CHEMBL4521790|            OUP|
|         CHEMBL614|            PZA|
|     CHEMBL3409982|            BNJ|
|      CHEMBL295698|            KLN|
|     CHEMBL1237000|            ZYN|
|       CHEMBL77785|            NHP|
|     CHEMBL1783734|            2N8|
|      CHEMBL146186|            U8D|
|     CHEMBL4085850|            80L|
|        CHEMBL1427|            HPA|
|     CHEMBL1289601|            LEV|
|     CHEMBL4560579|            P6J|
+------------------+---------------+
only showing top 20 rows



In [None]:
def get_structure(pdb_compound_id: str) -> list:
    """Fetching structure identifiers from PDBkb REST API

    Args:
        pdb_compound_id: string, a single compound identifier
    Returns:
        List of PDB structure identifiers where the compound can be found
    """