In [100]:
import logging
from functools import reduce
from itertools import chain

import pandas as pd
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import (
    col, collect_set, create_map, element_at, expr, lit, regexp_replace, split, struct, udf, when
)
from pyspark.sql.types import ArrayType, BooleanType, FloatType, StringType, StructField, StructType

# Spark settings applied to the session created below.
sparkConf = SparkConf().setAll([
    ('spark.driver.memory', '15g'),
    ('spark.executor.memory', '15g'),
    ('spark.driver.maxResultSize', '0'),
])

# Get the active session, or create one running locally on all available cores.
spark = SparkSession.builder.config(conf=sparkConf).master('local[*]').getOrCreate()

# Lookup table keyed by hypothesis column name.
#   - Entries with a 'direct_mapping' translate a raw value straight into a
#     biomarker name/description pair.
#   - Entries without it only hold a 'description'/'name' prefix to which the
#     parsed value is appended by the consumer of this table.
BIOMARKERMAPS = {
    'PAN': {
        'direct_mapping': {
            'CO': {'name': 'PAN-CO', 'description': 'Pan-colorectal carcinoma'},
        },
    },
    'MS_status': {
        'direct_mapping': {
            'MSI': {'name': 'MSI', 'description': 'Microsatellite instable'},
            'MSS': {'name': 'MSS', 'description': 'Microsatellite stable'},
        },
    },
    'CRIS_subtype': {
        'direct_mapping': {
            'A': {
                'name': 'CRIS-A',
                'description': 'mucinous, glycolytic, enriched for microsatellite instability or KRAS mutations.',
            },
            'B': {
                'name': 'CRIS-B',
                'description': 'TGF-β pathway activity, epithelial-mesenchymal transition, poor prognosis.',
            },
            'C': {
                'name': 'CRIS-C',
                'description': 'elevated EGFR signalling, sensitivity to EGFR inhibitors.',
            },
            'D': {
                'name': 'CRIS-D',
                'description': 'WNT activation, IGF2 gene overexpression and amplification.',
            },
            'E': {
                'name': 'CRIS-E',
                'description': 'Paneth cell-like phenotype, TP53 mutations.',
            },
            '?': {
                'name': 'CRIS-?',
                'description': 'CRIS subtype not determined.',
            },
        },
    },
    # Mutation-status columns share one shape: a description prefix and a name prefix.
    **{
        f'{gene}_status': {
            'description': f'{gene} mutation status: ',
            'name': f'{gene}-',
        }
        for gene in ('KRAS', 'TP53', 'APC')
    },
}

@udf(StructType([
    StructField("name", StringType(), False),
    StructField("description", StringType(), False)
]))
def get_biomarker(columnName: str, biomarker: str) -> dict:
    '''Translate a hypothesis column and its value into a biomarker struct.

    Args:
        columnName: key of BIOMARKERMAPS, e.g. 'MS_status' or 'KRAS_status'.
        biomarker: value reported for that column, e.g. 'MSI', 'wt' or 'mut'.

    Returns:
        A dict with 'name' and 'description' keys, or None (a null struct in
        Spark) when the column or the value has no mapping. Unmapped inputs are
        logged as warnings instead of raising, so one unexpected label does not
        abort the whole Spark job.
    '''
    column_map = BIOMARKERMAPS.get(columnName)
    if column_map is None:
        logging.warning(f'Could not find biomarker mapping for column {columnName}')
        return None

    # If the biomarker has a direct mapping:
    if 'direct_mapping' in column_map:
        mapped = column_map['direct_mapping'].get(biomarker)
        if mapped is None:
            logging.warning(f'Could not find direct mapping for {columnName}:{biomarker}')
        return mapped

    # If the value needs to be parsed: only the two mutation statuses are supported.
    status_labels = {'wt': 'wild type', 'mut': 'mutant'}
    if biomarker not in status_labels:
        logging.warning(f'Could not find direct mapping for {columnName}:{biomarker}')
        return None

    return {
        'name': column_map['name'] + biomarker,
        'description': column_map['description'] + status_labels[biomarker]
    }



In [97]:
# Locations of the expected and observed hypothesis tables.
(expected_hypotheis_file, observed_hypotheis_file) = (
    '/Users/dsuveges/repositories/evidence_datasource_parsers/partner_preview_scripts/VL_Hypothesis_COgenes_expected.txt',
    '/Users/dsuveges/repositories/evidence_datasource_parsers/partner_preview_scripts/VL_Hypothesis_COgenes_observed.txt',
)

# Placeholders: both names are rebound to DataFrames once the files are read.
expected_df = observed_df = ''
def read_hypothesis_data(file: str, call: str) -> DataFrame:
    """Read a wide, tab-separated hypothesis table and unpivot it to long format.

    The file is read with a header row; its 'Gene' column is renamed to 'gene'
    and every other column is treated as one hypothesis.

    Args:
        file: path of the tab-separated file.
        call: name given to the boolean column holding the cell values
            (e.g. 'expected' or 'observed').

    Returns:
        A persisted DataFrame with the columns 'gene', 'hypothesis' (the
        original column name) and `call` (the cell value cast to boolean).

    Raises:
        ValueError: if the file has no hypothesis columns to unpivot.
    """
    hypothesis_df = (
        spark.read.csv(file, sep='\t', header=True)
        .withColumnRenamed('Gene', 'gene')
    )

    # Pick the hypothesis columns by name instead of by position, so the result
    # does not depend on where the gene column sits in the file.
    hypothesis_columns = [column for column in hypothesis_df.columns if column != 'gene']
    if not hypothesis_columns:
        raise ValueError(f'No hypothesis columns found in {file}')

    # stack(n, 'label1', `col1`, 'label2', `col2`, ...) emits one row per column;
    # backticks are needed because the column names contain characters like '-' and '?'.
    stack_arguments = ', '.join(f"'{column}', `{column}`" for column in hypothesis_columns)
    unpivot_expression = (
        f'stack({len(hypothesis_columns)}, {stack_arguments}) as (hypothesis, {call})'
    )

    return (
        hypothesis_df
        # Select the renamed column so the output header is consistently 'gene'.
        .select('gene', expr(unpivot_expression))
        .withColumn(call, col(call).cast(BooleanType()))
        .persist()
    )

# Load both hypothesis tables in long format, one boolean call column each.
expected_df = read_hypothesis_data(file=expected_hypotheis_file, call='expected')
observed_df = read_hypothesis_data(file=observed_hypotheis_file, call='observed')

# Preview the first 20 rows of the observed calls.
observed_df.show(n=20)


+-------+---------------+--------+
|   Gene|     hypothesis|observed|
+-------+---------------+--------+
|ARHGEF7| APC_status-mut|   false|
|ARHGEF7|  APC_status-wt|   false|
|ARHGEF7| CRIS_subtype-A|   false|
|ARHGEF7| CRIS_subtype-B|   false|
|ARHGEF7| CRIS_subtype-D|   false|
|ARHGEF7| CRIS_subtype-?|    true|
|ARHGEF7|KRAS_status-mut|   false|
|ARHGEF7| KRAS_status-wt|   false|
|ARHGEF7|  MS_status-MSI|   false|
|ARHGEF7|  MS_status-MSS|   false|
|ARHGEF7|TP53_status-mut|   false|
|ARHGEF7| TP53_status-wt|   false|
|ARHGEF7|         PAN-CO|   false|
|  BRCA2| APC_status-mut|   false|
|  BRCA2|  APC_status-wt|   false|
|  BRCA2| CRIS_subtype-A|    true|
|  BRCA2| CRIS_subtype-B|   false|
|  BRCA2| CRIS_subtype-D|   false|
|  BRCA2| CRIS_subtype-?|   false|
|  BRCA2|KRAS_status-mut|   false|
+-------+---------------+--------+
only showing top 20 rows



In [110]:
(
    expected_df
    # Line up the expected and observed calls of every gene/hypothesis pair:
    .join(observed_df, on=['gene', 'hypothesis'], how='inner')

    # Keep the pairs flagged in at least one of the two tables:
    .filter(col('expected') | col('observed'))

    # Split the hypothesis label on '-' into its type (1st part) and its call (2nd part):
    .select(
        'gene',
        'expected',
        'observed',
        element_at(split(col('hypothesis'), '-'), 1).alias('hypothesis_type'),
        element_at(split(col('hypothesis'), '-'), 2).alias('hypothesis_call'),
    )

    # Resolve the biomarker struct and describe which table(s) made the call:
    .select(
        'gene',
        get_biomarker(col('hypothesis_type'), col('hypothesis_call')).alias('hypothesis'),
        when(col('expected') & col('observed'), 'observed and expected')
        .when(col('expected'), 'expected only')
        .when(col('observed'), 'observed only')
        .otherwise('not expected and not observed')
        .alias('status'),
    )

    # Fold the status into the biomarker struct, then gather the distinct structs per gene:
    .select('gene', struct('hypothesis.*', 'status').alias('hypothesis'))
    .groupBy('gene')
    .agg(collect_set('hypothesis').alias('hypotheses'))
    .printSchema()
)

root
 |-- gene: string (nullable = true)
 |-- hypotheses: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- name: string (nullable = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- status: string (nullable = false)



In [50]:
# Lookup tables used by parse_hypothesis; built once instead of on every processed row.
_MS_STATUSES = {
    'MSS': {'name': 'MSS', 'description': 'Microsatellite stable'},
    'MSI': {'name': 'MSI', 'description': 'Microsatellite instable'},
}
_CRIS_SUBTYPES = {
    "A": {
        "name": "CRIS-A",
        "description": "mucinous, glycolytic, enriched for microsatellite instability or KRAS mutations"
    },
    "B": {
        "name": "CRIS-B",
        "description": "TGF-β pathway activity, epithelial-mesenchymal transition, poor prognosis"
    },
    "C": {
        "name": "CRIS-C",
        "description": "elevated EGFR signalling, sensitivity to EGFR inhibitors"
    },
    "D": {
        "name": "CRIS-D",
        "description": "WNT activation, IGF2 gene overexpression and amplification"
    },
    "E": {
        "name": "CRIS-E",
        "description": "Paneth cell-like phenotype, TP53 mutations."
    },
    "?": {
        # Was mislabelled "CRIS-E", which made the undetermined subtype
        # indistinguishable from subtype E.
        "name": "CRIS-?",
        "description": "CRIS subtype not known"
    }
}
_MUTATION_STATUSES = {'wt': 'wild-type', 'mut': 'mutant'}


@udf(StructType([
    StructField('name', StringType(), True),
    StructField('description', StringType(), True)
]))
def parse_hypothesis(hypothesis: str) -> dict:
    """Parse a hypothesis label into a biomarker name and description.

    Recognised labels:
      - 'MSS' / 'MSI': microsatellite status.
      - 'CRIS..._<subtype>': CRIS subtype, <subtype> being one of A-E or '?'.
      - '<gene>_wt' / '<gene>_mut': mutation status of a gene.
      - 'PAN_<source>': pan-disease hypothesis; only 'CO' is a known source.

    Args:
        hypothesis: the raw label; may be None or empty.

    Returns:
        A dict with 'name' and 'description' keys, or None (a null struct in
        Spark) when the label cannot be parsed. Malformed labels never raise.
    """
    if not hypothesis:
        return None

    # Exact match only, so genes whose symbol starts with 'MS' (e.g. 'MSH2_mut')
    # are not swallowed by the microsatellite branch.
    if hypothesis in _MS_STATUSES:
        return _MS_STATUSES[hypothesis]

    tokens = hypothesis.split('_')
    qualifier = tokens[1] if len(tokens) > 1 else None

    # Only treat the label as a CRIS subtype when the qualifier is a known subtype;
    # otherwise fall through (e.g. a gene such as 'CRISP1_wt').
    if hypothesis.startswith('CRIS') and qualifier in _CRIS_SUBTYPES:
        return _CRIS_SUBTYPES[qualifier]

    # Gene mutation status: exactly '<gene>_<wt|mut>'.
    if len(tokens) == 2 and qualifier in _MUTATION_STATUSES:
        gene = tokens[0]
        return {
            'description': f'{gene} mutation status: {_MUTATION_STATUSES[qualifier]}',
            'name': f'{gene}-{qualifier}',
        }

    if hypothesis.startswith('PAN'):
        disease = 'colorectal carcinoma' if qualifier == 'CO' else 'undefined disease'
        return {
            'name': hypothesis,
            'description': f'Pan {disease}'
        }

    return None
    


In [112]:
# Toy example: filter rows on a field of a nested column using dot notation.
(
    spark.createDataFrame([
        {
            "cica": cica,
            "pocok-1": pocok_1,
            "pocok2": pocok2,
            "nested": {"a": nested_a, "b": 'FAD'},
        }
        for cica, pocok_1, pocok2, nested_a in [
            (12, 32, 42, 'DOB'),
            (112, 232, 422, 'DOB'),
            (162, 332, 482, 'NAD'),
        ]
    ])
    .filter(col('nested.a') == 'DOB')
    .show()
)

+----+--------------------+-------+------+
|cica|              nested|pocok-1|pocok2|
+----+--------------------+-------+------+
|  12|{a -> DOB, b -> FAD}|     32|    42|
| 112|{a -> DOB, b -> FAD}|    232|   422|
+----+--------------------+-------+------+



In [113]:
# Print the schema Spark infers for the gzipped JSON evidence file.
(
    spark.read
    .json(path='/Users/dsuveges/repositories/evidence_datasource_parsers/ot_crispr_2022-02-14.json.gz')
    .printSchema()
)

root
 |-- contrast: string (nullable = true)
 |-- crisprScreenLibrary: string (nullable = true)
 |-- datasourceId: string (nullable = true)
 |-- datatypeId: string (nullable = true)
 |-- diseaseCellLines: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |-- diseaseFromSourceMappedId: string (nullable = true)
 |-- geneticBackground: string (nullable = true)
 |-- log2FoldChangeValue: double (nullable = true)
 |-- projectDescription: string (nullable = true)
 |-- projectId: string (nullable = true)
 |-- resourceScore: double (nullable = true)
 |-- statisticalTestTail: string (nullable = true)
 |-- studyId: string (nullable = true)
 |-- studyOverview: string (nullable = true)
 |-- targetFromSourceId: string (nullable = true)

