In [1]:
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd

global spark
spark = (
    pyspark.sql
     .SparkSession
     .builder
     .config("spark.driver.memory", "10g")
     .getOrCreate()
    )
print('Spark version: ', spark.version)

# Reading data:
evidence_parquet = '/Users/dsuveges/project_data/ot/evidence_update/iter13_b/'
df = spark.read.load(evidence_parquet)
df.show()

Spark version:  3.0.0
+---------------------------------+---------+-------------------+-------------------------------------+----------------------------+--------------+--------------+----------------+------+------------------------------+---------------+----------------+-------------+---------------+---------+----------+---------+-------------------+----------+-------------------------------------+----------------------+--------------+--------------+---------------+----------------------------+-------------+--------------------+--------+------------------+------------+------------------------+----------+---------------------+----------+--------------------------------+--------------------+----------------+-------------------+-----------------------+--------+-------------+-----------------------+-------------+-----------+-----------------+-----------+----------+------------+---------------+-------+---------------+--------------------+--------------------+---------+
|biologicalModelAlle

## Finding out if Uniprot has phenotypes

In [10]:
# Filter for uniprot evidence where the cohortPhenotypes is not null:
uniprot = (
    df
    .filter((col('dataSourceID') == 'uniprot_literature') & (col('cohortPhenotypes').isNotNull()))
)
uniprot.count()


0

In [29]:
import pandas as pd


her3 = pd.read_json('/Users/dsuveges/project_data/ot/evidence_input/20.11/uniprot/her3_evidence.json', lines=True)

pocok = (
    her3.assign(
        sourceID = her3.sourceID,
        target = her3.unique_association_fields.apply(lambda x: x['target'].split('/')[-1]),
        disease = her3.unique_association_fields.apply(lambda x: x['disease_uri'].split('/')[-1]),
        disease_acronym = her3.unique_association_fields.apply(lambda x: x['disease_acronym']),
        target_activity = her3.target.apply(lambda x: x['activity'].split('/')[-1]),
        disease_name = her3.disease.apply(lambda x: x['name']),
        variant = her3.variant.apply(lambda x: x['id'].split('/')[-1] if 'id' in x else None)
    )
    .drop(['literature','evidence','variant','unique_association_fields', 'access_level', 'validated_against_schema_version'], axis=1)
)

pocok

TypeError: argument of type 'float' is not iterable

In [12]:
notnull = []

for column in df.columns:
    sources = (
        df
        .filter(f'{column} is not null')
        .select('sourceId')
        .distinct()
        .toPandas()
    )
    
    notnull.append({
        'column': column,
        'sources': ','.join(sources.sourceId.tolist())
    })
    
sources_df = pd.DataFrame(notnull)
sources_df

Unnamed: 0,column,sources
0,biologicalModelAllelicComposition,phenodigm
1,variantId,"phewas_catalog,ot_genetics_portal"
2,diseaseFromSourceId,"europepmc,crispr,genomics_england,chembl,slape..."
3,diseaseModelAssociatedModelPhenotypes,phenodigm
4,variantAminoacidDescriptions,reactome
5,clinicalStatus,chembl
6,mutatedSamples,"intogen,cancer_gene_census"
7,cohortPhenotypes,genomics_england
8,drugId,chembl
9,variantFunctionalConsequenceId,"phewas_catalog,eva,ot_genetics_portal"


In [190]:
sources_df.sort_values('column', inplace=True)
sources_df.to_csv('/Users/dsuveges/project/random_notebooks/issue-1249_prototyping_new_json_schema/fields_sources.tsv', index=False, sep='\t')


In [179]:
mutated_samples = (
    df
    .filter('mutatedSamples is not null')
    .select(explode("mutatedSamples").alias("exploded"))
    .select("exploded.*")
    .toPandas()
)
mutated_samples.head()


Unnamed: 0,functionalConsequenceId,numberMutatedSamples,numberSamplesTested,numberSamplesWithMutationType
0,SO_0001605,1240,9180,1.0
1,SO_0001059,1240,9180,44.0
2,SO_0001589,1240,9180,1.0
3,SO_0001583,1240,9180,1194.0
4,SO_0001059,6,626,6.0


In [177]:
mutated_samples

Unnamed: 0,mutatedSamples
0,"[(SO_0001605, 1240, 9180, 1), (SO_0001059, 124..."
1,"[(SO_0001059, 6, 626, 6), (SO_0001583, 6, 626,..."
2,"[(SO_0001587, 22, 802, 1), (SO_0001059, 22, 80..."
3,"[(SO_0001589, 3, 455, 1), (SO_0001583, 3, 455,..."
4,"[(SO_0001059, 14, 370, 1), (SO_0001583, 14, 37..."
...,...
65003,"[(None, 3, 121, None)]"
65004,"[(None, 13, 58, None)]"
65005,"[(None, 2, 148, None)]"
65006,"[(None, 3, 15, None)]"


In [188]:
df.select(sorted(df.columns)).printSchema()

root
 |-- allelicRequirements: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- biologicalModelAllelicComposition: string (nullable = true)
 |-- biologicalModelGeneticBackground: string (nullable = true)
 |-- clinicalPhase: long (nullable = true)
 |-- clinicalSignificances: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- clinicalStatus: string (nullable = true)
 |-- clinicalUrls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- niceName: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- cohortDescription: string (nullable = true)
 |-- cohortId: string (nullable = true)
 |-- cohortPhenotypes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cohortShortName: string (nullable = true)
 |-- confidence: string (nullable = true)
 |-- confidenceIntervalLower: double (nullable = true)
 |-- confidenceIntervalUpper: double (nullable = true)
 |-- contr

In [196]:
(
    df
    .filter('variantAminoacidDescriptions is not null')
    .select('variantAminoacidDescriptions','sourceId')
    .show()
)

+----------------------------+--------+
|variantAminoacidDescriptions|sourceId|
+----------------------------+--------+
|        [L-leucine 389 re...|reactome|
|        [L-glutamic acid ...|reactome|
|                          []|reactome|
|        [L-isoleucine 161...|reactome|
|                          []|reactome|
|        [L-threonine 680 ...|reactome|
|        [L-tyrosine 207 r...|reactome|
|        [Insertion of res...|reactome|
|                          []|reactome|
|                          []|reactome|
|                          []|reactome|
|                          []|reactome|
|        [glycine 12 repla...|reactome|
|                          []|reactome|
|                          []|reactome|
|        [L-valine 1576 re...|reactome|
|                          []|reactome|
|        [L-asparagine 822...|reactome|
|        [L-proline 252 re...|reactome|
|        [L-glutamine 61 r...|reactome|
+----------------------------+--------+
only showing top 20 rows



In [29]:
x = (df
    .filter(df.contrast.isNotNull())
    .select(col('contrast'))
    .limit(19).toPandas())


x.contrast.tolist()

["'Crohn's disease, undetermined' vs 'non inflammatory bowel disease control'",
 "'Myotonic dystrophy' vs 'normal'",
 "'rheumatoid arthritis' vs 'normal'",
 "'primary pancreatic ductal adenocarcinoma' vs 'normal' in 'pancreas'",
 "'colon cancer' vs 'normal' in 'colon; Fresh-frozen tissue'",
 "'Ataxia-telangiectasia' vs 'normal'",
 "'tumour tissue' vs 'adjacent normal tissue'",
 "'glioblastoma' vs 'normal'",
 "'esophageal adenocarcinoma' vs 'normal'",
 "'meningeal tuberculosis' vs 'normal'",
 "'bacteriemia; Staphylococcus aureus' vs 'normal; none'",
 "'esophageal adenocarcinoma' vs 'normal'",
 "'non-enhancing margin; glioma' vs 'normal'",
 "'monocyte; meningococcal sepsis' at '0 hour' vs 'monocyte; normal' at '0 hour'",
 "'sepsis' vs 'normal' in 'whole blood'",
 "'tumor tissue' vs 'non-malignant tissue'",
 "'Alzheimers disease' vs 'normal' in 'entorhinal cortex'",
 "'Crohn's disease, macroscopic ileal inflammation with deep ulcer' vs 'non inflammatory bowel disease control'",
 "'colon c

In [5]:
xf = (
    df
     .filter(df.datasourceId=='crispr')
     .toPandas()
)

xf.head()

Unnamed: 0,biologicalModelAllelicComposition,variantId,diseaseFromSourceId,diseaseModelAssociatedModelPhenotypes,variantAminoacidDescriptions,clinicalStatus,mutatedSamples,cohortPhenotypes,drugId,variantFunctionalConsequenceId,...,cohortDescription,variantRsId,studyCases,clinicalUrls,diseaseId,studyId,studySampleSize,id,score,sourceId
0,,,EFO_0005922,,,,,,,,...,,,,,EFO_0005922,,,0c3ae08069a9f02e7c9bdf01b53ff1ef16eb2507,0.42625,crispr
1,,,EFO_0000305,,,,,,,,...,,,,,EFO_0000305,,,106f631b10cad601fd69d041b1ea027d0ab0f2fd,0.464688,crispr
2,,,EFO_0001378,,,,,,,,...,,,,,EFO_0001378,,,1a85e8f3af29b61c323d52d503a25772bca9b375,0.557813,crispr
3,,,EFO_0000222,,,,,,,,...,,,,,EFO_0000222,,,339addf13f461e55651ce4c566ef3b073dfff18a,0.448889,crispr
4,,,EFO_0001378,,,,,,,,...,,,,,EFO_0001378,,,38866d61fb0c0877d91b06a8b31e60547818a4cf,0.514063,crispr


In [14]:

xf.drop([col for col in xf.columns if not xf[col].any()], axis=1, inplace=True)
xf.iloc[1].to_dict()

{'diseaseFromSourceId': 'EFO_0000305',
 'resourceScore': 0.464688,
 'literature': ['30971826'],
 'targetId': 'ENSG00000138107',
 'targetFromSourceId': 'ENSG00000138107',
 'datasourceId': 'crispr',
 'datatypeId': 'affected_pathway',
 'diseaseFromSource': 'Breast Carcinoma',
 'diseaseCellLines': ['AU565',
  'COLO-824',
  'EVSA-T',
  'HCC1143',
  'HCC1187',
  'HCC1806',
  'HCC1937',
  'HCC1954',
  'HCC38',
  'Hs-578-T',
  'JIMT-1',
  'MCF7',
  'MDA-MB-361',
  'MDA-MB-415',
  'MDA-MB-436',
  'MDA-MB-453',
  'MFM-223',
  'OCUB-M',
  'UACC-893',
  'CAL-51',
  'HCC1395',
  'HCC70',
  'MDA-MB-231',
  'T47D',
  'MDA-MB-468'],
 'diseaseId': 'EFO_0000305',
 'id': '106f631b10cad601fd69d041b1ea027d0ab0f2fd',
 'score': 0.464688,
 'sourceId': 'crispr'}

In [10]:
xf.columns

Index(['biologicalModelAllelicComposition', 'variantId', 'diseaseFromSourceId',
       'diseaseModelAssociatedModelPhenotypes', 'variantAminoacidDescriptions',
       'clinicalStatus', 'mutatedSamples', 'cohortPhenotypes', 'drugId',
       'variantFunctionalConsequenceId', 'publicationYear', 'targetModulation',
       'resourceScore', 'cohortShortName', 'pathwayId', 'literature',
       'oddsRatio', 'log2FoldChangeValue', 'confidence',
       'diseaseModelAssociatedHumanPhenotypes', 'publicationFirstAuthor',
       'pValueExponent', 'pValueMantissa', 'targetId',
       'log2FoldChangePercentileRank', 'clinicalPhase', 'textMiningSentences',
       'cohortId', 'targetFromSourceId', 'datasourceId',
       'significantDriverMethods', 'reactionId', 'clinicalSignificances',
       'datatypeId', 'biologicalModelGeneticBackground', 'diseaseFromSource',
       'diseaseCellLines', 'allelicRequirements', 'confidenceIntervalLower',
       'contrast', 'studyOverview', 'confidenceIntervalUpper', 'ta