In [27]:
import os
import re

import pyspark.sql
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

def pheno_parser(x):
    """Split a phenotype string into its identifier and its label.

    The expected form is ``'<prefix>:<digits> <label>'``, for example
    ``'MP:0003091 abnormal cell migration'``.

    Args:
        x: The phenotype string. Anything that is not a ``str`` (including
            ``None``) is treated as unparsable.

    Returns:
        dict: ``{'mp_id': ..., 'mp_term': ...}``. Both values are ``None``
        when ``x`` is not a string or does not match the expected form.
    """
    # Guard first: re.match raises TypeError on None / non-string input.
    if not isinstance(x, str):
        return {'mp_id': None, 'mp_term': None}

    match = re.match(r'(\D+:\d+) (.+)', x)
    if match is None:
        return {'mp_id': None, 'mp_term': None}

    # Both groups are non-empty whenever the pattern matches.
    return {'mp_id': match.group(1), 'mp_term': match.group(2)}

# Spark UDF around `pheno_parser`; the returned dict is exposed as a
# string -> string map with the keys 'mp_id' and 'mp_term'.
parse_phenotypes = udf(pheno_parser, MapType(StringType(), StringType()))

# SparkContext.setSystemProperty('spark.executor.memory', '20g')

# Create (or reuse) the Spark session used by the cells below.
# `spark` is assigned at module level, so no `global` statement is needed.
spark = (
    pyspark.sql.SparkSession
    .builder
    .appName("phenodigm_parser")
    .config("spark.executor.memory", '10g')
    .config("spark.driver.memory", '10g')
    .getOrCreate()
)

#   .config("spark.driver.bindAddress", "localhost")

print('Spark version: ', spark.version)

# solr data folder.
# Set the PHENODIGM_SOLR_DATA environment variable to point somewhere else;
# the default is the location that was previously hard-coded here.
# Another location used before: '/Users/dsuveges/project_data/phenodigm_solr_dump'
solr_data_folder = os.environ.get('PHENODIGM_SOLR_DATA', '/home/dsuveges/project/test_data')

"""
The steps below 
 * read the gene set
 * filter for human genes (rows with hgnc_id)
 * join with gene-gene linking table

Resulting table has 
 * human HGNC gene IDs
 * human HGNC gene symbol
 * mouse MGI gene id
"""

# Human genes: only rows of `type.gene` that carry an HGNC identifier.
human_genes = (
    spark.read.json(f'{solr_data_folder}/type.gene')
    .where(col('hgnc_gene_id').isNotNull())
    .select('hgnc_gene_id', 'hgnc_gene_symbol')
)

# Gene-gene links restricted to the human genes above; the inner join adds
# `hgnc_gene_symbol` next to the `hgnc_gene_id` / `gene_id` pair.
genes_table = (
    spark.read.json(f'{solr_data_folder}/type.gene_gene')
    .select('hgnc_gene_id', 'gene_id')
    .join(human_genes, 'hgnc_gene_id', 'inner')
)

"""
The ontology table contains the mapping between human and mouse
phenotypes. Only mouse phenotypes that have a human counterpart are included.

The mouse phenotype term is not included - that value comes from the models table
"""
# Human (hp_*) to mouse (mp_id) phenotype mapping; `mp_term` is left out on purpose.
ontolgy_table = (
    spark.read.json(f'{solr_data_folder}/type.ontology_ontology')
    .select(['hp_id', 'hp_term', 'mp_id'])
)

# Each print triggers its own Spark job over the mapping table.
print(f"Number of human phenotypes: {ontolgy_table.select('hp_id').distinct().count()}")
print(f"Number of mouse phenotypes: {ontolgy_table.select('mp_id').distinct().count()}")
print(f"Number of human to mouse phenotype mappings: {ontolgy_table.count()}")


"""
The step below
 * reads mouse model table.
 * joins with genes table

Resulting table has
 * human HGNC gene IDs
 * human HGNC gene symbol
 * mouse MGI gene id
 * mouse model identifier
 * mouse model phenotype list
"""
# One row per (model, mouse gene, human gene) with two collected phenotype sets:
#  * diseaseModelAssociatedModelPhenotypes - mouse phenotypes parsed from `model_phenotypes`
#  * diseaseModelAssociatedHumanPhenotypes - human phenotypes mapped through `ontolgy_table`
mouse_model_table = (
    spark.read.json(f'{solr_data_folder}/type.mouse_model')
    .select('model_id', 'model_phenotypes', 'marker_id')
    .withColumnRenamed('marker_id', 'gene_id')
    # One row per phenotype string, then split it into id / term.
    .withColumn('model_phenotype', explode(col('model_phenotypes')))
    .withColumn('parsed_phenotype', parse_phenotypes(col('model_phenotype')))
    # This select keeps only the needed columns, so no separate drop() is required.
    .select('model_id', 'gene_id', 'parsed_phenotype.mp_id', 'parsed_phenotype.mp_term')
    # Left join: mouse phenotypes without a human counterpart are kept (hp_* stay null).
    .join(ontolgy_table, on='mp_id', how='left')
    .join(genes_table, on='gene_id', how='inner')
    .groupby('model_id', 'gene_id', 'hgnc_gene_id', 'hgnc_gene_symbol')
    .agg(
        # collect_set skips nulls, so wrapping the struct in when(...) keeps
        # {id: null, label: null} entries (unparsable phenotype strings or
        # phenotypes without a human mapping) out of the resulting arrays.
        collect_set(
            when(
                col('mp_id').isNotNull(),
                struct(
                    col('mp_id').alias('id'),
                    col('mp_term').alias('label')
                )
            )
        ).alias('diseaseModelAssociatedModelPhenotypes'),
        collect_set(
            when(
                col('hp_id').isNotNull(),
                struct(
                    col('hp_id').alias('id'),
                    col('hp_term').alias('label')
                )
            )
        ).alias('diseaseModelAssociatedHumanPhenotypes')
    )
)
mouse_model_table.show()

# """
# The step below:
#  * Opening disease model summary - table with disease terms for every model
#  * Joined with mouse model table.

# Resulting table adds the following columns:
#  * disease_id
#  * disease_model_avg_raw
#  * disease_model_max_norm
#  * disease_model_max_raw
#  * disease_term
#  * marker_id
#  * marker_num_models
#  * model_description
#  * model_genetic_background
#  * model_id
# """

# # For now, I'm not sure whether it makes sense to do the aggregation.
# # Aggregation was required because the data was stored in dictionaries.
# Disease-model summary without the listed columns, joined to the
# per-model phenotype table on `model_id` (inner join).
disease_model_table = (
    spark.read.json(f'{solr_data_folder}/type.disease_model_summary/')
    .drop(
        'association_curated',
        'marker_locus',
        'marker_symbol',
        'model_source',
        'type',
        'disease_model_avg_norm',
    )
    .join(mouse_model_table, 'model_id', 'inner')
)


# (
#     disease_model_table
#     .select('association_curated', 'disease_id', 'disease_term', 'marker_id', 'marker_num_models','model_description', 'model_genetic_background', 'model_id')
#     .where(col('model_id') == 'MGI:2678495')
#     .show()
# )



# f = (
#     disease_model_table
#     .join(mouse_model_table, on='model_id', how='inner')
# )

# print(models_diseases_joined.count())
# print(models_diseases_joined.show())
# disease_model_table.show()


# mouse_model_table.show()

Spark version:  3.0.0
Number of human phenotypes: 12841
Number of mouse phenotypes: 10536
Number of human to mouse phenotype mappings: 477198


Py4JJavaError: An error occurred while calling o1719.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 250.0 failed 1 times, most recent failure: Lost task 0.0 in stage 250.0 (TID 7909, c02zq14flvdn, executor driver): java.lang.RuntimeException: Duplicate map key MP:0002498 was found, please check the input data. If you want to remove the duplicated keys, you can set spark.sql.mapKeyDedupPolicy to LAST_WIN so that the key inserted at last takes precedence.
	at org.apache.spark.sql.catalyst.util.ArrayBasedMapBuilder.put(ArrayBasedMapBuilder.scala:72)
	at org.apache.spark.sql.catalyst.util.ArrayBasedMapBuilder.put(ArrayBasedMapBuilder.scala:87)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:259)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:86)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:33)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage9.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2133)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:467)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3625)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2695)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3616)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3614)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2695)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2902)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.RuntimeException: Duplicate map key MP:0002498 was found, please check the input data. If you want to remove the duplicated keys, you can set spark.sql.mapKeyDedupPolicy to LAST_WIN so that the key inserted at last takes precedence.
	at org.apache.spark.sql.catalyst.util.ArrayBasedMapBuilder.put(ArrayBasedMapBuilder.scala:72)
	at org.apache.spark.sql.catalyst.util.ArrayBasedMapBuilder.put(ArrayBasedMapBuilder.scala:87)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:259)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:86)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:33)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage9.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [5]:

# Load the `type.ontology` dump and print its row count next to the number of
# distinct `phenotype_id` values.
ontology_table = spark.read.json(f'{solr_data_folder}/type.ontology/')

print(ontology_table.count())
print(ontology_table.select('phenotype_id').distinct().count())


29310
29310


In [19]:
"""
Reading disease
"""
# Disease records reduced to identifier, phenotype list and term.
disease_table = spark.read.json(f'{solr_data_folder}/type.disease/').select(
    'disease_id', 'disease_phenotypes', 'disease_term'
)

4797

In [1]:
# Disease-gene summary. Keep the DataFrame in the variable and call show()
# separately: show() returns None, so chaining it onto the assignment would
# leave `disease_gene_table` unusable for later queries.
disease_gene_table = spark.read.json(f'{solr_data_folder}/type.disease_gene_summary/')
disease_gene_table.show()


disease_table = (
    spark.read.json(f'{solr_data_folder}/type.disease')
)

disease_table.show()

ontolgy_table = (
    spark.read.json(f'{solr_data_folder}/type.ontology')
    .select('phenotype_id', 'phenotype_term')
)

ontolgy_table.show()


# Read the mapping table here so this cell does not depend on another cell
# having defined `ontolgy_ontology_table` beforehand.
ontolgy_ontology_table = (
    spark.read.json(f'{solr_data_folder}/type.ontology_ontology')
)

ontolgy_ontology_table.show()

6

In [None]:
# Distinct gene symbols recorded for OMIM:101600 in the disease-gene summary.
(
    disease_gene_table
    .where(col('disease_id') == 'OMIM:101600')
    .select('hgnc_gene_symbol')
    .dropDuplicates()
    .show()
)

# The same lookup against the disease-model table, sorted by symbol.
(
    disease_model_table
    .where(col('disease_id') == 'OMIM:101600')
    .select('hgnc_gene_symbol')
    .dropDuplicates()
    .orderBy('hgnc_gene_symbol')
    .show()
)



In [7]:
"""
The ontology table contains the mapping between human and mouse
phenotypes.
"""
# Human-to-mouse phenotype mapping, this time including the mouse term.
ontolgy_table = (
    spark.read.json(f'{solr_data_folder}/type.ontology_ontology')
    .select(['hp_id', 'hp_term', 'mp_id', 'mp_term'])
)

# Each print triggers its own Spark job over the mapping table.
print(f"Number of human phenotypes: {ontolgy_table.select('hp_id').distinct().count()}")
print(f"Number of mouse phenotypes: {ontolgy_table.select('mp_id').distinct().count()}")
print(f"Number of human to mouse phenotype mappings: {ontolgy_table.count()}")



Number of human phenotypes: 12841
Number of mouse phenotypes: 10536
Number of human to mouse phenotype mappings: 477198


In [8]:
# Preview whatever `ontolgy_table` currently holds (show() prints the top rows only).
ontolgy_table.show()




+----------+--------------------+----------+--------------------+
|     hp_id|             hp_term|     mp_id|             mp_term|
+----------+--------------------+----------+--------------------+
|HP:0006704|Abnormal coronary...|MP:0002845|abnormal aortic w...|
|HP:0006704|Abnormal coronary...|MP:0002844|  aortic hypertrophy|
|HP:0006704|Abnormal coronary...|MP:0002725|abnormal vein mor...|
|HP:0006704|Abnormal coronary...|MP:0005339|increased suscept...|
|HP:0006704|Abnormal coronary...|MP:0005338|atherosclerotic l...|
|HP:0006704|Abnormal coronary...|MP:0012255|abnormal intersom...|
|HP:0006705|Abnormal atrioven...|MP:0011680|tricuspid valve h...|
|HP:0006705|Abnormal atrioven...|MP:0010484|bicuspid aortic v...|
|HP:0006705|Abnormal atrioven...|MP:0010596|unicuspid aortic ...|
|HP:0006705|Abnormal atrioven...|MP:0010628|patent tricuspid ...|
|HP:0006705|Abnormal atrioven...|MP:0010629|thick tricuspid v...|
|HP:0006705|Abnormal atrioven...|MP:0010627|enlarged tricuspi...|
|HP:000670

In [9]:
def udf_test(n):
    """Return ``(n / 2, n % 2)`` for a truthy, non-zero ``n``.

    For any falsy ``n`` (``None``, ``0``, ``0.0``, ...) a pair of NaNs is
    returned instead.
    """
    if n and n != 0.0:
        return (n / 2, n % 2)
    return (float('nan'), float('nan'))

2

In [2]:
# Sample phenotype strings in the '<MP id> <label>' form, used to try out the parsing regex.
phenotypes = ['MP:0002825 abnormal notochord morphology',
 'MP:0005221 abnormal rostral-caudal axis patterning',
 'MP:0003091 abnormal cell migration',
 'MP:0001680 abnormal mesoderm development',
 'MP:0001675 abnormal ectoderm development',
 'MP:0011260 abnormal head mesenchyme morphology',
 'MP:0012104 small amniotic cavity',
 'MP:0004787 abnormal dorsal aorta morphology',
 'MP:0001698 decreased embryo size',
 'MP:0000272 abnormal aorta morphology',
 'MP:0004180 failure of initiation of embryo turning',
 'MP:0010656 thick myocardium',
 'MP:0001614 abnormal blood vessel morphology',
 'MP:0002128 abnormal blood circulation',
 'MP:0002086 abnormal extraembryonic tissue morphology',
 'MP:0003974 abnormal endocardium morphology',
 'MP:0004261 abnormal embryonic neuroepithelium morphology',
 'MP:0005029 abnormal amnion morphology',
 'MP:0000267 abnormal heart development',
 'MP:0010547 abnormal mesocardium morphology',
 'MP:0009657 failure of chorioallantoic fusion',
 'MP:0012131 small visceral yolk sac',
 'MP:0001726 abnormal allantois morphology',
 'MP:0000474 abnormal foregut morphology',
 'MP:0002085 abnormal embryonic tissue morphology',
 'MP:0001914 hemorrhage',
 'MP:0002084 abnormal developmental patterning',
 'MP:0010664 abnormal vitelline artery morphology',
 'MP:0011257 abnormal head fold morphology',
 'MP:0011098 embryonic lethality during organogenesis, complete penetrance',
 'MP:0001688 abnormal somite development',
 'MP:0011201 abnormal visceral yolk sac cavity morphology',
 'MP:0000358 abnormal cell morphology']

import re 

# Pick one sample string to experiment with; the bare `x` makes it the cell's output.
x = phenotypes[2]
x


'MP:0003091 abnormal cell migration'

In [9]:
b = 

In [12]:
# Lambda-based attempt at the phenotype UDF (the name is rebound further down
# in this cell). The string is matched once and both groups come from the same
# pattern, so an identifier with a prefix other than 'MP' no longer yields an
# id and then fails on the name lookup.
parse_phenotypes = udf(
    lambda x: dict(zip(('id', 'name'), re.match(r'(\D+:\d+) (.+)', x).groups())),
)

def pheno_parser(x):
    """Split a phenotype string into its identifier and its label.

    The expected form is ``'<prefix>:<digits> <label>'``, for example
    ``'MP:0003091 abnormal cell migration'``.

    Args:
        x: The phenotype string. Anything that is not a ``str`` (including
            ``None``) is treated as unparsable.

    Returns:
        dict: ``{'mp_id': ..., 'mp_term': ...}``. Both values are ``None``
        when ``x`` is not a string or does not match the expected form.
    """
    # Guard first: re.match raises TypeError on None / non-string input.
    if not isinstance(x, str):
        return {'mp_id': None, 'mp_term': None}

    match = re.match(r'(\D+:\d+) (.+)', x)
    if match is None:
        # Return explicit nulls instead of printing the offending value.
        return {'mp_id': None, 'mp_term': None}

    return {
        'mp_id': match.group(1),
        'mp_term': match.group(2),
    }

# Declare the struct fields explicitly: an empty StructType() has no fields,
# so `parsed_phenotype.mp_id` / `parsed_phenotype.mp_term` could not be selected.
parse_phenotypes = udf(
    pheno_parser,
    StructType([
        StructField('mp_id', StringType(), True),
        StructField('mp_term', StringType(), True),
    ]),
)





In [47]:
# Variant 1: first 1000 models only; the MP id is the first space-separated
# token of each phenotype string, then mapped through the ontology table.
mouse_model_table = (
    spark.read.json(f'{solr_data_folder}/type.mouse_model')
    .limit(1000)
    .select('model_id', 'model_phenotypes', col('marker_id').alias('gene_id'))
    .join(genes_table, 'gene_id', 'inner')
    .withColumn('model_phenotype', explode('model_phenotypes'))
    .withColumn('mp_id', split('model_phenotype', ' ')[0])
    .join(ontolgy_ontology_table, 'mp_id', 'left')
)
pdf = mouse_model_table.toPandas()
pdf.head()

# Variant 2: all models; id and term come from the `parse_phenotypes` UDF.
# Both `mouse_model_table` and `pdf` are rebound here.
mouse_model_table = (
    spark.read.json(f'{solr_data_folder}/type.mouse_model')
    .select('model_id', 'model_phenotypes', col('marker_id').alias('gene_id'))
    .withColumn('model_phenotype', explode('model_phenotypes'))
    .withColumn('parsed_phenotype', parse_phenotypes('model_phenotype'))
    .select('model_id', 'gene_id', 'parsed_phenotype.mp_id', 'parsed_phenotype.mp_term')
)
mouse_model_table.show()

pdf = mouse_model_table.toPandas()
pdf.head()




ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/dsuveges/opt/anaconda3/envs/jupyter/lib/python3.6/site-packages/py4j/java_gateway.py", line 1200, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/dsuveges/opt/anaconda3/envs/jupyter/lib/python3.6/socket.py", line 586, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 54] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/dsuveges/opt/anaconda3/envs/jupyter/lib/python3.6/site-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/Users/dsuveges/opt/anaconda3/envs/jupyter/lib/python3.6/site-packages/py4j/java_gateway.py", line 1212, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling o23.read

In [18]:
# Raw string: '\D' and '\d' are regex classes, not Python string escapes.
match = re.match(r'(\D+:\d+) (.+)', x)

In [30]:

# Phenotype id / term pairs from the `type.ontology` dump.
ontolgy_table = spark.read.json(f'{solr_data_folder}/type.ontology').select(
    'phenotype_id', 'phenotype_term'
)

ontolgy_table.show()


+------------+--------------------+
|phenotype_id|      phenotype_term|
+------------+--------------------+
|  HP:0000001|                 All|
|  HP:0000002|Abnormality of bo...|
|  HP:0000003|Multicystic kidne...|
|  HP:0000005| Mode of inheritance|
|  HP:0000006|Autosomal dominan...|
|  HP:0000007|Autosomal recessi...|
|  HP:0000008|Abnormal morpholo...|
|  HP:0000009|Functional abnorm...|
|  HP:0000010|Recurrent urinary...|
|  HP:0000011|  Neurogenic bladder|
|  HP:0000012|     Urinary urgency|
|  HP:0000013|Hypoplasia of the...|
|  HP:0000014|Abnormality of th...|
|  HP:0000015|Bladder diverticulum|
|  HP:0000016|   Urinary retention|
|  HP:0000017|            Nocturia|
|  HP:0000019|   Urinary hesitancy|
|  HP:0000020|Urinary incontinence|
|  HP:0000021|          Megacystis|
|  HP:0000022|Abnormality of ma...|
+------------+--------------------+
only showing top 20 rows



In [32]:
# Full `type.ontology_ontology` dump, all columns.
ontolgy_ontology_table = spark.read.json(f'{solr_data_folder}/type.ontology_ontology')

ontolgy_ontology_table.show()

+----------+--------------------+----------+--------------------+-----------------+
|     hp_id|             hp_term|     mp_id|             mp_term|             type|
+----------+--------------------+----------+--------------------+-----------------+
|HP:0006704|Abnormal coronary...|MP:0002845|abnormal aortic w...|ontology_ontology|
|HP:0006704|Abnormal coronary...|MP:0002844|  aortic hypertrophy|ontology_ontology|
|HP:0006704|Abnormal coronary...|MP:0002725|abnormal vein mor...|ontology_ontology|
|HP:0006704|Abnormal coronary...|MP:0005339|increased suscept...|ontology_ontology|
|HP:0006704|Abnormal coronary...|MP:0005338|atherosclerotic l...|ontology_ontology|
|HP:0006704|Abnormal coronary...|MP:0012255|abnormal intersom...|ontology_ontology|
|HP:0006705|Abnormal atrioven...|MP:0011680|tricuspid valve h...|ontology_ontology|
|HP:0006705|Abnormal atrioven...|MP:0010484|bicuspid aortic v...|ontology_ontology|
|HP:0006705|Abnormal atrioven...|MP:0010596|unicuspid aortic ...|ontology_on

In [41]:
# Toy frame with a single array column to check what getItem(0) returns.
df = spark.createDataFrame([[['A', 'B', 'C', 'D']], [['E', 'F']]], ['split'])
df.show()
# +------------+
# |       split|
# +------------+
# |[A, B, C, D]|
# |      [E, F]|
# +------------+

# import pyspark.sql.functions as F
# getItem(0) selects the FIRST element of each array, so the column is
# labelled accordingly (it was previously labelled 'lastItem').
df.withColumn('firstItem', col('split').getItem(0)).show()
# +------------+---------+
# |       split|firstItem|
# +------------+---------+
# |[A, B, C, D]|        A|
# |      [E, F]|        E|
# +------------+---------+


+------------+
|       split|
+------------+
|[A, B, C, D]|
|      [E, F]|
+------------+

+------------+--------+
|       split|lastItem|
+------------+--------+
|[A, B, C, D]|       A|
|      [E, F]|       E|
+------------+--------+



In [29]:
from pyspark.sql import Row

# Small (id, key, value) frame used to try out the aggregation patterns below.
toy_data = spark.createDataFrame([
    Row(id=row_id, key=row_key, value=row_value)
    for row_id, row_key, row_value in (
        (1, 'a', "123"),
        (1, 'b', "234"),
        (1, 'c', "345"),
        (2, 'a', "12"),
        (2, 'x', "23"),
        (2, 'y', "123"),
    )
])

toy_data.show()

# +---+---+-----+
# | id|key|value|
# +---+---+-----+
# |  1|  a|  123|
# |  1|  b|  234|
# |  1|  c|  345|
# |  2|  a|   12|
# |  2|  x|   23|
# |  2|  y|  123|
# +---+---+-----+

+---+---+-----+
| id|key|value|
+---+---+-----+
|  1|  a|  123|
|  1|  b|  234|
|  1|  c|  345|
|  2|  a|   12|
|  2|  x|   23|
|  2|  y|  123|
+---+---+-----+



In [54]:
# Per id, collect the distinct (key, value) pairs and turn them into a map column.
# NOTE(review): map_from_entries rejects duplicate keys under Spark's default
# spark.sql.mapKeyDedupPolicy - fine for this toy data, where keys are unique per id.
upd = toy_data.groupBy('id').agg(
    map_from_entries(
        collect_set(struct(col("key").alias('id'), col('value').alias('value')))
    ).alias('key_value')
)

In [60]:
# Display the aggregated frame. The commented-out line would pretty-print its
# schema as JSON (NOTE(review): `json` is not imported in the cells shown here).
#print(json.dumps(json.loads(upd.schema.json()),indent=2))
upd.show()

+---+--------------------+
| id|           key_value|
+---+--------------------+
|  1|[c -> 345, a -> 1...|
|  2|[y -> 123, x -> 2...|
+---+--------------------+



In [72]:
# Per id, collect the distinct (key, value) pairs as an array of structs
# with the fields `id` and `value`.
upd = toy_data.groupBy('id').agg(
    collect_set(
        struct(col("key").alias('id'), col('value').alias('value'))
    ).alias('col_header')
)

In [74]:
# Overwrite on re-run: with the default save mode the write fails as soon as
# the 'cicaful1.json' output directory already exists.
upd.write.mode('overwrite').json('cicaful1.json')


In [75]:
%%bash

# Print the JSON part files inside the cicaful1.json output directory.
cat cicaful1.json/*json

{"id":1,"col_header":[{"id":"c","value":"345"},{"id":"a","value":"123"},{"id":"b","value":"234"}]}
{"id":2,"col_header":[{"id":"y","value":"123"},{"id":"x","value":"23"},{"id":"a","value":"12"}]}
