In [140]:
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext
import re

# Build (or reuse) the Spark session shared by every cell of this notebook.
# The former `global spark` statement was dropped: at cell/module scope it is
# a no-op, because the assignment below already binds a module-level name.

# Disabled by the original author (executor memory override):
# SparkContext.setSystemProperty('spark.executor.memory', '20g')

spark = (
    pyspark.sql.SparkSession
    .builder
    .appName("phenodigm_parser")
    .master("local[*]")                               # local mode, all cores
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.sql.broadcastTimeout", "36000")    # broadcast-join wait, in seconds
    .getOrCreate()
)


# Report the Spark version the session is running on.
print('Spark version: ', spark.version)

Spark version:  3.0.0


## Parsing `type == 'gene'`

This is the workflow:

```Python
if doc['type'] == 'gene':
    # this is a human gene
    if 'hgnc_gene_id' in doc:
        hgnc_gene_id = doc['hgnc_gene_id']
        hgnc_gene_symbol = doc['hgnc_gene_symbol']
        self.symbol2hgncids[hgnc_gene_symbol] = hgnc_gene_id
    elif 'gene_id' in doc:
        gene_id = doc['gene_id']
        gene_symbol = doc['gene_symbol']
        self.mgi2symbols[gene_id] = gene_symbol
                            
```

In [141]:
# Directory holding the JSON documents with `type == 'gene'`.
gene_json_files = 'cicaful/type.gene'


# Load the gene documents, keep the HGNC id/symbol pair and discard every
# document that carries no HGNC id (mirrors the `'hgnc_gene_id' in doc`
# branch of the reference workflow).
gene_documents = spark.read.json(gene_json_files)
hgnc_pairs = gene_documents.select('hgnc_gene_id', 'hgnc_gene_symbol')
genes_table = hgnc_pairs.where(col('hgnc_gene_id').isNotNull())

# Preview the first rows, then display the total row count as the cell result.
genes_table.show()
genes_table.count()


+------------+----------------+
|hgnc_gene_id|hgnc_gene_symbol|
+------------+----------------+
|      HGNC:5|            A1BG|
|  HGNC:37133|        A1BG-AS1|
|  HGNC:24086|            A1CF|
|      HGNC:7|             A2M|
|  HGNC:27057|         A2M-AS1|
|  HGNC:23336|           A2ML1|
|  HGNC:41022|       A2ML1-AS1|
|  HGNC:41523|       A2ML1-AS2|
|      HGNC:8|           A2MP1|
|  HGNC:30005|         A3GALT2|
|  HGNC:18149|          A4GALT|
|  HGNC:17968|           A4GNT|
|  HGNC:13666|            AAAS|
|  HGNC:21298|            AACS|
|  HGNC:18226|          AACSP1|
|     HGNC:17|           AADAC|
|  HGNC:24427|         AADACL2|
|  HGNC:50301|     AADACL2-AS1|
|  HGNC:32037|         AADACL3|
|  HGNC:32038|         AADACL4|
+------------+----------------+
only showing top 20 rows



42346

## Parsing `type == 'gene_gene'`

Workflow:

```Python
elif doc['type'] == 'gene_gene':
    # gene_id # hgnc_gene_id
    hgnc_gene_id = doc['hgnc_gene_id']
    gene_id = doc['gene_id']
    if hgnc_gene_id and not hgnc_gene_id in self.hgnc2mgis:
        self.hgnc2mgis[hgnc_gene_id] = []
    self.hgnc2mgis[hgnc_gene_id].append(gene_id)
```

In [142]:
# Directory holding the JSON documents with `type == 'gene_gene'`.
gene_gene_json_files = 'cicaful/type.gene_gene'

# The reference workflow grouped gene ids per HGNC id only because it stored
# the mapping in dictionaries; whether that aggregation is still useful here
# is an open question, so the table is kept flat (one row per id pair).
gene_gene_documents = spark.read.json(gene_gene_json_files)
gene_gene_table = gene_gene_documents.select('hgnc_gene_id', 'gene_id')

## Parsing `type == 'mouse_model'`

Workflow:

```Python
elif doc['type'] == 'mouse_model':
    marker_symbol = doc['marker_symbol']
    marker_id = doc['marker_id']
    model_id = doc['model_id']

    # if there is a mouse model then add the marker id
    if marker_id not in self.mgi2symbols:
        self.mgi2symbols[marker_id] = marker_symbol

    if not marker_symbol in self.mgi2mouse_models:
        self.mgi2mouse_models[marker_symbol] = []
    self.mgi2mouse_models[marker_symbol].append(model_id)

    if not model_id in self.mouse_models:
        self.mouse_models[model_id] = doc

        model_phenotypes = []
        for raw_mp in doc['model_phenotypes']:
            mt = re.match("^(MP\:\d+)\s+", raw_mp)
            if mt:
                mp_id = mt.groups()[0]
                model_phenotypes.append(mp_id)
        self.mouse_models[model_id]['model_phenotypes'] = model_phenotypes
```

In [143]:
# Directory holding the JSON documents with `type == 'mouse_model'`.
mouse_model_json_files = 'cicaful/type.mouse_model'

# A phenotype string is expected to start with an MP identifier followed by
# whitespace, e.g. "MP:0002160 abnorm...". Raw string: the previous non-raw
# literal relied on invalid escape sequences (`\:`, `\d`, `\s`).
MP_ID_PATTERN = r"^(MP:\d+)\s+"


def _extract_mp_id(raw_phenotype):
    """Return the leading ``MP:<digits>`` id of a phenotype string.

    Returns None when the value is null or does not start with an MP id.
    The previous lambda called ``.groups()`` directly on the match object and
    therefore raised AttributeError (failing the whole Spark job) on the
    first non-matching value; the reference workflow guards with ``if mt:``.
    """
    if raw_phenotype is None:
        return None
    matched = re.match(MP_ID_PATTERN, raw_phenotype)
    return matched.group(1) if matched else None


# Spark UDF wrapper; yields a nullable string column.
get_id = udf(_extract_mp_id, StringType())


# For now, I'm not sure if it makes sense to do the aggregation.
# Aggregation was required because the data was stored in dictionaries.
mouse_model_table = (
    spark.read.json(mouse_model_json_files)
    .select('marker_id', 'marker_symbol', 'model_id', 'model_phenotypes')  # marker id = mouse gene id
    .na.drop(subset=["marker_id"])                                     # models without a marker are discarded
    .withColumn('model_phenotype', explode(col('model_phenotypes')))   # one row per (model, phenotype)
    .withColumn('model_phenotype_id', get_id(col('model_phenotype')))  # null when no MP id is found
    .drop('model_phenotypes')
)
mouse_model_table.show()
mouse_model_table.count()

+-----------+-------------+-----------+--------------------+------------------+
|  marker_id|marker_symbol|   model_id|     model_phenotype|model_phenotype_id|
+-----------+-------------+-----------+--------------------+------------------+
|MGI:1096566|        Pias2|MGI:2678495|MP:0002160 abnorm...|        MP:0002160|
|MGI:1096566|        Pias2|MGI:2678495|MP:0004852 decrea...|        MP:0004852|
|MGI:1096566|        Pias2|MGI:2678495|MP:0002687 oligoz...|        MP:0002687|
|MGI:1096566|        Pias2|MGI:2678495|MP:0004884 abnorm...|        MP:0004884|
|MGI:1096566|        Pias2|MGI:2678495|MP:0001153 small ...|        MP:0001153|
|MGI:1096566|        Pias2|MGI:2678495|MP:0006042 increa...|        MP:0006042|
|MGI:1096566|        Pias2|MGI:2678495|MP:0001146 abnorm...|        MP:0001146|
|  MGI:94909|          Dmd|MGI:2678497|MP:0002169 no abn...|        MP:0002169|
|MGI:1354961|        Synj1|MGI:2678499|MP:0011089 perina...|        MP:0011089|
|MGI:1927868|        Pex14|MGI:2678502|M

250144

## Parsing `type == 'disease_model_summary'`

Workflow:

```Python
elif doc['type'] == 'disease_model_summary':
    model_id = doc['model_id']
    if not model_id in self.mouse_model2diseases:
        self.mouse_model2diseases[model_id] = []
    self.mouse_model2diseases[model_id].append(doc)
```

In [144]:
# Directory holding the JSON documents with `type == 'disease_model_summary'`.
disease_model_summary_json_files = 'cicaful/type.disease_model_summary/'

# All columns are loaded as-is. The reference workflow grouped the documents
# per model id only because it kept them in dictionaries; whether that
# aggregation is needed here is still undecided.
disease_model_table = spark.read.json(disease_model_summary_json_files)

# Example lookup of a single model, kept for reference:
# (
#     disease_model_table
#     .select('association_curated', 'disease_id', 'disease_term', 'marker_id', 'marker_num_models','model_description', 'model_genetic_background', 'model_id')
#     .where(col('model_id') == 'MGI:2678495')
#     .show()
# )

# Overview: column names, total number of rows, number of distinct model ids.
distinct_model_ids = disease_model_table.select('model_id').distinct()

print(disease_model_table.columns)
print(disease_model_table.count())
print(distinct_model_ids.count())

['association_curated', 'disease_id', 'disease_model_avg_norm', 'disease_model_avg_raw', 'disease_model_max_norm', 'disease_model_max_raw', 'disease_term', 'marker_id', 'marker_locus', 'marker_num_models', 'marker_symbol', 'model_description', 'model_genetic_background', 'model_id', 'model_source', 'type']
7016415
40280


In [145]:
# Distinct values of the `type` column. `show()` prints the table itself and
# returns None, so it must not be wrapped in print() (that printed a stray
# "None" after the table).
disease_model_table.select('type').distinct().show()
# Number of distinct markers referenced by the disease/model summaries.
print(disease_model_table.select('marker_id').distinct().count())

+--------------------+
|                type|
+--------------------+
|disease_model_sum...|
+--------------------+

None
12735


## Merging genes


Merging the `human_genes`, `mouse_genes` and `gene_gene` tables.


In [146]:
# Row counts of the tables involved in the gene merge, printed one per line.
# NOTE(review): `genes_joined`, `mouse_genes` and `human_genes` are not defined
# in any cell visible in this notebook — this cell presumably relies on
# leftover kernel state and would fail after Restart & Run All; confirm.
print(genes_joined.count())
print(mouse_genes.count())
print(human_genes.count())
print(gene_gene_table.count())
# print(mouse_model_table.count())

25163
303771
42346
25168


In [147]:
# Pull the whole `gene_id` column to the driver and inspect one row by position.
# NOTE(review): `genes_joined` is not defined in any visible cell; also, row
# order of a collected DataFrame is presumably not guaranteed — confirm the
# index is meaningful.
gene_id_rows = genes_joined.select("gene_id").collect()
gene_id_rows[17623]

Row(gene_id='MGI:1913917')

In [106]:
# Number of distinct model ids present in the mouse model table.
distinct_mouse_model_ids = mouse_model_table.select('model_id').distinct()
distinct_mouse_model_ids.count()

52773

In [108]:
# Total number of rows in `mouse_model_table`.
# NOTE(review): `mouse_model_table` is bound by more than one cell in this
# notebook (exploded and raw variants), so the result depends on which cell
# ran last — confirm which variant is intended here.
mouse_model_table.count()

250144

In [92]:
# Attach human gene information to every mouse model row:
#   1. mouse model -> gene_gene mapping (mouse marker id == mapped gene id)
#   2. mapping -> human gene table (on the HGNC id)
# Inner joins: models whose marker has no mapped human gene are discarded.
models_for_human_genes = (
    mouse_model_table
    .join(gene_gene_table, mouse_model_table.marker_id == gene_gene_table.gene_id, how='inner')
    .join(genes_table, on='hgnc_gene_id', how='inner')
)

# The count used to be computed and silently thrown away (only the last
# expression of a cell is displayed); print it so the job is not wasted.
print(models_for_human_genes.count())
models_for_human_genes.show()

+------------+-----------+-------------+-----------+--------------------+------------------+-----------+----------------+
|hgnc_gene_id|  marker_id|marker_symbol|   model_id|     model_phenotype|model_phenotype_id|    gene_id|hgnc_gene_symbol|
+------------+-----------+-------------+-----------+--------------------+------------------+-----------+----------------+
|  HGNC:17311|MGI:1096566|        Pias2|MGI:2678495|MP:0002160 abnorm...|        MP:0002160|MGI:1096566|           PIAS2|
|  HGNC:17311|MGI:1096566|        Pias2|MGI:2678495|MP:0004852 decrea...|        MP:0004852|MGI:1096566|           PIAS2|
|  HGNC:17311|MGI:1096566|        Pias2|MGI:2678495|MP:0002687 oligoz...|        MP:0002687|MGI:1096566|           PIAS2|
|  HGNC:17311|MGI:1096566|        Pias2|MGI:2678495|MP:0004884 abnorm...|        MP:0004884|MGI:1096566|           PIAS2|
|  HGNC:17311|MGI:1096566|        Pias2|MGI:2678495|MP:0001153 small ...|        MP:0001153|MGI:1096566|           PIAS2|
|  HGNC:17311|MGI:109656

In [102]:
# Show every disease/model summary row that belongs to one specific model.
inspected_model_id = 'MGI:2585941'
disease_model_table.where(col('model_id') == inspected_model_id).show()

+-------------------+----------+----------------------+---------------------+----------------------+---------------------+------------+---------+------------+-----------------+-------------+-----------------+------------------------+--------+------------+----+
|association_curated|disease_id|disease_model_avg_norm|disease_model_avg_raw|disease_model_max_norm|disease_model_max_raw|disease_term|marker_id|marker_locus|marker_num_models|marker_symbol|model_description|model_genetic_background|model_id|model_source|type|
+-------------------+----------+----------------------+---------------------+----------------------+---------------------+------------+---------+------------+-----------------+-------------+-----------------+------------------------+--------+------------+----+
+-------------------+----------+----------------------+---------------------+----------------------+---------------------+------------+---------+------------+-----------------+-------------+-----------------+---------

In [75]:
# Number of distinct human (HGNC) gene ids in the joined models table.
# NOTE(review): `models_table_joined` is not defined in any visible cell —
# possibly an earlier name of `models_for_human_genes`; confirm.
joined_hgnc_ids = models_table_joined.select('hgnc_gene_id').distinct()
joined_hgnc_ids.count()

13109

In [78]:
# List the column names of the disease/model summary table.
disease_model_table.columns

['association_curated',
 'disease_id',
 'disease_model_avg_norm',
 'disease_model_avg_raw',
 'disease_model_max_norm',
 'disease_model_max_raw',
 'disease_term',
 'marker_id',
 'marker_locus',
 'marker_num_models',
 'marker_symbol',
 'model_description',
 'model_genetic_background',
 'model_id',
 'model_source',
 'type']

In [80]:
# Preview the marker id / marker symbol pairs of the disease/model summaries.
marker_pairs = disease_model_table.select('marker_id', 'marker_symbol')
marker_pairs.show()

+-----------+-------------+
|  marker_id|marker_symbol|
+-----------+-------------+
|  MGI:96575|         Insr|
|  MGI:98419|          Sri|
|MGI:1920230|        Wdr11|
|MGI:1920230|        Wdr11|
|  MGI:97394|          Oat|
|  MGI:95657|         Gas2|
|MGI:2676312|       Abca12|
|MGI:1921393|         Opa1|
|  MGI:88378|        Ces1g|
|  MGI:96217|         Hprt|
|MGI:2147834|       Slc6a8|
|MGI:1934606|        Alms1|
|MGI:6198564|        conls|
|MGI:1098827|        Reep1|
|MGI:2442833|         Bbs9|
|MGI:3828086|        Zbed6|
|MGI:1917706|         Mpc2|
|MGI:1924956|        Abcb5|
|MGI:3588197|         Vrtn|
|MGI:1859152|       Pla2g6|
+-----------+-------------+
only showing top 20 rows



## Exploring the mouse model data

In [148]:
# Directory holding the JSON documents with `type == 'mouse_model'`.
mouse_model_json_files = 'cicaful/type.mouse_model'


# Reload the mouse model documents with all columns and without exploding
# the phenotype list. This rebinds `mouse_model_table`.
# The aggregation done by the reference workflow (needed there because data
# lived in dictionaries) is still not applied; its usefulness is undecided.
mouse_model_table = spark.read.json(mouse_model_json_files)


# Number of rows in the models table:
model_row_count = mouse_model_table.count()
print(model_row_count)

mouse_model_table.show()

52773
+-----------+-------------+--------------------+------------------------+-----------+--------------------+------------+-----------+
|  marker_id|marker_symbol|   model_description|model_genetic_background|   model_id|    model_phenotypes|model_source|       type|
+-----------+-------------+--------------------+------------------------+-----------+--------------------+------------+-----------+
|MGI:1096566|        Pias2|Pias2<Gt(pT1Betag...|    involves: 129P2/O...|MGI:2678495|[MP:0002160 abnor...|         MGI|mouse_model|
|  MGI:94909|          Dmd|Dmd<Gt(pT1ATGBeta...|    involves: 129S2/S...|MGI:2678497|[MP:0002169 no ab...|         MGI|mouse_model|
|MGI:1354961|        Synj1|Synj1<Gt(pT1ATGBe...|    involves: 129S2/S...|MGI:2678499|[MP:0011089 perin...|         MGI|mouse_model|
|MGI:1927868|        Pex14|Pex14<Gt(pT1Betag...|    involves: 129S2/S...|MGI:2678502|[MP:0011091 prena...|         MGI|mouse_model|
|MGI:1321395|        Ltbp4|Ltbp4<Gt(U3Cre)1V...|    involves: 129S2/S.

In [119]:
# Number of unique model ids:
print(mouse_model_table.select('model_id').distinct().count())
# Distinct values of `model_source`. `show()` prints the table itself and
# returns None, so wrapping it in print() only added a stray "None" line.
mouse_model_table.select('model_source').distinct().show()

52773
+--------------------+
|        model_source|
+--------------------+
|     EuroPhenome,MGP|
|EuroPhenome,3i,IM...|
|              3i,MGP|
|         3i,IMPC,MGP|
|             3i,IMPC|
|                 MGP|
|                IMPC|
|  EuroPhenome,3i,MGP|
|    EuroPhenome,IMPC|
|      EuroPhenome,3i|
|                  3i|
| EuroPhenome,3i,IMPC|
|         EuroPhenome|
|                 MGI|
|            IMPC,MGP|
+--------------------+

None


In [131]:
# mouse_model_table.where(col('model_source')=='EuroPhenome,3i').select('model_id','marker_id','model_description').toPandas().model_id
# Display the full record of a single model, selected by its model id.
selected_model_id = 'MGI:4363861#het#early'
matching_models = mouse_model_table.where(col('model_id') == selected_model_id)
matching_models.show()

+-----------+-------------+--------------------+------------------------+--------------------+--------------------+------------+-----------+
|  marker_id|marker_symbol|   model_description|model_genetic_background|            model_id|    model_phenotypes|model_source|       type|
+-----------+-------------+--------------------+------------------------+--------------------+--------------------+------------+-----------+
|MGI:2385884|        Ddx27|Ddx27<tm1a(KOMP)W...|             C57BL/6NTac|MGI:4363861#het#e...|[MP:0001488 incre...|    IMPC,MGP|mouse_model|
+-----------+-------------+--------------------+------------------------+--------------------+--------------------+------------+-----------+



In [128]:
# Sample of (at most) 20 model ids from summaries whose source is 'IMPC,MGP',
# brought to the driver as a pandas frame and displayed as a plain list.
impc_mgp_sample = (
    disease_model_table
    .where(col('model_source') == 'IMPC,MGP')
    .select('model_id')
    .limit(20)
    .toPandas()
)
impc_mgp_sample.model_id.to_list()

['MGI:4432914#hom#early',
 'MGI:4363861#het#early',
 'MGI:4364810#het#early',
 'MGI:4363861#het#early',
 'MGI:4432914#hom#early',
 'MGI:4432914#hom#early',
 'MGI:4432914#hom#early',
 'MGI:4363861#het#early',
 'MGI:4432914#hom#early',
 'MGI:4431547#het#early',
 'MGI:4363861#het#early',
 'MGI:4363861#het#early',
 'MGI:4432914#hom#early',
 'MGI:4432914#hom#early',
 'MGI:4363861#het#early',
 'MGI:4364794#hom#early',
 'MGI:4432914#hom#early',
 'MGI:4363861#het#early',
 'MGI:4431677#het#early',
 'MGI:4432914#hom#early']

In [133]:
# Narrow the mouse model table down to the model id and its phenotype list.
# The name `mouse_model_table` is rebound to the projected table.
mouse_model_table = mouse_model_table.select('model_id', 'model_phenotypes')

+--------------------+------------------------+-----------+--------------------+
|   model_description|model_genetic_background|   model_id|    model_phenotypes|
+--------------------+------------------------+-----------+--------------------+
|Pias2<Gt(pT1Betag...|    involves: 129P2/O...|MGI:2678495|[MP:0002160 abnor...|
|Dmd<Gt(pT1ATGBeta...|    involves: 129S2/S...|MGI:2678497|[MP:0002169 no ab...|
|Synj1<Gt(pT1ATGBe...|    involves: 129S2/S...|MGI:2678499|[MP:0011089 perin...|
|Pex14<Gt(pT1Betag...|    involves: 129S2/S...|MGI:2678502|[MP:0011091 prena...|
|Ltbp4<Gt(U3Cre)1V...|    involves: 129S2/S...|MGI:2678503|[MP:0005330 cardi...|
|Aqr<Gt(pT1Betageo...|    involves: 129S2/S...|MGI:2678504|[MP:0002169 no ab...|
|Nphs1<Gt(pT1Betag...|    involves: 129P2/O...|MGI:2678505|[MP:0011353 expan...|
|Nphs1<Gt(pT1Betag...|    involves: 129P2/O...|MGI:2678506|[MP:0008139 fused...|
|Ncdn<Gt(pT1Betage...|    involves: 129S2/S...|MGI:2678507|[MP:0013454 lacri...|
|Xbp1<tm1Nogu>/Xbp...|    in

In [138]:
# Columns of the disease/model summaries that are discarded at this point.
dropped_disease_columns = (
    'association_curated',
    'marker_locus',
    'marker_symbol',
    'model_source',
    'type',
    'disease_model_avg_norm',
)

# Rebind `disease_model_table` to the trimmed table and preview it.
disease_model_table = disease_model_table.drop(*dropped_disease_columns)

disease_model_table.show()

+-----------+---------------------+----------------------+---------------------+--------------------+-----------+-----------------+--------------------+------------------------+-----------+
| disease_id|disease_model_avg_raw|disease_model_max_norm|disease_model_max_raw|        disease_term|  marker_id|marker_num_models|   model_description|model_genetic_background|   model_id|
+-----------+---------------------+----------------------+---------------------+--------------------+-----------+-----------------+--------------------+------------------------+-----------+
|ORPHA:90970|                 0.76|                 76.35|                 2.38|Primary Lipodystr...|MGI:1330812|                3|Acox1<lampe1>/Aco...|      involves: C57BL/6J|MGI:5285941|
|ORPHA:90970|                 0.36|                 70.89|                 2.21|Primary Lipodystr...|  MGI:88138|               10|Bcl2<tm2.1Lbox>/B...|    involves: 129S1/S...|MGI:5288501|
|ORPHA:90970|                 0.36|               

In [150]:
# Join a 10000-row subset of the disease/model summaries with the mouse model
# table on the model id. Inner join: summaries without a matching model are
# discarded. `limit` is applied without an explicit ordering, so which rows
# end up in the subset is presumably not guaranteed to be stable across runs.
models_diseases_joined = (
    disease_model_table.limit(10000)
    .join(mouse_model_table, on='model_id', how='inner')
)

print(models_diseases_joined.count())
# `show()` prints the table itself and returns None; it must not be wrapped
# in print(), which would only add a stray "None" line.
# NOTE(review): this action previously failed with `TaskResultLost` while
# building the broadcast side of the join; that failure is not addressed
# here and may need driver memory / broadcast tuning — confirm.
models_diseases_joined.show()

10000


Py4JJavaError: An error occurred while calling o1626.showString.
: java.util.concurrent.ExecutionException: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 507.0 failed 1 times, most recent failure: Lost task 0.0 in stage 507.0 (TID 7367, c02zq14flvdn, executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at java.util.concurrent.FutureTask.report(FutureTask.java:122)
	at java.util.concurrent.FutureTask.get(FutureTask.java:206)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:161)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:515)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeBroadcast$1(SparkPlan.scala:188)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:184)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:116)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenInner(BroadcastHashJoinExec.scala:210)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:100)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:194)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:149)
	at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:71)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:194)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:149)
	at org.apache.spark.sql.execution.FilterExec.consume(basicPhysicalOperators.scala:97)
	at org.apache.spark.sql.execution.FilterExec.doConsume(basicPhysicalOperators.scala:222)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:194)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:149)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce(WholeStageCodegenExec.scala:483)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce$(WholeStageCodegenExec.scala:456)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.FilterExec.doProduce(basicPhysicalOperators.scala:137)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.FilterExec.produce(basicPhysicalOperators.scala:97)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:95)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:39)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:632)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:692)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:316)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:434)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3625)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2695)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3616)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3614)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2695)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2902)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
	at sun.reflect.GeneratedMethodAccessor101.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 507.0 failed 1 times, most recent failure: Lost task 0.0 in stage 507.0 (TID 7367, c02zq14flvdn, executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2133)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectIterator(SparkPlan.scala:392)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:86)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:182)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
