In [11]:
from pyspark.sql import SparkSession, functions as f, types as t, Column, DataFrame
from pyspark.sql.window import Window
from graphframes import GraphFrame

# Attach to the active Spark session, or start one if none exists yet.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# GraphFrames needs a checkpoint directory to be configured.
# NOTE(review): the recorded run warns that this directory appears to be on the
# local filesystem while Spark is not in local mode — confirm the intended location.
spark.sparkContext.setCheckpointDir(
    '<pwd_output>'
)


# Column layout of the toy dataset.
colnames = ['studyId', 'variantId', 'explainedBy']

# (variantId, explainedBy) pairs; every pair belongs to study 's1'.
# A variant paired with itself is labelled a 'root' when the vertices are built.
explained_pairs = [
    ('v1', 'v1'),
    ('v2', 'v2'),
    ('v3', 'v1'),
    ('v4', 'v1'),
    ('v5', 'v3'),
    ('v6', 'v3'),
    ('v6', 'v2'),
    ('v7', 'v7'),
    ('v8', 'v7'),
    ('v9', 'v5'),
]

# Prefix each pair with the shared study id to get full rows.
data = [('s1', variant, explained_by) for variant, explained_by in explained_pairs]

# Cache the frame: it feeds both the vertex and the edge construction.
df = spark.createDataFrame(data, schema=colnames).persist()
df.show()


# Build the vertex table: one row per distinct (studyId, id, leadType).
# A row is labelled 'root' when the variant is explained by itself, 'explained' otherwise.
# NOTE(review): a variant appearing on both self-explained and non-self-explained rows
# would yield two vertex rows sharing one id; the toy data has no such case.
is_self_explained = f.col('variantId') == f.col('explainedBy')
lead_type = f.when(is_self_explained, 'root').otherwise('explained')

nodes = (
    df
    .select('studyId', f.col('variantId').alias('id'), lead_type.alias('leadType'))
    .distinct()
    .persist()
)
nodes.show()


# Build the edge table: each variant (src) points at the variant that explains it (dst).
# Self-explained variants therefore produce self-loops such as v1 -> v1.
# NOTE(review): presumably the explaining variant is the more significant one, so edges
# run from the less significant to the more significant variant — confirm.
edge_columns = [
    f.col('variantId').alias('src'),
    f.col('explainedBy').alias('dst'),
    f.lit('explains').alias('edgeType'),
]

edges = df.select(*edge_columns).distinct().persist()
edges.show()

# Combine the vertex and edge DataFrames into a graph.
graph = GraphFrame(nodes, edges)

23/02/01 14:07:53 WARN org.apache.spark.SparkContext: Spark is not running in local mode, therefore the checkpoint directory must not be on the local filesystem. Directory '<pwd_output>' appears to be on the local filesystem.


+-------+---------+-----------+
|studyId|variantId|explainedBy|
+-------+---------+-----------+
|     s1|       v1|         v1|
|     s1|       v2|         v2|
|     s1|       v3|         v1|
|     s1|       v4|         v1|
|     s1|       v5|         v3|
|     s1|       v6|         v3|
|     s1|       v6|         v2|
|     s1|       v7|         v7|
|     s1|       v8|         v7|
|     s1|       v9|         v5|
+-------+---------+-----------+

+-------+---+---------+
|studyId| id| leadType|
+-------+---+---------+
|     s1| v3|explained|
|     s1| v4|explained|
|     s1| v6|explained|
|     s1| v2|     root|
|     s1| v8|explained|
|     s1| v1|     root|
|     s1| v7|     root|
|     s1| v9|explained|
|     s1| v5|explained|
+-------+---+---------+

+---+---+--------+
|src|dst|edgeType|
+---+---+--------+
| v6| v2|explains|
| v9| v5|explains|
| v2| v2|explains|
| v5| v3|explains|
| v8| v7|explains|
| v7| v7|explains|
| v6| v3|explains|
| v4| v1|explains|
| v3| v1|explains|
| v1| v1|e

In [13]:

# Breadth-first search from every 'explained' vertex towards a 'root' vertex.
# NOTE(review): the recorded output lists only single-edge paths; v5 and v9, which sit
# more than one edge away from a root, are absent. GraphFrames documents bfs as
# returning shortest paths, so it presumably cannot resolve the deeper chains — confirm.
paths = graph.bfs(
    fromExpr='leadType == "explained"',
    toExpr='leadType == "root"',
    maxPathLength=30,
)
paths.show(n=100)

+-------------------+------------------+--------------+
|               from|                e0|            to|
+-------------------+------------------+--------------+
|{s1, v3, explained}|{v3, v1, explains}|{s1, v1, root}|
|{s1, v4, explained}|{v4, v1, explains}|{s1, v1, root}|
|{s1, v6, explained}|{v6, v2, explains}|{s1, v2, root}|
|{s1, v8, explained}|{v8, v7, explains}|{s1, v7, root}|
+-------------------+------------------+--------------+



In [19]:
# Label every vertex with its connected component, then attach to each vertex the set
# of root ids found in that component.
# when() without otherwise() yields null for non-root rows; the recorded output shows
# only root ids in the collected set.
# In the recorded run the larger component carries two roots (v1 and v2), because v6
# has edges to both v3 and v2.
component_window = Window.partitionBy('component')
root_id_or_null = f.when(f.col('leadType') == 'root', f.col('id'))

components = graph.connectedComponents()
components.withColumn('explainedBy', f.collect_set(root_id_or_null).over(component_window)).show()

23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:37 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:37 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:37 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
                                                                                

+-------+---+---------+------------+-----------+
|studyId| id| leadType|   component|explainedBy|
+-------+---+---------+------------+-----------+
|     s1| v4|explained|309237645312|   [v2, v1]|
|     s1| v2|     root|309237645312|   [v2, v1]|
|     s1| v5|explained|309237645312|   [v2, v1]|
|     s1| v3|explained|309237645312|   [v2, v1]|
|     s1| v6|explained|309237645312|   [v2, v1]|
|     s1| v1|     root|309237645312|   [v2, v1]|
|     s1| v9|explained|309237645312|   [v2, v1]|
|     s1| v7|     root|206158430208|       [v7]|
|     s1| v8|explained|206158430208|       [v7]|
+-------+---+---------+------------+-----------+



In [23]:
study = ''  # NOTE(review): not referenced in this cell; the study id is hard-coded in the filter below.

# Credible-set table read from the playground bucket ("not clumped", going by the path name).
ld_set = spark.read.parquet("gs://genetics_etl_python_playground/XX.XX/output/python_etl/parquet/pics_credible_set_not_clumped/")

# Columns kept for the single-study subset.
selected_columns = [
    'studyId',
    'variantId',
    'pValueMantissa',
    'pValueExponent',
    'tagVariantId',
    'R_overall',
    'qualityControl',
]

# De-duplicated rows of one study, cached for reuse.
dataset = (
    ld_set
    .select(*selected_columns)
    .distinct()
    .where(f.col('studyId') == 'GCST004860')
    .persist()
)

dataset.show()



+----------+----------------+--------------+--------------+--------------------+---------+--------------------+
|   studyId|       variantId|pValueMantissa|pValueExponent|        tagVariantId|R_overall|      qualityControl|
+----------+----------------+--------------+--------------+--------------------+---------+--------------------+
|GCST004860|10_119029751_G_C|           6.0|            -6|    10_119112913_G_A|-0.789116|[Subsignificant p...|
|GCST004860|10_119134504_C_T|           6.0|            -6|    10_119029218_G_A|-0.789984|[Subsignificant p...|
|GCST004860|10_119064457_T_C|           2.0|            -7|10_119044943_CTTT...| 0.847645|[Subsignificant p...|
|GCST004860| X_107370118_A_G|           2.0|            -7|     X_107363685_T_G| 0.818858|[Subsignificant p...|
|GCST004860| X_107370118_A_G|           2.0|            -7|    X_107149542_CA_C| 0.745002|[Subsignificant p...|
|GCST004860| X_107370118_A_G|           2.0|            -7|     X_107107919_A_G|  0.73615|[Subsignifican

                                                                                