In [11]:
from pyspark.sql import SparkSession, functions as f, types as t, Column, DataFrame
from pyspark.sql.window import Window
from graphframes import GraphFrame

# Attach to the active Spark session, or start one if none exists.
spark = SparkSession.builder.getOrCreate()

# GraphFrames needs a checkpoint directory to be configured.
# NOTE(review): the recorded run warns that this directory appears to be on the local
# filesystem while Spark is not in local mode — confirm it points at shared storage.
spark.sparkContext.setCheckpointDir('<pwd_output>')


# Toy input: one row per (studyId, variantId, explainedBy). Rows whose variantId equals
# their explainedBy are the ones tagged as roots when the vertices are built.
colnames = ['studyId', 'variantId', 'explainedBy']

data = [
    ('s1', variant, explained_by)
    for variant, explained_by in (
        ('v1', 'v1'),
        ('v2', 'v2'),
        ('v3', 'v1'),
        ('v4', 'v1'),
        ('v5', 'v3'),
        ('v6', 'v3'),
        ('v6', 'v2'),
        ('v7', 'v7'),
        ('v8', 'v7'),
        ('v9', 'v5'),
    )
]

df = spark.createDataFrame(data=data, schema=colnames).persist()
df.show()


# Vertices: the distinct (studyId, id, leadType) rows, where `id` is the variant id.
# A row whose variant is listed as its own `explainedBy` is tagged 'root';
# every other row is tagged 'explained'.
nodes = (
    df
    .withColumn(
        'leadType',
        f.when(f.col('variantId') == f.col('explainedBy'), f.lit('root'))
        .otherwise(f.lit('explained'))
    )
    .select('studyId', f.col('variantId').alias('id'), 'leadType')
    .distinct()
    .persist()
)
nodes.show()


# Edges: each one runs from a variant (`src`) to the variant recorded as explaining it
# (`dst`). Rows where a variant explains itself therefore become self-loops.
edges = (
    df
    .select(
        f.col('variantId').alias('src'),
        f.col('explainedBy').alias('dst'),
    )
    .withColumn('edgeType', f.lit('explains'))
    .distinct()
    .persist()
)
edges.show()

# Assemble the graph from the vertex and edge frames.
graph = GraphFrame(nodes, edges)

23/02/01 14:07:53 WARN org.apache.spark.SparkContext: Spark is not running in local mode, therefore the checkpoint directory must not be on the local filesystem. Directory '<pwd_output>' appears to be on the local filesystem.


+-------+---------+-----------+
|studyId|variantId|explainedBy|
+-------+---------+-----------+
|     s1|       v1|         v1|
|     s1|       v2|         v2|
|     s1|       v3|         v1|
|     s1|       v4|         v1|
|     s1|       v5|         v3|
|     s1|       v6|         v3|
|     s1|       v6|         v2|
|     s1|       v7|         v7|
|     s1|       v8|         v7|
|     s1|       v9|         v5|
+-------+---------+-----------+

+-------+---+---------+
|studyId| id| leadType|
+-------+---+---------+
|     s1| v3|explained|
|     s1| v4|explained|
|     s1| v6|explained|
|     s1| v2|     root|
|     s1| v8|explained|
|     s1| v1|     root|
|     s1| v7|     root|
|     s1| v9|explained|
|     s1| v5|explained|
+-------+---+---------+

+---+---+--------+
|src|dst|edgeType|
+---+---+--------+
| v6| v2|explains|
| v9| v5|explains|
| v2| v2|explains|
| v5| v3|explains|
| v8| v7|explains|
| v7| v7|explains|
| v6| v3|explains|
| v4| v1|explains|
| v3| v1|explains|
| v1| v1|e

In [13]:

# Breadth-first search from every 'explained' vertex towards any 'root' vertex.
# NOTE(review): the recorded output contains only one-hop paths (a single `e0` edge column),
# so variants that are more than one hop from a root (e.g. v5, v9) are absent — consistent
# with BFS returning shortest paths only; confirm this is the intended behaviour.
paths = graph.bfs(
    fromExpr='leadType == "explained"',
    toExpr='leadType == "root"',
    maxPathLength=30,
)
paths.show(n=100)

+-------------------+------------------+--------------+
|               from|                e0|            to|
+-------------------+------------------+--------------+
|{s1, v3, explained}|{v3, v1, explains}|{s1, v1, root}|
|{s1, v4, explained}|{v4, v1, explains}|{s1, v1, root}|
|{s1, v6, explained}|{v6, v2, explains}|{s1, v2, root}|
|{s1, v8, explained}|{v8, v7, explains}|{s1, v7, root}|
+-------------------+------------------+--------------+



In [19]:
# Label every vertex with its connected component, then attach to each row the set of
# root ids that share that component.
(
    graph
    .connectedComponents()
    .withColumn(
        'explainedBy',
        f.collect_set(
            # `when` without `otherwise` yields null for non-root rows; collect_set skips nulls.
            f.when(f.col('leadType') == 'root', f.col('id'))
        )
        .over(Window.partitionBy(f.col('component'))),
    )
    .show()
)

23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:36 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:37 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:37 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
23/02/01 14:56:37 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
                                                                                

+-------+---+---------+------------+-----------+
|studyId| id| leadType|   component|explainedBy|
+-------+---+---------+------------+-----------+
|     s1| v4|explained|309237645312|   [v2, v1]|
|     s1| v2|     root|309237645312|   [v2, v1]|
|     s1| v5|explained|309237645312|   [v2, v1]|
|     s1| v3|explained|309237645312|   [v2, v1]|
|     s1| v6|explained|309237645312|   [v2, v1]|
|     s1| v1|     root|309237645312|   [v2, v1]|
|     s1| v9|explained|309237645312|   [v2, v1]|
|     s1| v7|     root|206158430208|       [v7]|
|     s1| v8|explained|206158430208|       [v7]|
+-------+---+---------+------------+-----------+



In [23]:
# Study whose (not clumped) PICS credible-set rows are inspected below.
# Previously this variable was left empty and unused while the ID was hard-coded in the filter.
study = 'GCST004860'

# Credible sets before clumping.
ld_set = spark.read.parquet("gs://genetics_etl_python_playground/XX.XX/output/python_etl/parquet/pics_credible_set_not_clumped/")

# Keep only the selected study, project the columns of interest and de-duplicate.
# Filtering first is equivalent (studyId is one of the projected columns) and keeps the
# distinct step restricted to a single study.
dataset = (
    ld_set
    .filter(f.col('studyId') == study)
    .select('studyId', 'variantId', 'pValueMantissa', 'pValueExponent', 'tagVariantId', 'R_overall', 'qualityControl')
    .distinct()
    .persist()
)

dataset.show()



+----------+----------------+--------------+--------------+--------------------+---------+--------------------+
|   studyId|       variantId|pValueMantissa|pValueExponent|        tagVariantId|R_overall|      qualityControl|
+----------+----------------+--------------+--------------+--------------------+---------+--------------------+
|GCST004860|10_119029751_G_C|           6.0|            -6|    10_119112913_G_A|-0.789116|[Subsignificant p...|
|GCST004860|10_119134504_C_T|           6.0|            -6|    10_119029218_G_A|-0.789984|[Subsignificant p...|
|GCST004860|10_119064457_T_C|           2.0|            -7|10_119044943_CTTT...| 0.847645|[Subsignificant p...|
|GCST004860| X_107370118_A_G|           2.0|            -7|     X_107363685_T_G| 0.818858|[Subsignificant p...|
|GCST004860| X_107370118_A_G|           2.0|            -7|    X_107149542_CA_C| 0.745002|[Subsignificant p...|
|GCST004860| X_107370118_A_G|           2.0|            -7|     X_107107919_A_G|  0.73615|[Subsignifican

                                                                                

In [3]:
from pyspark.sql import SparkSession, functions as f, types as t, Column, DataFrame
from pyspark.sql.window import Window

# Attach to the active Spark session, or start one if none exists.
spark = SparkSession.builder.getOrCreate()

# Load the GWAS Catalog associations and mark them for caching.
associations = (
    spark.read
    .parquet('gs://genetics_etl_python_playground/XX.XX/output/python_etl/parquet/gwas_catalog_associations/')
    .persist()
)

# Print a single record vertically and untruncated, so every column is visible.
associations.show(n=1, truncate=False, vertical=True)

# List the distinct quality-control flags found across all associations.
(
    associations
    .select(f.explode(f.col('qualityControl')).alias('QC'))
    .distinct()
    .show(n=1000, truncate=False)
)

23/02/06 16:25:10 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.


-RECORD 0---------------------------------------
 chromosome          | 1                        
 position            | 4255144                  
 referenceAllele     | G                        
 alternateAllele     | T                        
 variantId           | 1_4255144_G_T            
 studyId             | GCST000082               
 pValueMantissa      | 8.0                      
 pValueExponent      | -6                       
 beta                | null                     
 beta_ci_lower       | null                     
 beta_ci_upper       | null                     
 odds_ratio          | null                     
 odds_ratio_ci_lower | null                     
 odds_ratio_ci_upper | null                     
 qualityControl      | [Subsignificant p-value] 
only showing top 1 row





+-------------------------------------+
|QC                                   |
+-------------------------------------+
|Composite association                |
|Variant inconsistency                |
|No mapping in GnomAd                 |
|Palindrome alleles - cannot harmonize|
|Incomplete genomic mapping           |
|Ambiguous association                |
|Subsignificant p-value               |
+-------------------------------------+



                                                                                

In [9]:
# Preview associations flagged with 'Incomplete genomic mapping'.
(
    associations
    .where(f.array_contains('qualityControl', 'Incomplete genomic mapping'))
    .select(
        'studyId',
        'variantId',
        'chromosome',
        'position',
        'pValueMantissa',
        'pValueExponent',
    )
    .show(n=5, truncate=False)
)

+------------+--------------+----------+--------+--------------+--------------+
|studyId     |variantId     |chromosome|position|pValueMantissa|pValueExponent|
+------------+--------------+----------+--------+--------------+--------------+
|GCST001356  |rs2070676     |null      |null    |1.0           |-7            |
|GCST002337  |kgp20555366   |null      |null    |3.0           |-6            |
|GCST002337  |kgp6466428    |null      |null    |6.0           |-7            |
|GCST002594_1|chr6:7026945  |null      |null    |8.0           |-6            |
|GCST002927  |chr17:43704790|null      |null    |6.0           |-6            |
+------------+--------------+----------+--------+--------------+--------------+
only showing top 5 rows



In [36]:
# Count associations flagged 'Ambiguous association' that are not also flagged
# 'No mapping in GnomAd'. The two chained filters are equivalent to a single AND.
(
    associations
    .where(f.array_contains('qualityControl', 'Ambiguous association'))
    .where(~f.array_contains('qualityControl', 'No mapping in GnomAd'))
    .select(
        'studyId',
        'variantId',
        'chromosome',
        'position',
        'pValueMantissa',
        'pValueExponent',
    )
    .count()
)

702

In [31]:
# Preview associations flagged as palindromic (alleles that cannot be harmonized).
(
    associations
    .where(f.array_contains('qualityControl', 'Palindrome alleles - cannot harmonize'))
    .select(
        'studyId',
        'variantId',
        'chromosome',
        'position',
        'referenceAllele',
        'alternateAllele',
    )
    .show(n=10, truncate=False)
)

+-------------+---------------+----------+---------+---------------+---------------+
|studyId      |variantId      |chromosome|position |referenceAllele|alternateAllele|
+-------------+---------------+----------+---------+---------------+---------------+
|GCST000618_4 |5_91003406_G_C |5         |91003406 |G              |C              |
|GCST000703   |12_4497002_T_A |12        |4497002  |T              |A              |
|GCST000712   |6_20687890_T_A |6         |20687890 |T              |A              |
|GCST001308_2 |2_14257928_G_C |2         |14257928 |G              |C              |
|GCST001356   |20_46604118_T_A|20        |46604118 |T              |A              |
|GCST001524_1 |9_26208859_G_C |9         |26208859 |G              |C              |
|GCST001848_17|5_157388755_G_C|5         |157388755|G              |C              |
|GCST001850_7 |12_13367062_G_C|12        |13367062 |G              |C              |
|GCST002221   |1_234722850_A_T|1         |234722850|A            

In [39]:
# Load the clumped PICS credible sets and mark them for caching.
ld = (
    spark.read
    .format('parquet')
    .load('gs://genetics_etl_python_playground/XX.XX/output/python_etl/parquet/pics_credible_set_clumped/')
    .persist()
)
ld.show()

[Stage 74:>                                                         (0 + 1) / 1]

+----------+--------------------+------------+--------+---------------+---------------+--------------+--------------+----------+-------------------+--------------------+----------+-------------------+-------------------+--------------------+------------+---------+-------+--------+-------------+--------------------+--------------------+
|chromosome|           variantId|     studyId|position|referenceAllele|alternateAllele|pValueMantissa|pValueExponent|      beta|      beta_ci_lower|       beta_ci_upper|odds_ratio|odds_ratio_ci_lower|odds_ratio_ci_upper|      qualityControl|tagVariantId|R_overall|pics_mu|pics_std|pics_postprob|pics_95_perc_credset|pics_99_perc_credset|
+----------+--------------------+------------+--------+---------------+---------------+--------------+--------------+----------+-------------------+--------------------+----------+-------------------+-------------------+--------------------+------------+---------+-------+--------+-------------+--------------------+--------

                                                                                

In [42]:
# List the distinct quality-control flags in the clumped set. Any flag that starts with
# 'Association explained' is collapsed into that single label before de-duplicating.
(
    ld
    .select(f.explode(f.col('qualityControl')).alias('col'))
    .select(
        f.when(
            f.col('col').startswith('Association explained'),
            f.lit('Association explained'),
        )
        .otherwise(f.col('col'))
        .alias('col')
    )
    .distinct()
    .show(n=10, truncate=False)
)



+-------------------------------------+
|col                                  |
+-------------------------------------+
|Composite association                |
|Variant inconsistency                |
|No mapping in GnomAd                 |
|Palindrome alleles - cannot harmonize|
|Incomplete genomic mapping           |
|Association explained                |
|Ambiguous association                |
|Credible set not resolved            |
|Subsignificant p-value               |
+-------------------------------------+



                                                                                

In [44]:
# Count distinct (studyId, variantId) pairs that have a tag variant and are not flagged
# 'Subsignificant p-value'. The two chained filters are equivalent to a single AND.
(
    ld
    .where(f.col('tagVariantId').isNotNull())
    .where(~f.array_contains('qualityControl', 'Subsignificant p-value'))
    .select('studyId', 'variantId')
    .dropDuplicates()
    .count()
)

                                                                                

281972