In [10]:
from pyspark.sql import SparkSession, functions as f, types as t, DataFrame, Column
from pyspark.sql.window import Window

# Pick up the Spark session that is already active, or start one if there is none:
spark = SparkSession.builder.getOrCreate()

# Parquet locations of the two inputs of this notebook:
study_file = "gs://ot-team/dsuveges/pre-split-gwas-studies"
association_file = "gs://genetics_etl_python_playground/XX.XX/output/python_etl/parquet/gwas_catalog_associations"

# Studies first, associations second; both frames are marked for caching because
# the cells below read them repeatedly:
study_df, assoc_df = (
    spark.read.parquet(parquet_path).persist()
    for parquet_path in (study_file, association_file)
)

22/11/28 14:56:31 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.


In [11]:
# Print one untruncated record of each input table, one field per line:
study_df.show(n=1, truncate=False, vertical=True)
assoc_df.show(n=1, truncate=False, vertical=True)


[Stage 14:>                                                         (0 + 1) / 1]

-RECORD 0--------------------------------------------------------------------------------
 studyAccession       | GCST000290                                                       
 pubmedId             | 19060906                                                         
 firstAuthor          | Kathiresan S                                                     
 publicationDate      | 2008-12-07                                                       
 journal              | Nat Genet                                                        
 study                | Common variants at 30 loci contribute to polygenic dyslipidemia. 
 studyDiseaseTrait    | HDL cholesterol                                                  
 studyEfos            | [EFO_0004612]                                                    
 backgroundEfos       | null                                                             
 initialSampleSize    | 19,840 European ancestry individuals                             
 nCases   

                                                                                

In [5]:
def spliting_gwas_studies(study_association: DataFrame) -> DataFrame:
    """Split studies and consolidate their disease annotation.

    Works on the joined study/association table. Within one study accession the rows
    are grouped by their association EFOs: the first group (ascending order of
    `associationEfos`) keeps the study accession as its identifier, every further
    group is treated as a split study and is identified as `<studyAccession>_<n>`
    with n = 1, 2, ... Trait names and EFOs are then consolidated into one column each.

    Args:
        study_association (DataFrame): Joined study/association table. The columns
            `studyAccession`, `associationEfos`, `associationDiseaseTrait`,
            `pValueText`, `studyDiseaseTrait` and `studyEfos` are read.

    Returns:
        DataFrame: The input rows sorted by study accession and persisted, with the
        new columns `studyId`, `DiseaseTrait` and `efos`. The six columns listed
        under Args are dropped; every other input column is passed through.
    """
    # Windowing through all study accessions, while ordering by association EFOs:
    window_spec = Window.partitionBy("studyAccession").orderBy("associationEfos")

    return (
        study_association
        # Zero-based index of each association EFO group within a studyAccession group:
        .withColumn("row_number", f.dense_rank().over(window_spec) - 1)
        # Study identifiers are split when there is more than one type of associationEfos:
        .withColumn(
            "studyId",
            f.when(f.col("row_number") == 0, f.col("studyAccession")).otherwise(
                f.concat_ws("_", "studyAccession", "row_number")
            ),
        )
        # Disease traits are generated based on p-value text when splitting a study:
        .withColumn(
            "DiseaseTrait",
            # When the study is split, the p-value text is appended to the trait
            # (concat_ws leaves out null values):
            f.when(
                f.col("row_number") != 0,
                f.concat_ws(" ", "associationDiseaseTrait", "pValueText"),
            )
            # Otherwise the association trait is used, falling back to the study trait:
            .otherwise(
                f.coalesce(
                    f.col("associationDiseaseTrait"), f.col("studyDiseaseTrait")
                )
            ),
        )
        # The EFO field is also consolidated: association EFOs when available,
        # study level EFOs otherwise:
        .withColumn("efos", f.coalesce(f.col("associationEfos"), f.col("studyEfos")))
        # Sort while studyAccession is still a column of the frame. Sorting after the
        # drop below would leave resolving the sort key to the query analyzer.
        .orderBy("studyAccession")
        # The fields are dropped that we would no longer need downstream:
        .drop(
            "row_number",
            "studyAccession",
            "studyEfos",
            "studyDiseaseTrait",
            "associationEfos",
            "associationDiseaseTrait",
            "pValueText",
        )
        .persist()
    )


In [12]:
# Full outer join on the study accession: rows without a partner on the other side
# are kept. The joined frame is marked for caching.
study_assoc = assoc_df.join(study_df, on="studyAccession", how="outer").persist()

# Print one untruncated joined record that has a p-value text:
study_assoc.where(f.col("pValueText").isNotNull()).show(
    n=1, truncate=False, vertical=True
)

[Stage 18:>                                                         (0 + 1) / 1]

-RECORD 0-----------------------------------------------------------------------------------------------------
 studyAccession          | GCST000101                                                                         
 chromosome              | null                                                                               
 position                | null                                                                               
 referenceAllele         | null                                                                               
 alternateAllele         | null                                                                               
 variantId               | null                                                                               
 AssociationEfos         | [EFO_0004685]                                                                      
 associationDiseaseTrait | Hip geometry                                                                       
 

                                                                                

In [9]:
# Window over each study accession, rows ordered by their p-value text:
window_spec = (
    Window
    .partitionBy("studyAccession")
    .orderBy("pValueText")
)



-RECORD 0--------------------------------------------------------------------------------
 studyAccession       | GCST000290                                                       
 pubmedId             | 19060906                                                         
 firstAuthor          | Kathiresan S                                                     
 publicationDate      | 2008-12-07                                                       
 journal              | Nat Genet                                                        
 study                | Common variants at 30 loci contribute to polygenic dyslipidemia. 
 studyDiseaseTrait    | HDL cholesterol                                                  
 studyEfos            | [EFO_0004612]                                                    
 backgroundEfos       | null                                                             
 initialSampleSize    | 19,840 European ancestry individuals                             
 nCases   