In [1]:
import pyspark.sql
import pyspark.sql.types as t
import pyspark.sql.functions as f

global spark

spark = (pyspark.sql.SparkSession
       .builder
       .master("local[*]")
       .getOrCreate()
)

In [94]:
associations = 'gwas_associations_2022-05-27.tsv'
studies = 'gwas_studies_new_2022-05-27.tsv'
studies_unpublished = 'gwas_unpublished_studies_2022-05-27.tsv'
ancestries = 'gwas_ancestries_new_2022-05-27.tsv'
ancestries_unpublished = 'gwas_unpublished_ancestries_2022-05-27.tsv'

# Read and combine the two ancesty files:
ancestries_df = (
    spark.read.csv(ancestries, sep='\t', header=True)
    .union(
        spark.read.csv(ancestries_unpublished, sep='\t', header=True)
    )
    .drop('PUBMED ID', 'FIRST AUTHOR', 'DATE')
    .persist()
)

# Read and combined the two study files:
studies_df = (
    spark.read.csv(studies, sep='\t', header=True)
    .union(
        spark.read.csv(studies_unpublished, sep='\t', header=True)
        .withColumnRenamed('MAPPED_TRAIT_URI', 'MAPPED TRAIT URI')
        .withColumnRenamed('MAPPED_TRAIT', 'MAPPED TRAIT')
    )
)

# Group by study id and calculate number:
joined = (
    studies_df
    .join(ancestries_df, how='outer', on='STUDY ACCESSION')
    .persist()
)

joined.count()

63052

In [95]:
DROPPED_STUDIES = [
    'GCST005806', # Sun et al pQTL study
    'GCST005837' # Huang et al IBD study
]

(
    joined
    .filter(~f.col('STUDY ACCESSION').isin(DROPPED_STUDIES))
    .count()
)

63050

In [96]:
ancestries_columns = ancestries_df.columns
study_columns = spark.read.csv(studies, sep='\t', header=True).columns

[col for col in ancestries_columns if col in study_columns]

['STUDY ACCESSION']

In [97]:
(
    ancestries_df
    .select('STUDY ACCESSION', 'INITIAL SAMPLE DESCRIPTION', 
            'REPLICATION SAMPLE DESCRIPTION', 'NUMBER OF INDIVIDUALS')
    .show()
)
# .show(1, False, True)

+---------------+--------------------------+------------------------------+---------------------+
|STUDY ACCESSION|INITIAL SAMPLE DESCRIPTION|REPLICATION SAMPLE DESCRIPTION|NUMBER OF INDIVIDUALS|
+---------------+--------------------------+------------------------------+---------------------+
|     GCST009498|      up to 182,902 Eur...|                            NA|               182902|
|     GCST009499|      up to 182,902 Eur...|                            NA|               182902|
|     GCST009500|      up to 182,902 Eur...|                            NA|               182902|
|     GCST009638|      5,321 European an...|          2,237 cases, 3,46...|                 5698|
|     GCST009638|      5,321 European an...|          2,237 cases, 3,46...|                21987|
|     GCST009637|      1,369 European an...|          422 cases, 3,461 ...|                 3883|
|     GCST009637|      1,369 European an...|          422 cases, 3,461 ...|                18035|
|     GCST009636|   

In [98]:
import re

@f.udf(t.StructType([
    t.StructField('n_initial', t.IntegerType()),
    t.StructField('n_cases', t.IntegerType())
]))
def extract_sample_sizes(s):
    ''' Extracts sample size info from GWAS Catalog
        "INITIAL SAMPLE SIZE" field
    Returns:
        total N, N cases
    '''
    n_cases = 0
    n_controls = 0
    n_quant = 0
    for part in s.split(', '):
        # Extract sample size
        mtch = re.search('([0-9,]+)', part)
        if mtch:
            n = int(mtch.group(1).replace(',', ''))
            # Add to correct counter
            if 'cases' in part:
                n_cases += n
            elif 'controls' in part:
                n_controls += n
            else:
                n_quant += n

    # Return n_total and n_cases
    if n_quant > 0:
        return n_quant, 0
    else:
        return n_cases + n_controls, n_cases



In [109]:
# Parsing samples from 
studies_w_samples = (
    studies_df
    .withColumn('parsedSamples', extract_sample_sizes(f.col('INITIAL SAMPLE SIZE')))
    .withColumn('n_cases', f.col('parsedSamples.cases'))
    .withColumn('n_initial', f.col('parsedSamples.sampleSize'))
    .select('STUDY ACCESSION', 'n_cases', 'n_initial')
    .persist()
)

# 43810 -> 43810
studies_w_samples.select('STUDY ACCESSION').distinct().count()

AnalysisException: No such struct field cases in n_initial, n_cases

In [100]:
# Parses sample sizes from ancestry:
studies_w_samples_ancestry = (
    ancestries_df
    .groupBy('STUDY ACCESSION')
    .agg(
        f.sum('NUMBER OF INDIVIDUALS').cast(t.IntegerType()).alias('n_initial_ancestry'),
        f.sum('NUMBER OF CASES').cast(t.IntegerType()).alias('n_cases_ancestry')
    )
    .persist()
)

studies_w_samples_ancestry.count()

43943

In [101]:
(
    studies_w_samples_ancestry
    .join(studies_w_samples, on='STUDY ACCESSION', how='outer')
    .show()
)

+---------------+------------------+----------------+-------+---------+
|STUDY ACCESSION|n_initial_ancestry|n_cases_ancestry|n_cases|n_initial|
+---------------+------------------+----------------+-------+---------+
|     GCST000101|              1094|            null|      0|     1094|
|     GCST000549|              2362|            null|   2362|     2362|
|     GCST000926|              1012|            null|      0|      470|
|     GCST000994|              7128|            null|    733|     1462|
|     GCST000996|              6530|            null|   1717|     6530|
|     GCST001229|              2813|            null|    765|     1639|
|     GCST001294|              1772|            null|      0|     1772|
|     GCST001303|             12347|            null|   2315|    12347|
|     GCST001526|             85573|            null|      0|    51750|
|     GCST001565|             30518|            null|   2111|     4646|
|     GCST001602|               282|            null|    141|   

In [102]:
(
    ancestries_df
    .filter(f.col('STUDY ACCESSION') == 'GCST000994')
    .show(2, False, True)
#     .count()
)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------
 STUDY ACCESSION                         | GCST000994                                                                                                      
 INITIAL SAMPLE DESCRIPTION              | 733 Japanese ancestry cases, 729 Japanese ancestry controls                                                     
 REPLICATION SAMPLE DESCRIPTION          | 2,794 Japanese ancestry drinkers, 1,521 Japanese ancestry chance drinkers, 1,351 Japanese ancestry non-drinkers 
 STAGE                                   | replication                                                                                                     
 NUMBER OF INDIVIDUALS                   | 5666                                                                                                            
 BROAD ANCESTRAL CATEGORY                | East Asian           

In [103]:
(
    studies_df
    .filter(f.col('STUDY ACCESSION') == 'GCST000994')
    .show(2, False, True)
)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 DATE ADDED TO CATALOG       | 2011-04-07                                                                                                                                         
 PUBMED ID                   | 21372407                                                                                                                                           
 FIRST AUTHOR                | Takeuchi F                                                                                                                                         
 DATE                        | 2011-03-01                                                                                                                                         
 JOURNAL                     | Circ J                                                                    

In [107]:
# There are discrepancies between the study lists...
# Studies in study table but not in ancestry:
len(
    studies_df
    .join(ancestries_df, on='STUDY ACCESSION', how='left_anti')
    .select('STUDY ACCESSION')
    .drop_duplicates()
    .toPandas()
    ['STUDY ACCESSION']
    .to_list()
)

15

In [108]:
(
    ancestries_df
    .show(1, False, True)
)

-RECORD 0------------------------------------------------------------------------------
 STUDY ACCESSION                         | GCST009498                                  
 INITIAL SAMPLE DESCRIPTION              | up to 182,902 European ancestry individuals 
 REPLICATION SAMPLE DESCRIPTION          | NA                                          
 STAGE                                   | initial                                     
 NUMBER OF INDIVIDUALS                   | 182902                                      
 BROAD ANCESTRAL CATEGORY                | European                                    
 COUNTRY OF ORIGIN                       | NR                                          
 COUNTRY OF RECRUITMENT                  | NR                                          
 ADDITIONAL ANCESTRY DESCRIPTION         | null                                        
 ANCESTRY DESCRIPTOR                     | null                                        
 FOUNDER/GENETICALLY ISOLATED PO

In [86]:
studies_df.select('SUMMARY STATS LOCATION').show()

+----------------------+
|SUMMARY STATS LOCATION|
+----------------------+
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
|                  null|
+----------------------+
only showing top 20 rows



In [80]:
import pandas as pd
aids = (
    pd.read_csv('gwas_unpublished_ancestries_2022-05-24.tsv', sep='\t')
    .loc[lambda df: df['STUDY ACCESSION'].isin(sudy_only)]
    ['STUDY ACCESSION']
    .drop_duplicates()
    .to_list()
)

[x for x in sudy_only if x not in aids]

['FEIRI and ASKLEPIOS case control study / MayoVDB case control study / DEFINE-FMD case control study / UM/CCF-FMD case control study"']

In [84]:
(
    pd.read_csv('gwas_studies_new_2022-05-24.tsv', sep='\t')
    .loc[lambda df: df['STUDY ACCESSION'].isin(ancestry_only)]
    ['STUDY ACCESSION']
    .drop_duplicates()
    .to_list()
)

['GCST000870', 'GCST000218', 'GCST000768']

In [110]:
%%bash


grep GCST009498 gwas_ancestries_new_2022-05-27.tsv

GCST009498	31681408	Yang XL	2019-10-11	up to 182,902 European ancestry individuals	NA	initial	182902	European	NR	NR							
