In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, udf, lit, size
from pyspark.sql.types import StringType, IntegerType, TimestampType, StructType

# Start (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Location of the Genetics Portal evidence JSON. The default is the snapshot
# path this notebook was originally run against; set the GENETICS_PORTAL_DATA
# environment variable to point at another copy without editing the notebook.
geneticsportal_data = os.environ.get(
    'GENETICS_PORTAL_DATA',
    '/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-21/',
)

# Read the evidence once and mark it for caching: the following cells run
# several separate actions (counts, filters) against this same DataFrame.
genetics_df = (
    spark.read.json(geneticsportal_data)
    .persist()
)

# Preview two records, one field per line, without truncating long values.
genetics_df.show(2, vertical=True, truncate=False)

-RECORD 0-----------------------------------------------------------------------------
 beta                             | -0.0291                                           
 betaConfidenceIntervalLower      | -0.0380660341860385                               
 betaConfidenceIntervalUpper      | -0.0201339658139614                               
 datasourceId                     | ot_genetics_portal                                
 datatypeId                       | genetic_association                               
 diseaseFromSource                | Type 2 diabetes                                   
 diseaseFromSourceMappedId        | EFO_0001360                                       
 literature                       | [32541925]                                        
 oddsRatio                        | null                                              
 oddsRatioConfidenceIntervalLower | null                                              
 oddsRatioConfidenceIntervalUpper | null   

In [3]:
# Headline numbers for the loaded evidence set.
print(f'Number of evidence: {genetics_df.count()}')

# Each entry pairs the printed label with the columns whose distinct
# combinations are counted.
# NOTE(review): distinct() keeps null as a value of its own, so a column that
# contains nulls contributes one extra to its count - confirm this is intended.
distinct_summaries = (
    ('Number of disease mapped', ('diseaseFromSourceMappedId',)),
    ('Number of targets', ('targetFromSourceId',)),
    ('Number of associations', ('targetFromSourceId', 'diseaseFromSourceMappedId')),
    ('Number of studies', ('studyId',)),
)
for summary_label, key_columns in distinct_summaries:
    distinct_rows = genetics_df.select(*key_columns).distinct().count()
    print(f'{summary_label}: {distinct_rows}')


Number of evidence: 635033
Number of disease mapped: 3134
Number of targets: 18193
Number of associations: 287095
Number of studies: 16429


In [6]:
# How many evidence rows have no mapped disease identifier?
# The bare expression is left last so the notebook displays the count.
genetics_df.where(col('diseaseFromSourceMappedId').isNull()).count()

20501

In [9]:
# How many evidence rows have no projectId?
# The bare expression is left last so the notebook displays the count.
genetics_df.where(col('projectId').isNull()).count()

20501