In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, udf, lit, count, size, regexp_extract
from pyspark.sql import Row
from pyspark.sql.types import StringType, IntegerType, TimestampType, StructType

# Start (or reuse) a Spark session that runs locally on all available cores.
spark = SparkSession.builder.master('local[*]').getOrCreate()

# Location of the 2021-06-10 genetics portal evidence dump.
geneticsPortal = '/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-10.json.gz'



In [8]:

# Evidence releases to summarise (label -> path). Disabled entries are kept so that
# earlier releases can be re-enabled for a side-by-side comparison.
evidence_files = {
#     'old_evidence': '/Users/dsuveges/project_data/ot/evidence_input/21.04/geneticsprotal/genetics_portal_evidence_2021.04.13/',
#     #'new_evidence 06.10': '/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-10.json.gz',
#     'new_evidence 06.17': '/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-17/',
    'new_evidence 06.18': '/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-18/'
}


def count_distinct_rows(df, *columns):
    """Count the distinct combinations of `columns` present in `df`.

    Args:
        df: Spark DataFrame to inspect.
        *columns: Names of the columns that together define a unique row.

    Returns:
        int: Number of distinct rows after projecting `df` onto `columns`.
    """
    return df.select(*columns).distinct().count()


# Summary counts per release: {label: {metric name: count}}.
all_data = {}

# Most recently persisted release; its cache is dropped once the next release is loaded.
cached_df = None

for label, file in evidence_files.items():
    # Reset first, so a failing read leaves the name unset rather than bound to the previous release.
    geneticsPortal_df = None
    print(file)

    # After the loop only the last release is still reachable through `geneticsPortal_df`,
    # so release the cache of the earlier one instead of keeping every release persisted.
    if cached_df is not None:
        cached_df.unpersist()

    geneticsPortal_df = spark.read.json(file).persist()
    cached_df = geneticsPortal_df

    # Split once on the presence of a mapped disease identifier; several metrics reuse these.
    mapped_df = geneticsPortal_df.filter(col('diseaseFromSourceMappedId').isNotNull())
    unmapped_df = geneticsPortal_df.filter(col('diseaseFromSourceMappedId').isNull())

    data = {
        # Number of rows in the evidence set.
        "Evidence count": geneticsPortal_df.count(),

        # Distinct disease/target pairs among rows with a mapped disease.
        'Association count': count_distinct_rows(
            mapped_df, 'diseaseFromSourceMappedId', 'targetFromSourceId'
        ),

        # Distinct studies.
        "Study count": count_distinct_rows(geneticsPortal_df, 'studyId'),

        # Distinct targets.
        "Target count": count_distinct_rows(geneticsPortal_df, 'targetFromSourceId'),

        # Distinct variants.
        "Variant count": count_distinct_rows(geneticsPortal_df, 'variantId'),

        # Distinct mapped diseases.
        "Disease count": count_distinct_rows(mapped_df, 'diseaseFromSourceMappedId'),

        # Rows without a mapped disease identifier.
        "Evidence with unmapped diseases": unmapped_df.count(),

        # Distinct studies having at least one row without a mapped disease identifier.
        "Studies with unmapped diseases": count_distinct_rows(unmapped_df, 'studyId'),

        # Distinct study/variant pairs.
        "GWAS association": count_distinct_rows(geneticsPortal_df, 'studyId', 'variantId'),

        # TODO: add a "min. l2g score" metric; the previous commented-out placeholder
        # only repeated the study/variant query above.
    }

    all_data[label] = data

all_data

/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-18/


{'new_evidence 06.18': {'Evidence count': 635033,
  'Association count': 279578,
  'Study count': 16429,
  'Target count': 18193,
  'Variant count': 107017,
  'Disease count': 3130,
  'Evidence with unmapped diseases': 20519,
  'Studies with unmapped diseases': 322,
  'GWAS association': 220437}}

In [9]:
import pandas as pd

# Print the per-release summary counts as a Markdown table (one column per release label).
print(
    pd.DataFrame(data=all_data)
    .to_markdown()
)

|                                 |   new_evidence 06.18 |
|:--------------------------------|---------------------:|
| Association count               |               279578 |
| Disease count                   |                 3130 |
| Evidence count                  |               635033 |
| Evidence with unmapped diseases |                20519 |
| GWAS association                |               220437 |
| Studies with unmapped diseases  |                  322 |
| Study count                     |                16429 |
| Target count                    |                18193 |
| Variant count                   |               107017 |


In [None]:
# NOTE(review): `evidence` is not assigned in any cell visible here and this cell has no
# execution count; on a fresh "Run All" it would raise NameError unless the name is
# defined elsewhere — confirm, then remove or replace this leftover.
evidence

In [12]:
# geneticsPortal_df = (
#     spark.read.json('/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-17/')
#     .persist()
# )

# Distinct study IDs that have no mapped disease ID and whose ID matches the regex 'FINN'.
(
    geneticsPortal_df
    .where(col('diseaseFromSourceMappedId').isNull())
    .where(col('studyId').rlike('FINN'))
    .select('studyId')
    .dropDuplicates()
    .show(truncate=False)
)

+-----------------------------------------------------------+
|studyId                                                    |
+-----------------------------------------------------------+
|FINNGEN_R5_Z21_PROBL_RELATED_LIFE_M_DIFFICULTY             |
|FINNGEN_R5_AB1_SEPSIS                                      |
|FINNGEN_R5_Z21_PROBL_RELATED_CARE__DEPENDENCY              |
|FINNGEN_R5_Z21_ANKLE_FOOT                                  |
|FINNGEN_R5_Z21_NEED_IMMUNI_OTH_SINGLE_VIRAL_DISEA          |
|FINNGEN_R5_Z21_PRESENCE_OTH_DEVICES                        |
|FINNGEN_R5_Z21_PROCREATIVE_MANAG                           |
|FINNGEN_R5_M13_ARTHROSIS_INCLAVO                           |
|FINNGEN_R5_Z21_FIT_ADJ_OTH_DEVICES                         |
|FINNGEN_R5_H7_BULLKERATOPATHY                              |
|FINNGEN_R5_M13_ATLOAXSUBLUX                                |
|FINNGEN_R5_ST19_INJURY_BLOOD_VESSE_WRIST_HAND_LEVEL        |
|FINNGEN_R5_K11_REIMB_202                                   |
|FINNGEN

In [None]:
# Distinct study IDs that do have a mapped disease ID and whose ID does not match 'FINN'.
(
    geneticsPortal_df
    .where(col('diseaseFromSourceMappedId').isNotNull())
    .where(~col('studyId').rlike('FINN'))
    .select('studyId')
    .dropDuplicates()
    .show()
)

In [17]:
# Load the study index JSON and keep it cached for the lookups in the following cells.
studies = spark.read.json('/Users/dsuveges/repositories/evidence_datasource_parsers/study-index').persist()

In [43]:
# Show, one field per line, the study-index rows whose study_id matches 'FINNGEN_R5_H7_BULL'.
(
    studies
    .where(col('study_id').rlike('FINNGEN_R5_H7_BULL'))
    .show(truncate=False, vertical=True)
)

(0 rows)



In [31]:
# Number of distinct study-index rows with a 'FINNGEN'-matching study_id and an empty trait_efos array.
# (Swap .count() for .show(vertical=True, truncate=False) to inspect the rows themselves.)
(
    studies
    .where(col('study_id').rlike('FINNGEN'))
    .where(size(col('trait_efos')) == 0)
    .dropDuplicates()
    .count()
)

138

In [36]:
# Study IDs to look up in the study index.
study_list = [
    "FINNGEN_R5_Z21_PROBL_RELATED_LIFE_M_DIFFICULTY",
    "FINNGEN_R5_AB1_SEPSIS",
    "FINNGEN_R5_Z21_PROBL_RELATED_CARE__DEPENDENCY",
    "FINNGEN_R5_Z21_ANKLE_FOOT",
    "FINNGEN_R5_Z21_NEED_IMMUNI_OTH_SINGLE_VIRAL_DISEA",
    "FINNGEN_R5_Z21_PRESENCE_OTH_DEVICES",
    "FINNGEN_R5_Z21_PROCREATIVE_MANAG",
    "FINNGEN_R5_M13_ARTHROSIS_INCLAVO",
    "FINNGEN_R5_Z21_FIT_ADJ_OTH_DEVICES",
    "FINNGEN_R5_H7_BULLKERATOPATHY",
    "FINNGEN_R5_M13_ATLOAXSUBLUX",
    "FINNGEN_R5_ST19_INJURY_BLOOD_VESSE_WRIST_HAND_LEVEL",
    "FINNGEN_R5_K11_REIMB_202",
    "FINNGEN_R5_ST19_MALTR_SYNDR",
    "FINNGEN_R5_Z21_FOLLOW__EXAM_TREAT_CONDI_OTH_MALIG_NEOPLASMS",
    "FINNGEN_R5_CD2_PRIMARY_LYMPHOID_HEMATOPOIETIC",
    "FINNGEN_R5_R18_SENILITY",
    "FINNGEN_R5_I9_HEARTFAIL_AND_CHD",
]

# Print the first distinct study-index row whose study_id is in the list, one field per line.
(
    studies
    .where(col('study_id').isin(study_list))
    .dropDuplicates()
    .show(n=1, truncate=False, vertical=True)
)

-RECORD 0-------------------------------------------------------
 ancestry_initial     | [European=198179.0]                     
 ancestry_replication | []                                      
 has_sumstats         | true                                    
 n_cases              | 4609                                    
 n_initial            | 198179                                  
 n_replication        | 0                                       
 num_assoc_loci       | 1                                       
 pmid                 | null                                    
 pub_author           | FINNGEN_R5                              
 pub_date             | 2018-03-01                              
 pub_journal          | null                                    
 pub_title            | null                                    
 source               | FINNGEN                                 
 study_id             | FINNGEN_R5_Z21_FIT_ADJ_OTH_DEVICES      
 trait_category       | n

In [34]:
# Number of distinct rows in the study index.
studies.dropDuplicates().count()

33758

In [48]:
# (Re)load the 2021-06-18 genetics portal evidence and keep it cached.
geneticsPortal_df = spark.read.json('/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-18/').persist()


In [13]:
# Count rows that carry a mapped disease ID but no diseaseFromSource value.
(
    geneticsPortal_df
    .where(col('diseaseFromSourceMappedId').isNotNull())
    .where(col('diseaseFromSource').isNull())
    .count()
)

0

In [14]:
# Print two rows whose projectId equals 'GCST', one field per line and untruncated.
(
    geneticsPortal_df
    .where(col('projectId') == 'GCST')
    .show(n=2, truncate=False, vertical=True)
)

-RECORD 0-----------------------------------------------------------------------------
 beta                             | -0.0291                                           
 betaConfidenceIntervalLower      | -0.0380660341860385                               
 betaConfidenceIntervalUpper      | -0.0201339658139614                               
 datasourceId                     | ot_genetics_portal                                
 datatypeId                       | genetic_association                               
 diseaseFromSource                | Type 2 diabetes                                   
 diseaseFromSourceMappedId        | EFO_0001360                                       
 literature                       | [32541925]                                        
 oddsRatio                        | null                                              
 oddsRatioConfidenceIntervalLower | null                                              
 oddsRatioConfidenceIntervalUpper | null   

In [45]:
# Leading run of capital letters in a study ID, used here as the project prefix.
pattern = r"^([A-Z]+)"

# Per project prefix, count the distinct studies that have rows without a mapped disease ID.
(
    geneticsPortal_df
    .where(col('diseaseFromSourceMappedId').isNull())
    .select(
        'studyId',
        regexp_extract(col("studyId"), pattern, 1).alias('projectId'),
    )
    .dropDuplicates()
    .groupBy('projectId')
    .count()
    .show()
)

+---------+-----+
|projectId|count|
+---------+-----+
|    NEALE|  256|
|    SAIGE|   50|
|  FINNGEN|   13|
|     GCST|    3|
+---------+-----+



In [47]:
# Distinct study IDs that have no mapped disease ID and whose ID matches the regex 'FINNGEN'.
(
    geneticsPortal_df
    .where(col('diseaseFromSourceMappedId').isNull())
    .where(col('studyId').rlike('FINNGEN'))
    .select('studyId')
    .dropDuplicates()
    .show(n=20, truncate=False)
)

+-----------------------------------------------------------+
|studyId                                                    |
+-----------------------------------------------------------+
|FINNGEN_R5_Z21_PROBL_RELATED_LIFE_M_DIFFICULTY             |
|FINNGEN_R5_Z21_PROBL_RELATED_CARE__DEPENDENCY              |
|FINNGEN_R5_Z21_ANKLE_FOOT                                  |
|FINNGEN_R5_Z21_NEED_IMMUNI_OTH_SINGLE_VIRAL_DISEA          |
|FINNGEN_R5_Z21_PRESENCE_OTH_DEVICES                        |
|FINNGEN_R5_Z21_PROCREATIVE_MANAG                           |
|FINNGEN_R5_Z21_FIT_ADJ_OTH_DEVICES                         |
|FINNGEN_R5_M13_ATLOAXSUBLUX                                |
|FINNGEN_R5_ST19_INJURY_BLOOD_VESSE_WRIST_HAND_LEVEL        |
|FINNGEN_R5_K11_REIMB_202                                   |
|FINNGEN_R5_ST19_MALTR_SYNDR                                |
|FINNGEN_R5_Z21_FOLLOW__EXAM_TREAT_CONDI_OTH_MALIG_NEOPLASMS|
|FINNGEN_R5_CD2_PRIMARY_LYMPHOID_HEMATOPOIETIC              |
+-------

In [51]:
# Distinct study IDs of the rows whose projectId is null (first 20, untruncated).
(
    geneticsPortal_df
    .where(col('projectId').isNull())
    .select('studyId')
    .dropDuplicates()
    .show(n=20, truncate=False)
)

+---------------+
|studyId        |
+---------------+
|SAIGE_735_2    |
|NEALE2_2110    |
|NEALE2_1150_2  |
|SAIGE_270_3    |
|NEALE2_6143_1  |
|NEALE2_4277    |
|NEALE2_6149_100|
|NEALE2_20551_3 |
|NEALE2_5983_raw|
|NEALE2_20552_1 |
|NEALE2_1797    |
|NEALE2_924     |
|NEALE2_41219_1 |
|NEALE2_3637    |
|NEALE2_5001    |
|SAIGE_350_1    |
|NEALE2_6158_2  |
|NEALE2_4990    |
|NEALE2_20526   |
|SAIGE_516_1    |
+---------------+
only showing top 20 rows

