In [59]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, udf, lit
from pyspark.sql import Row
from pyspark.sql.types import StringType, IntegerType, TimestampType, StructType

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

geneticsPortal='/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-10.json.gz'

geneticsPortal_df = (
    spark.read.json(geneticsPortal)
    .persist()
)

# Show the first two entries:
geneticsPortal_df.show(2, vertical=True, truncate=False)

In [2]:
geneticsPortal_df.count()

632693

In [4]:
geneticsPortal_df.select('studyId').distinct().count()

16429

In [7]:
y = (
    geneticsPortal_df
    .filter(col('diseaseFromSourceMappedId').isNull())
    .count()
)

y

20426

In [6]:
(
    geneticsPortal_df
    .filter(col('diseaseFromSourceMappedId').isNotNull())
    .select('diseaseFromSourceMappedId')
    .distinct()
    .count()
)

3176

In [12]:
data = {
    # # associations
    "Association count": geneticsPortal_df.count(),
    
    # # study
    "Study count": geneticsPortal_df.select('studyId').distinct().count(),
    
    # # target
    "Target count":  geneticsPortal_df.select('targetFromSourceId').distinct().count(),
    
    # # target
    "Variant count":  geneticsPortal_df.select('variantId').distinct().count(),
    
    # # disease
    "Disease count": (
        geneticsPortal_df
        .filter(col('diseaseFromSourceMappedId').isNotNull())
        .select('diseaseFromSourceMappedId')
        .distinct()
        .count()
    ),
    
    # # Associations without EFO identifiers:
    "Evidence with unmapped diseases": (
        geneticsPortal_df
        .filter(col('diseaseFromSourceMappedId').isNull())
        .count()
    ),
    
    # # Associations without EFO identifiers:
    "Unmapped diseases": (
        geneticsPortal_df
        .filter(col('diseaseFromSourceMappedId').isNull())
        .select('diseaseFromSource')
        .distinct()
        .count()
    ),
    
    # # association
    "Association count": (
        geneticsPortal_df
        .select('studyId', 'variantId')
        .distinct()
        .count()
    ),

    # # GWAS association (study_id/variant)
    "GWAS association": (
        geneticsPortal_df
        .select('studyId', 'variantId')
        .distinct()
        .count()
    )
}

data

{'Association count': 220402,
 'Study count': 16429,
 'Target count': 18183,
 'Variant count': 107002,
 'Disease count': 3176,
 'Evidence with unmapped diseases': 20426,
 'Unmapped diseases': 1,
 'GWAS association': 220402}

In [30]:

evidence_files = {
    'old_evidence': '/Users/dsuveges/project_data/ot/evidence_input/21.04/geneticsprotal/genetics_portal_evidence_2021.04.13/',
    'new_evidence': '/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-10.json.gz'
}


all_data = {}

for label, file in evidence_files.items():
    print(file)
    geneticsPortal_df = (
        spark.read.json(file)
        .persist()
    )
    
    data = {
        # # associations
        "Association count": geneticsPortal_df.count(),

        # # study
        "Study count": geneticsPortal_df.select('studyId').distinct().count(),

        # # target
        "Target count":  geneticsPortal_df.select('targetFromSourceId').distinct().count(),

        # # target
        "Variant count":  geneticsPortal_df.select('variantId').distinct().count(),

        # # disease
        "Disease count": (
            geneticsPortal_df
            .filter(col('diseaseFromSourceMappedId').isNotNull())
            .select('diseaseFromSourceMappedId')
            .distinct()
            .count()
        ),

        # # Associations without EFO identifiers:
        "Evidence with unmapped diseases": (
            geneticsPortal_df
            .filter(col('diseaseFromSourceMappedId').isNull())
            .count()
        ),

        # # Associations without EFO identifiers:
        "Unmapped diseases": (
            geneticsPortal_df
            .filter(col('diseaseFromSourceMappedId').isNull())
            .select('diseaseFromSource')
            .distinct()
            .count()
        ),

        # # association
        "Association count": (
            geneticsPortal_df
            .select('studyId', 'variantId')
            .distinct()
            .count()
        ),

        # # GWAS association (study_id/variant)
        "GWAS association": (
            geneticsPortal_df
            .select('studyId', 'variantId')
            .distinct()
            .count()
        )
    }
    
    all_data[label] = data

all_data

/Users/dsuveges/project_data/ot/evidence_input/21.04/geneticsprotal/genetics_portal_evidence_2021.04.13/
/Users/dsuveges/repositories/evidence_datasource_parsers/genetics_portal-2021-06-10.json.gz


{'old_evidence': {'Association count': 135721,
  'Study count': 11038,
  'Target count': 17300,
  'Variant count': 71498,
  'Disease count': 2291,
  'Evidence with unmapped diseases': 0,
  'Unmapped diseases': 0,
  'GWAS association': 135721},
 'new_evidence': {'Association count': 220402,
  'Study count': 16429,
  'Target count': 18183,
  'Variant count': 107002,
  'Disease count': 3176,
  'Evidence with unmapped diseases': 20426,
  'Unmapped diseases': 1,
  'GWAS association': 220402}}

In [33]:
import pandas as pd 

print(pd.DataFrame(all_data).to_markdown())

|                                 |   old_evidence |   new_evidence |
|:--------------------------------|---------------:|---------------:|
| Association count               |         135721 |         220402 |
| Study count                     |          11038 |          16429 |
| Target count                    |          17300 |          18183 |
| Variant count                   |          71498 |         107002 |
| Disease count                   |           2291 |           3176 |
| Evidence with unmapped diseases |              0 |          20426 |
| Unmapped diseases               |              0 |              1 |
| GWAS association                |         135721 |         220402 |


In [23]:
evidence

'/Users/dsuveges/project_data/ot_genetics/cicaful8.parquet'

In [56]:
(
    geneticsPortal_df
    .filter(
        col('diseaseFromSourceMappedId').isNull()
    )
    .select(col('studyId')) #,col('diseaseFromSourceMappedId'), col('diseaseFromSource'))
    .distinct()
    .withColumn('study_stem', split(col('studyId'), '_').getItem(0))
    .groupby(col('study_stem'))
    .count()
    .show()
)

+------------+-----+
|  study_stem|count|
+------------+-----+
|  GCST011378|    1|
|      NEALE2|  256|
|       SAIGE|   50|
|     FINNGEN|    7|
|  GCST010653|    1|
|GCST90013791|    1|
|  GCST010729|    1|
+------------+-----+



In [82]:
from pyspark.sql.functions import regexp_extract, regexp_replace, substring

mylist = [
    {"type_activity_id":1,"type_activity_name":"xxx", 'extra_label': 'cirmos23cica23523hajj'},
    {"type_activity_id":2,"type_activity_name":"yyy", 'extra_label': '- hova232lett2325a'},
    {"type_activity_id":3,"type_activity_name":"zzz", 'extra_label': 'vaj'}
]

(
    spark
    .createDataFrame(Row(**x) for x in mylist)
    .withColumn('words',  regexp_extract_all(col('extra_label'), r'(\D+)', 0))
#     .filte%colorscol('extra_label').rlike(r'lett'))
    .withColumn('phenotype', regexp_replace(col('extra_label'), r'[^0-9a-zA-Z -]', ''))
    .show()
)

NameError: name 'regexp_extract_all' is not defined

In [90]:
data = [("James","M",60000), ("Michael","M",70000),
        ("Robert",None,400000), ("Maria","F",500000),
        ("Jen","",None)]

columns = ["name","gender","salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()

#Using When otherwise
from pyspark.sql.functions import when,col
df2 = df.withColumn("new_gender", when(df.gender == "M","Male")
                                 .when(df.gender == "F","Female")
                                 .when(df.gender.isNull(), None)
                                 .otherwise(None))
df2.show()


+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  null|400000|
|  Maria|     F|500000|
|    Jen|      |  null|
+-------+------+------+

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|      null|
|  Maria|     F|500000|    Female|
|    Jen|      |  null|      null|
+-------+------+------+----------+



In [97]:
(
    geneticsPortal_df
    .filter(col('studyId').rlike('FINNGEN'))
    .withColumn('isMapped', 
                when(col('diseaseFromSourceMappedId').isNotNull(), 'Yes')
                .otherwise('No')
     )
    .select('studyId', 'isMapped')
    .distinct()
    .groupby(col('isMapped'))
    .count()
    .show()
)

+--------+-----+
|isMapped|count|
+--------+-----+
|      No|    7|
|     Yes| 1221|
+--------+-----+



In [100]:
(
    geneticsPortal_df
    .filter(
        col('studyId').rlike('FINNGEN') &
        col('diseaseFromSourceMappedId').isNotNull()
    )
    .select('diseaseFromSource', 'diseaseFromSourceMappedId', 'studyId')
    .distinct()
    .show(10, truncate=False)
)

+--------------------------------------------------------------------------+-------------------------+-------------------------------+
|diseaseFromSource                                                         |diseaseFromSourceMappedId|studyId                        |
+--------------------------------------------------------------------------+-------------------------+-------------------------------+
|Cardiomyopathy                                                            |EFO_0000318              |FINNGEN_R5_I9_CARDMYO          |
|Other acquired deformities of musculoskeletal system and connective tissue|EFO_0009676              |FINNGEN_R5_M13_DEFORMACQ       |
|Reactive arthropathies                                                    |EFO_0007460              |FINNGEN_R5_M13_REACTARTH       |
|Type 1 diabetes with renal complications                                  |EFO_0004996              |FINNGEN_R5_E4_DM1REN           |
|Valgus deformity, not elsewhere classified            

In [103]:
(
    geneticsPortal_df
    .filter(
        col('studyId').rlike('FINNGEN') &
        col('diseaseFromSourceMappedId').isNotNull()
    )
    .select('studyId', 'variantID')
    .distinct()
    .count()
)

5492

In [104]:
(
    geneticsPortal_df
    .filter(
        col('studyId').rlike('FINNGEN') &
        col('diseaseFromSourceMappedId').isNull()
    )
    .select('diseaseFromSource', 'diseaseFromSourceMappedId', 'studyId')
    .distinct()
    .show(10, truncate=False)
)

+-----------------+-------------------------+--------------------------------+
|diseaseFromSource|diseaseFromSourceMappedId|studyId                         |
+-----------------+-------------------------+--------------------------------+
|null             |null                     |FINNGEN_R5_AB1_SEPSIS           |
|null             |null                     |FINNGEN_R5_LUNG_TRANSPLANTATION |
|null             |null                     |FINNGEN_R5_M13_ARTHROSIS_INCLAVO|
|null             |null                     |FINNGEN_R5_H7_BULLKERATOPATHY   |
|null             |null                     |FINNGEN_R5_K11_REIMB_202        |
|null             |null                     |FINNGEN_R5_R18_SENILITY         |
|null             |null                     |FINNGEN_R5_I9_HEARTFAIL_AND_CHD |
+-----------------+-------------------------+--------------------------------+



In [107]:
from pyspark.sql.functions import max as py_max

(
    geneticsPortal_df
    .agg({'pValueExponent': 'max'}).show()
    
)

# geneticsPortal_df.show(1, vertical=True, truncate=False)

+-------------------+
|max(pValueExponent)|
+-------------------+
|                 -8|
+-------------------+



In [110]:
spark.version

'3.0.0'