In [2]:
import pandas as pd
import json
import gzip
import python_jsonschema_objects as pjo 
from collections import OrderedDict

# Location of the parquet dataset read below with pandas (and again with Spark further down).
# NOTE(review): absolute, user-specific path — the notebook will not run on another machine as-is.
ot_genetics_file = '/Users/dsuveges/project/issue-817_Create_genetics_evidence_json/l2g_joined_exploded.2020.03.04.parquet/'

# Load the dataset into a pandas DataFrame and preview its first rows.
df = pd.read_parquet(ot_genetics_file)
df.head()




Unnamed: 0,chrom,pos,ref,alt,gene_id,study_id,y_proba_full_model,pval_mantissa,pval_exponent,odds_ratio,...,pub_date,pub_author,trait_reported,sample_size,efo,most_severe_csq,csq_arr,most_severe_gene_csq,consequence_link,rsid
0,1,4544044,T,A,ENSG00000196581,GCST003128,0.757268,3.0,-9,,...,2015-09-22,Zhu Z,Adolescent idiopathic scoliosis,2459,EFO_0005423,,,intergenic_variant,http://purl.obolibrary.org/obo/SO_0001628,rs241215
1,1,7606620,C,T,ENSG00000049246,NEALE2_20002_1623,0.051663,1.39165,-8,5.784572,...,2018-08-01,UKB Neale v2,Tennis elbow / lateral epicondylitis | non-can...,361141,EFO_1001896,,,intergenic_variant,http://purl.obolibrary.org/obo/SO_0001628,rs115071930
2,1,7606620,C,T,ENSG00000049247,NEALE2_20002_1623,0.036761,1.39165,-8,5.784572,...,2018-08-01,UKB Neale v2,Tennis elbow / lateral epicondylitis | non-can...,361141,EFO_1001896,,,intergenic_variant,http://purl.obolibrary.org/obo/SO_0001628,rs115071930
3,1,7606620,C,T,ENSG00000116285,NEALE2_20002_1623,0.054864,1.39165,-8,5.784572,...,2018-08-01,UKB Neale v2,Tennis elbow / lateral epicondylitis | non-can...,361141,EFO_1001896,,,intergenic_variant,http://purl.obolibrary.org/obo/SO_0001628,rs115071930
4,1,7606620,C,T,ENSG00000049249,NEALE2_20002_1623,0.026479,1.39165,-8,5.784572,...,2018-08-01,UKB Neale v2,Tennis elbow / lateral epicondylitis | non-can...,361141,EFO_1001896,,,intergenic_variant,http://purl.obolibrary.org/obo/SO_0001628,rs115071930


In [3]:
# Take the second record (positional index 1) so every column of a single row can be inspected.
row = df.iloc[1]
row

chrom                                                                   1
pos                                                               7606620
ref                                                                     C
alt                                                                     T
gene_id                                                   ENSG00000049246
study_id                                                NEALE2_20002_1623
y_proba_full_model                                              0.0516626
pval_mantissa                                                     1.39165
pval_exponent                                                          -8
odds_ratio                                                        5.78457
oddsr_ci_lower                                                    3.15483
oddsr_ci_upper                                                    10.6063
pmid                                                                     
pub_date                              

What the JSON schema looks like:

```json
{
  "properties": {
    "datasourceId": {
      "const": "ot_genetics_portal"
    },
    "confidenceIntervalLower": {
      "$ref": "#/definitions/confidenceIntervalLower"
    },
    "confidenceIntervalUpper": {
      "$ref": "#/definitions/confidenceIntervalUpper"
    },
    "datatypeId": {
      "$ref": "#/definitions/datatypeId"
    },
    "diseaseFromSource": {
      "$ref": "#/definitions/diseaseFromSource"
    },
    "diseaseFromSourceId": {
      "$ref": "#/definitions/diseaseFromSourceId"
    },
    "diseaseId": {
      "$ref": "#/definitions/diseaseId"
    },
    "literature": {
      "$ref": "#/definitions/literature"
    },
    "oddsRatio": {
      "$ref": "#/definitions/oddsRatio"
    },
    "pValueExponent": {
      "$ref": "#/definitions/pValueExponent"
    },
    "pValueMantissa": {
      "$ref": "#/definitions/pValueMantissa"
    },
    "publicationFirstAuthor": {
      "$ref": "#/definitions/publicationFirstAuthor"
    },
    "publicationYear": {
      "$ref": "#/definitions/publicationYear"
    },
    "resourceScore": {
      "$ref": "#/definitions/resourceScore"
    },
    "studyId": {
      "$ref": "#/definitions/studyId"
    },
    "studySampleSize": {
      "$ref": "#/definitions/studySampleSize"
    },
    "targetFromSourceId": {
      "$ref": "#/definitions/targetFromSourceId"
    },
    "variantFunctionalConsequenceId": {
      "$ref": "#/definitions/variantFunctionalConsequenceId"
    },
    "variantId": {
      "$ref": "#/definitions/variantId"
    },
    "variantRsId": {
      "$ref": "#/definitions/variantRsId"
    }
  },
  "required": [
    "datasourceId",
    "targetFromSourceId",
    "diseaseId"
  ],
  "additionalProperties": false
}
```

The following fields are required for `ot_genetics_portal`:
* `targetFromSourceId`: `gene_id`
* `diseaseId`: `efo`
* `datasourceId`: `ot_genetics_portal`

The following fields are also provided:
* `variantRsId`: `rsid`
* `variantId`: `chrom`, `pos`, `alt`, `ref` concatenated into a single string
* etc.

In [1]:
from datetime import datetime
import argparse
import os
import sys
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Reuse the active Spark session if there is one, otherwise start a new one.
# No `global` declaration is needed: a name assigned at cell (module) level is already global.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
print('Spark version: ', spark.version)

Spark version:  3.0.0


In [5]:
# Build `ot_genetics_portal` evidence records from the parquet dataset and write a
# random sample of them as gzip-compressed JSON.
SAMPLE_SIZE = 500  # number of evidence records kept in the sample file
SAMPLE_SEED = 42   # fixed seed so re-runs on the same input shuffle rows the same way
                   # (assuming the input partitioning is unchanged)

evidence_sample = (
    spark.read.parquet(ot_genetics_file)
    # `pmid` looks like "PMID:<digits>"; keep only the digits, wrapped in an array.
    # Rows with an empty `pmid` get a null `literature` value.
    .withColumn(
        'literature',
        when(col('pmid') != '', array(regexp_extract(col('pmid'), r"PMID:(\d+)$", 1))).otherwise(None)
    )
    .select(
        lit('ot_genetics_portal').alias('datasourceId'),
        lit('genetic_association').alias('datatypeId'),
        col('gene_id').alias('targetFromSourceId'),
        # NOTE(review): the schema excerpt shown in this notebook names this property `diseaseId`
        # and sets `additionalProperties: false` — confirm which name the target schema expects.
        col('efo').alias('diseaseFromSourceMappedId'),
        col('literature'),
        col('pub_author').alias('publicationFirstAuthor'),
        # First four characters of `pub_date` (e.g. "2018-08-01") are the year.
        substring(col('pub_date'), 1, 4).cast(IntegerType()).alias('publicationYear'),
        col('trait_reported').alias('diseaseFromSource'),
        col('study_id').alias('studyId'),
        col('sample_size').alias('studySampleSize'),
        col('pval_mantissa').alias('pValueMantissa'),
        col('pval_exponent').alias('pValueExponent'),
        col('odds_ratio').alias('oddsRatio'),
        col('oddsr_ci_lower').alias('confidenceIntervalLower'),
        col('oddsr_ci_upper').alias('confidenceIntervalUpper'),
        col('y_proba_full_model').alias('resourceScore'),
        col('rsid').alias('variantRsId'),
        # Open Targets Genetics variant identifiers are CHROM_POS_REF_ALT (reference allele first).
        concat_ws('_', col('chrom'), col('pos'), col('ref'), col('alt')).alias('variantId'),
        # `consequence_link` is a URL ending in the SO term (e.g. ".../obo/SO_0001628"); keep the term.
        regexp_extract(col('consequence_link'), r"\/(SO.+)$", 1).alias('variantFunctionalConsequenceId')
    )
    .orderBy(rand(seed=SAMPLE_SEED))
    .limit(SAMPLE_SIZE)
)

# `save()` returns None, so the write is kept as a separate statement and its result is not
# bound to any name (binding it to `df` would replace the pandas frame loaded earlier with None).
(
    evidence_sample
    .write.format('json').mode('overwrite')
    .option("compression", "org.apache.hadoop.io.compress.GzipCodec")
    .save('cicaful3.json.gz')
)

In [139]:
import warlock
import json

In [140]:
# Load the JSON schema document from a local file.
# JSON text is UTF-8, so decode explicitly instead of relying on the platform default encoding.
# NOTE(review): absolute, user-specific path — not portable to other machines.
with open('/Users/dsuveges/repositories/json_schema/opentargets.json', 'r', encoding='utf-8') as f:
    schema = json.load(f)

In [None]:
# Hand-written skeleton of a top-level schema; running this cell rebinds `schema`.
# Both `oneOf` branches are still empty placeholders.
# NOTE(review): as written, every object satisfies both branches, and `oneOf` requires exactly
# one match, so validating against this skeleton would reject every instance until the
# branches are filled in.
schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "OpenTargets",
    "type": "object",
    "oneOf": [
        {"properties": {}},
        {},
    ],
}


In [142]:
# Turn the schema into a warlock model class; constructing an instance validates the supplied
# fields against the schema (a mismatch surfaces as ValueError, as seen in the following cell's output).
evidence = warlock.model_factory(schema)

In [145]:
# Try to instantiate an evidence object with only `datasourceId` set.
# NOTE(review): this raises ValueError (see the output below) — the validator reports a mismatch
# against the schema's first branch, whose `datasourceId` const is 'cancer_gene_census'.
# Presumably the required `targetFromSourceId` / `diseaseId` fields must also be supplied before
# the ot_genetics_portal branch can match — confirm against the schema.
sweden = evidence(datasourceId='ot_genetics_portal')

ValueError: 'cancer_gene_census' was expected

Failed validating 'const' in schema[0]['properties']['datasourceId']:
    {'const': 'cancer_gene_census'}

On instance['datasourceId']:
    'ot_genetics_portal'