# New Gene2Phenotype dataset

Comparing the new and old datasets for changes in the schema:

* New dataset: `https://www.ebi.ac.uk/gene2phenotype/downloads` 
* Old dataset: `gs://otar000-evidence_input/Gene2Phenotype/data_files`

In [2]:
%%bash

# Download the new and the old Gene2Phenotype panel files into two local
# folders so that their schemas can be compared.

NEWDIR='/Users/dsuveges/project_data/gene2phenotype/new'
OLDDIR='/Users/dsuveges/project_data/gene2phenotype/old'

# Panels to fetch; the same list drives both downloads.
DATASETS=(CancerG2P DDG2P EyeG2P SkinG2P)

# Expansions are quoted so the commands keep working if a path contains spaces.
mkdir -p "${NEWDIR}"
mkdir -p "${OLDDIR}"


# Fetching new dataset:
for dataset in "${DATASETS[@]}"; do
    wget "https://www.ebi.ac.uk/gene2phenotype/downloads/${dataset}.csv.gz" -O "${NEWDIR}/${dataset}.csv.gz"
done

ls -lah "${NEWDIR}"

# Fetching old data:
for dataset in "${DATASETS[@]}"; do
    echo "Fetching: ${dataset}"
    gsutil cp -r "gs://otar000-evidence_input/Gene2Phenotype/data_files/${dataset}-2021-11-02.csv.gz" "${OLDDIR}"
done

ls -lah "${OLDDIR}"


total 920
drwxr-xr-x  6 dsuveges  384566875   192B 10 Dec 13:42 .
drwxr-xr-x  4 dsuveges  384566875   128B 10 Dec 13:42 ..
-rw-r--r--  1 dsuveges  384566875   8.7K 10 Dec 13:43 CancerG2P.csv.gz
-rw-r--r--  1 dsuveges  384566875   219K 10 Dec 13:43 DDG2P.csv.gz
-rw-r--r--  1 dsuveges  384566875    70K 10 Dec 13:43 EyeG2P.csv.gz
-rw-r--r--  1 dsuveges  384566875    53K 10 Dec 13:43 SkinG2P.csv.gz
Fetching: CancerG2P
Fetching: DDG2P
Fetching: EyeG2P
Fetching: SkinG2P
total 720
drwxr-xr-x  6 dsuveges  384566875   192B 10 Dec 13:44 .
drwxr-xr-x  4 dsuveges  384566875   128B 10 Dec 13:42 ..
-rw-r--r--  1 dsuveges  384566875   8.8K 10 Dec 13:43 CancerG2P-2021-11-02.csv.gz
-rw-r--r--  1 dsuveges  384566875   218K 10 Dec 13:44 DDG2P-2021-11-02.csv.gz
-rw-r--r--  1 dsuveges  384566875    70K 10 Dec 13:44 EyeG2P-2021-11-02.csv.gz
-rw-r--r--  1 dsuveges  384566875    54K 10 Dec 13:44 SkinG2P-2021-11-02.csv.gz


--2021-12-10 13:43:54--  https://www.ebi.ac.uk/gene2phenotype/downloads/CancerG2P.csv.gz
Resolving www.ebi.ac.uk (www.ebi.ac.uk)... 193.62.193.80
Connecting to www.ebi.ac.uk (www.ebi.ac.uk)|193.62.193.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8933 (8.7K) [application/zip]
Saving to: ‘/Users/dsuveges/project_data/gene2phenotype/new/CancerG2P.csv.gz’

     0K ........                                              100% 40.6M=0s

2021-12-10 13:43:55 (40.6 MB/s) - ‘/Users/dsuveges/project_data/gene2phenotype/new/CancerG2P.csv.gz’ saved [8933/8933]

--2021-12-10 13:43:55--  https://www.ebi.ac.uk/gene2phenotype/downloads/DDG2P.csv.gz
Resolving www.ebi.ac.uk (www.ebi.ac.uk)... 193.62.193.80
Connecting to www.ebi.ac.uk (www.ebi.ac.uk)|193.62.193.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 224653 (219K) [application/zip]
Saving to: ‘/Users/dsuveges/project_data/gene2phenotype/new/DDG2P.csv.gz’

     0K .......... .......... .....

In [30]:
from pyspark.sql.functions import (
    col, udf, struct, lit, split, regexp_replace, create_map, min as spark_min, max as spark_max,
    count, sum as spar_sum, explode, when
)
from pyspark.sql.types import (
    FloatType, ArrayType, StructType, StructField, StringType, IntegerType, TimestampType
)
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from itertools import chain


# for heatmap:
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt

# Spark configuration for this notebook: memory settings for driver and
# executor, plus the driver result-size setting.
sparkConf = SparkConf().setAll([
    ('spark.driver.memory', '15g'),
    ('spark.executor.memory', '15g'),
    ('spark.driver.maxResultSize', '0'),
])

# Create the session (or reuse an existing one) with the local[*] master.
spark = SparkSession.builder.master('local[*]').config(conf=sparkConf).getOrCreate()

In [9]:
# Panels published by Gene2Phenotype; each one is stored in its own csv file.
file_types = ["CancerG2P", "DDG2P", "EyeG2P", "SkinG2P"]

# Path templates of the two releases; `{}` is filled in with the panel name.
paths = {
    'old': '/Users/dsuveges/project_data/gene2phenotype/old/{}-2021-11-02.csv.gz',
    'new': '/Users/dsuveges/project_data/gene2phenotype/new/{}.csv.gz',
}

# Print the old and the new schema of every panel so they can be compared.
for file_type in file_types:
    print(f'looking at the schema of {file_type}')
    for key, value in paths.items():
        print(f'Opening {key} dataset:')
        # printSchema() writes the schema to stdout itself and returns None,
        # so wrapping it in print() only added a stray "None" line.
        spark.read.csv(value.format(file_type), header=True).printSchema()


looking at the schema of CancerG2P
OPpening old dataset:
root
 |-- gene symbol: string (nullable = true)
 |-- gene mim: string (nullable = true)
 |-- disease name: string (nullable = true)
 |-- disease mim: string (nullable = true)
 |-- DDD category: string (nullable = true)
 |-- allelic requirement: string (nullable = true)
 |-- mutation consequence: string (nullable = true)
 |-- phenotypes: string (nullable = true)
 |-- organ specificity list: string (nullable = true)
 |-- pmids: string (nullable = true)
 |-- panel: string (nullable = true)
 |-- prev symbols: string (nullable = true)
 |-- hgnc id: string (nullable = true)
 |-- gene disease pair entry date: string (nullable = true)

None
OPpening new dataset:
root
 |-- gene symbol: string (nullable = true)
 |-- gene mim: string (nullable = true)
 |-- disease name: string (nullable = true)
 |-- disease mim: string (nullable = true)
 |-- confidence category: string (nullable = true)
 |-- allelic requirement: string (nullable = true)
 |-

In [10]:
# Load the old and the new release of the skin panel. Both frames are
# persisted because they are queried repeatedly in the following cells.
old_skin = (
    spark.read.csv('/Users/dsuveges/project_data/gene2phenotype/old/SkinG2P-2021-11-02.csv.gz', header=True)
    .persist()
)

new_skin = (
    spark.read.csv('/Users/dsuveges/project_data/gene2phenotype/new/SkinG2P.csv.gz', header=True)
    .persist()
)

# Show the first record of each release, one field per line.
# show() prints the rows itself and returns None, so it must not be wrapped
# in print() (that only emitted an extra "None" line).
old_skin.show(1, vertical=True, truncate=False)
new_skin.show(1, vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gene symbol                  | RAB27A                                                                                                                                                               
 gene mim                     | 603868                                                                                                                                                               
 disease name                 | Griscelli Type 2                                                                                                                                                     
 disease mim                  | No disease mim                                                                                                                                                       
 DDD categ

### Fields:

| Old | New | Description |
|:----|:----|:------------|
|gene symbol | gene symbol | HGNC gene symbol | 
|gene mim | gene mim | gene identifier in the OMIM database |
|disease name | disease name| disease name in the OMIM database |
|disease mim | disease mim | disease identifier in the OMIM database |
|DDD category | confidence category | confidence category of the annotation |
|allelic requirement | allelic requirement | allelic requirement | 
|mutation consequence | mutation consequence | consequence |
|phenotypes | phenotypes | list of phenotype annotations from the Human Phenotype Ontology |
|organ specificity list| organ specificity list| |
|pmids | pmids | list of pmids |
| panel | panel | |
|prev symbols | prev symbols | |
|hgnc id | hgnc id | gene identifier in the hgnc database |
|gene disease pair entry date | gene disease pair entry date ||
| | cross cutting modifier | |
| | mutation consequence flag | |


* `cross cutting modifier`: only six evidence entries have this annotation in the skin dataset.
    * `requires heterozygosity`:
    * `typically mosaic `:
    * `imprinted`:
* `mutation consequence flag`: 343 non-null rows,
    * `restricted repertoire of mutations`

In [18]:
# Distinct rows of the new skin panel that carry a cross cutting modifier.
new_skin.where(~col('cross cutting modifier').isNull()).distinct().show()

+-----------+--------+--------------------+--------------+-------------------+--------------------+--------------------+--------------------+----------------------+--------------------+-----+-----------------+-------+----------------------------+----------------------+-------------------------+
|gene symbol|gene mim|        disease name|   disease mim|confidence category| allelic requirement|mutation consequence|          phenotypes|organ specificity list|               pmids|panel|     prev symbols|hgnc id|gene disease pair entry date|cross cutting modifier|mutation consequence flag|
+-----------+--------+--------------------+--------------+-------------------+--------------------+--------------------+--------------------+----------------------+--------------------+-----+-----------------+-------+----------------------------+----------------------+-------------------------+
|       PTEN|  601728|    PROTEUS SYNDROME|        176920|         definitive|monoallelic_autos...| absent gene 

In [24]:
# Unique non-null values of the mutation consequence flag in the new skin panel.
# Replace `.distinct().show(truncate=False)` with `.count()` to get the number
# of flagged rows instead.
(
    new_skin
    .select('mutation consequence flag')
    .where(col('mutation consequence flag').isNotNull())
    .distinct()
    .show(truncate=False)
)

+----------------------------------+
|mutation consequence flag         |
+----------------------------------+
|restricted repertoire of mutations|
+----------------------------------+



### Rebuilding the evidence parser 

Let's see how the data can be loaded into a single dataframe:


In [43]:
# Explicit schema of the new Gene2Phenotype files. The field order follows the
# column order of the csv header; every field is nullable.
gene2phenotype_schema = StructType([
    StructField('gene symbol', StringType()),
    StructField('gene mim', IntegerType()),
    StructField('disease name', StringType()),
    StructField('disease mim', StringType()),
    StructField('confidence category', StringType()),
    StructField('allelic requirement', StringType()),
    StructField('mutation consequence', StringType()),
    StructField('phenotypes', StringType()),
    StructField('organ specificity list', StringType()),
    StructField('pmids', StringType()),
    StructField('panel', StringType()),
    StructField('prev symbols', StringType()),
    StructField('hgnc id', IntegerType()),
    StructField('gene disease pair entry date', TimestampType()),
    StructField('cross cutting modifier', StringType()),
    StructField('mutation consequence flag', StringType()),
])

# Read every new panel file into one dataframe using the explicit schema;
# persisted because the following cells query it several times.
full_df = spark.read.csv(
    list(map(paths['new'].format, file_types)),
    header=True,
    enforceSchema=True,
    schema=gene2phenotype_schema,
).persist()

# Row count across all panels, followed by the resulting schema.
print(full_df.count())
full_df.printSchema()

3898
root
 |-- gene symbol: string (nullable = true)
 |-- gene mim: integer (nullable = true)
 |-- disease name: string (nullable = true)
 |-- disease mim: string (nullable = true)
 |-- confidence category: string (nullable = true)
 |-- allelic requirement: string (nullable = true)
 |-- mutation consequence: string (nullable = true)
 |-- phenotypes: string (nullable = true)
 |-- organ specificity list: string (nullable = true)
 |-- pmids: string (nullable = true)
 |-- panel: string (nullable = true)
 |-- prev symbols: string (nullable = true)
 |-- hgnc id: integer (nullable = true)
 |-- gene disease pair entry date: timestamp (nullable = true)
 |-- cross cutting modifier: string (nullable = true)
 |-- mutation consequence flag: string (nullable = true)



* `mutation consequence flag`: 343 non-null rows, 2 different unique values
* `cross cutting modifier`: 32 non-null rows, 4 unique values

In [51]:
# Number of rows across all panels that have a cross cutting modifier.
(
    full_df
    .where(col('cross cutting modifier').isNotNull())
    .select('cross cutting modifier')
    .count()
)

32

In [60]:
%%bash

# Print the header of the new Cancer panel, one column name per line.
# `gzip -dc` replaces `gzcat`, which is not installed on every platform.
gzip -dc /Users/dsuveges/project_data/gene2phenotype/new/CancerG2P.csv.gz | head -n 1 | tr ',' '\n'


# Optional check, kept disabled:
# zgrep "Confidence value flag" /Users/dsuveges/project_data/gene2phenotype/new/*gz

"gene symbol"
"gene mim"
"disease name"
"disease mim"
"confidence category"
"allelic requirement"
"mutation consequence"
phenotypes
"organ specificity list"
pmids
panel
"prev symbols"
"hgnc id"
"gene disease pair entry date"
"cross cutting modifier"
"mutation consequence flag"


In [65]:
# Unique non-null confidence categories across all panels.
(
    full_df
    .select('confidence category')
    .where(col('confidence category').isNotNull())
    .distinct()
    .show(truncate=False)
)

+-------------------+
|confidence category|
+-------------------+
|definitive         |
|both RD and IF     |
|strong             |
|limited            |
+-------------------+

