In [42]:

import json
from json import JSONDecodeError

import requests
from functools import reduce
import pandas as pd
import pyspark.sql.functions as F

from pyspark.sql.types import (
    FloatType, ArrayType, StructType, StructField, BooleanType, StringType, IntegerType
)
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from collections import defaultdict
from pyspark.context import SparkContext

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

In [2]:
import requests 

url = 'https://www.ebi.ac.uk/pdbe/graph-api/mappings/ensembl/1dqa'


In [4]:
data = requests.get(url).json()


In [7]:
df = (
    spark.createDataFrame(data['1dqa']['Ensembl']['ENSG00000113161']['mappings'])
    .persist()
)

df.count()
df.printSchema()

root
 |-- accession: string (nullable = true)
 |-- chain_id: string (nullable = true)
 |-- coverage: double (nullable = true)
 |-- end: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)
 |-- entity_id: long (nullable = true)
 |-- exon_id: string (nullable = true)
 |-- genome_end: long (nullable = true)
 |-- genome_start: long (nullable = true)
 |-- ordinal: long (nullable = true)
 |-- start: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)
 |-- struct_asym_id: string (nullable = true)
 |-- transcript_id: string (nullable = true)
 |-- translation_id: string (nullable = true)
 |-- unp_end: long (nullable = true)
 |-- unp_start: long (nullable = true)



In [10]:
df.show(1, False, True)



-RECORD 0---------------------------------------------------------------------------------------------
 accession      | P04035-3                                                                            
 chain_id       | A                                                                                   
 coverage       | 0.533                                                                               
 end            | {author_residue_number -> 456, residue_number -> 35, author_insertion_code -> null} 
 entity_id      | 1                                                                                   
 exon_id        | ENSE00000753591                                                                     
 genome_end     | 75351601                                                                            
 genome_start   | 75351508                                                                            
 ordinal        | 1                                                      

In [15]:
dfA= (
    df.filter(
        (col('chain_id') == 'A') 
        & (col('accession') == 'P04035')
        & (col('unp_end') >= 722)
        & (col('unp_start') <= 722)
    )
    .persist()
)
dfA.count()


6

In [16]:
dfA.show(100)

+---------+--------+--------+--------------------+---------+---------------+----------+------------+-------+--------------------+--------------+---------------+---------------+-------+---------+
|accession|chain_id|coverage|                 end|entity_id|        exon_id|genome_end|genome_start|ordinal|               start|struct_asym_id|  transcript_id| translation_id|unp_end|unp_start|
+---------+--------+--------+--------------------+---------+---------------+----------+------------+-------+--------------------+--------------+---------------+---------------+-------+---------+
|   P04035|       A|     1.0|{author_residue_n...|        1|ENSE00003465683|  75359308|    75359170|      1|{author_residue_n...|             A|ENST00000343975|ENSP00000340816|    766|      720|
|   P04035|       A|     1.0|{author_residue_n...|        1|ENSE00003465683|  75359308|    75359170|      1|{author_residue_n...|             A|ENST00000511206|ENSP00000426745|    766|      720|
|   P04035|       A|     

In [18]:
import yaml
from yaml.loader import SafeLoader

# Open the file and load the file
with open('readme.yaml') as f:
    data = yaml.load(f, Loader=SafeLoader)
    print(data)

{'projectId': 'OTAR033', 'projectDescription': 'Keratinocytes', 'releaseDate': datetime.date(2021, 6, 10), 'releaseVersion': 'v1.0', 'changes': ['This thime, a new dataset was released based on experiments on a new cell line.', 'Also, a previous dataset was updated as we have re-analysed the experimental data.']}


In [21]:
def crypt(direction, text):
    '''
    Encrypt or decrypt a string
 
    Parameters:
    direction (string): Cryptographic direction (encrypt or decrypt)
    text (string): Input text
 
    Returns:
    string: Output text
    '''
    code_book = [
        ['a', '@'],
        ['A', '@'],
        ['h', '#'],
        ['H', '#']
    ]
    if direction == 'encrypt':
        char_from = 0
        char_to = 1
    elif direction == 'decrypt':
        char_from = 1
        char_to = 0
    for code_char in code_book:
        if code_char[char_from] in text:
            text = text.replace(code_char[char_from], code_char[char_to])
    return text
 

direction_choice = input('Encrypt or Decrypt (E/D): ')
while (
    direction_choice != 'e' and
    direction_choice != 'E' and
    direction_choice != 'd' and
    direction_choice != 'D'
):
    print('Wrong choice!')
    direction_choice = input('Encrypt or Decrypt (E/D): ')

if direction_choice == 'e' or direction_choice == 'E':
    input_text = input('Unencrypted text: ')
    print('Encrypted text: ' + crypt('encrypt', input_text))
elif direction_choice == 'd' or direction_choice == 'D':
    input_text = input('Encrypted text: ')
    print('Decrypted text: ' + crypt('decrypt', input_text))
 

Encrypt or Decrypt (E/D): E
Unencrypted text: firmos cica hajj
Encrypted text: firmos cic@ #@jj


In [23]:
from decimal import Decimal

def get_exponent(number: float) -> int:
    """Get the exponent of a number."""
    (sign, digits, exponent) = Decimal(number).as_tuple()
    return len(digits) + exponent - 1

def get_mantissa(number: float) -> float:
    """Get the mantissa of a number."""
    return float(Decimal(number).scaleb(-get_exponent(number)).normalize())

In [29]:
burden_evidence_sets = [
    # Generate evidence from regeneron data:
    process_regeneron_gene_burden(regeneron_data, gwas_studies, spark_instance=spark_instance),
    
    # Generate evidence from AZ data:
    process_az_gene_burden(az_binary_data, az_quant_data, spark_instance=spark_instance)
]

burden_evidence = reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), burden_evidence_sets)



DecimalTuple(sign=0, digits=(1, 2, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 6, 1, 1, 5, 4, 0, 3, 8, 0, 2, 1, 2, 2, 6, 1, 4, 2, 5, 1, 8, 5, 7, 1, 5, 7, 7, 9, 6, 6, 2, 1, 3, 2, 2, 6, 3, 1, 8, 3, 5, 9, 3, 7, 5), exponent=-65)

In [51]:
from functools import reduce

label_map = {
    'cica': 'A',
    'kutya': 'B',
    'pocok': 'C'
}

(
    spark.createDataFrame([
        {'a': 0.00000012, 'b': 'cica'},
        {'a': 0.000001012, 'b': 'kutya'},
        {'a': 0.000000000052, 'b': 'kutya'},
        {'a': 0.008, 'b': 'cica'},
        {'a': 0.0000000002, 'b': 'pocok'}
    ])
    .withColumn('exponent', F.log10(F.col('a')).cast(IntegerType()) - lit(1))
    .withColumn('mantissa', F.col('a') / F.pow(F.lit(10), col('exponent')))
    .replace(to_replace=label_map, subset=['b'])
    .show()
)

+--------+---+--------+-----------------+
|       a|  b|exponent|         mantissa|
+--------+---+--------+-----------------+
|  1.2E-7|  A|      -7|              1.2|
|1.012E-6|  B|      -6|            1.012|
| 5.2E-11|  B|     -11|5.199999999999999|
|   0.008|  A|      -3|              8.0|
| 2.0E-10|  C|     -10|              2.0|
+--------+---+--------+-----------------+



In [34]:
from math import log10, exp



In [39]:
n = 0.00013

exponent = int(log10(n)) + 1
mantissa = n / (10**exponent)

0.013

In [38]:
get_exponent(n)

-4