# Variant annotation

The scope of this notebook is to annotate a list of random variants with both GRCh37 and GRCh38 data. The data is sourced from gnomAD 3.1. The list of variants was provided by David. These variants were looked up on the Genetics Portal site, but there was seemingly not much point in those queries.

1. Read data as pyspark dataframe.
2. Parse variants. 
3. Create hail table.
4. Lift over to the old build.

In [12]:
from pyspark.sql.functions import (
    col, array, struct, concat, lit, split, element_at, when,length
)
from pyspark.sql.types import StructField, IntegerType
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

import hail as hl

# Spark/Hadoop options for the GCS connector's requester-pays mode; the project
# id names the project used for requester-pays access.
spark_settings = {
    'spark.hadoop.fs.gs.requester.pays.mode': 'AUTO',
    'spark.hadoop.fs.gs.requester.pays.project.id': 'open-targets-eu-dev',
}

# Hail session needs to be initialized BEFORE spark initialization: hl.init()
# starts the Spark context and the builder below then re-uses it.
# The settings are therefore handed to hl.init() directly - `spark.hadoop.*`
# options given only to SparkSession.builder are not guaranteed to reach a
# Spark context that already exists.
hl.init(spark_conf=spark_settings)

# Same settings as a SparkConf object, kept for the session builder below.
sparkConf = SparkConf().setAll(list(spark_settings.items()))

# establish spark connection (getOrCreate returns the already running session)
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)




In [17]:
# Reading the variant list (one variant id per row, no header line).
# NOTE(review): the recorded run of this cell failed with
# "Incomplete HDFS URI, no host" for this path - confirm the location.
variant_list_path = 'hdfs:///mydata/qfuJ67V5'

variant_ids = (
    spark.read.csv(variant_list_path, header=False)
    .withColumnRenamed('_c0', 'variantId')
)

# Parsing variant details: the id is split on '_' and the pieces are read by
# position (1 = chromosome, 2 = position, 3 = ref allele, 4 = alt allele).
id_parts = col('details')
parsed_variants = (
    variant_ids
    .withColumn('details', split(col('variantId'), '_'))
    .select(
        '*',
        element_at(id_parts, 2).cast(IntegerType()).alias('pos'),
        element_at(id_parts, 3).alias('ref'),
        element_at(id_parts, 4).alias('alt')
    )
    # Rows whose position could not be cast to an integer are dropped:
    .filter(col('pos').isNotNull())
)

# Chromosome as written in the variant id:
raw_chrom = element_at(id_parts, 1)

# Chromosome label with the 'chr' prefix:
chrom_label = (
    # The x chromosome is written as 23 in FINNGEN:
    when(raw_chrom == '23', lit('chrX'))

    # Labels of one or two characters get the prefix ...
    .when(length(raw_chrom).between(1, 2), concat(lit('chr'), raw_chrom))

    # ... any other length is set to null (the row itself is kept):
    .otherwise(lit(None))
)

variants_with_chrom = (
    parsed_variants
    .withColumn('chrom', chrom_label)
    .drop('details')
    .persist()
)

# Converting the spark dataframe to a hail table:
b38variants = hl.Table.from_spark(variants_with_chrom)

b38variants.show()

Py4JJavaError: An error occurred while calling o265.csv.
: java.io.IOException: Incomplete HDFS URI, no host: hdfs:///mydata/qfuJ67V5
	at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:170)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:46)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:377)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:325)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:307)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:307)
	at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:795)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)


In [None]:

# Position cast to int32 (per the original note, int64 is not accepted):
variants_int32 = b38variants.annotate(pos=hl.int32(b38variants.pos))

# Adding the locus (GRCh38 coordinates) and the [ref, alt] alleles array:
variants_located = variants_int32.annotate(
    locus=hl.locus(
        variants_int32.chrom,
        variants_int32.pos,
        reference_genome='GRCh38'
    ),
    alleles=hl.array([variants_int32.ref, variants_int32.alt])
)

# Keying the table by locus and alleles, then dropping the columns that were
# only needed to build the key:
b38variants = (
    variants_located
    .key_by('locus', 'alleles')
    .drop('chrom', 'pos', 'alt', 'ref')
)

b38variants.show()

In [None]:
# Fields listed for extraction:
#   - vep.most_severe_consequence
#   - rsid
#   - clinvar.clinvar_disease_name  (listed here, but not selected below)

# Location of the gnomAD v3.1.1 genomes sites hail table:
gnomad_file = 'gs://gcp-public-data--gnomad/release/3.1.1/ht/genomes/gnomad.genomes.v3.1.1.sites.ht'

# Load data:
gnomad_table = hl.read_table(gnomad_file)

# Keeping only the rsid and the most severe VEP consequence next to the key:
gnomad_subset = gnomad_table.select(
    'rsid',
    gnomad_table.vep.most_severe_consequence
)

# Right join: every row of b38variants is kept, whether or not gnomAD has a
# matching entry.
b38_annotated = gnomad_subset.join(b38variants, how='right').persist()

b38_annotated.show()