# Updating variant table for OT Genetics Portal

**Code base:** [genetics-variant-annotation](https://github.com/opentargets/genetics-variant-annotation)

## Figure out hail

1. Which version should we use? -> Hail 0.2, using the build that matches the available Spark version (2.4).

Get the corresponding hash:

In [3]:
%%bash
# Stop at the first failing command so that a failed gsutil call is reported
# by the cell instead of being masked by the echo below.
set -euo pipefail

# Read the hash stored in the latest-hash file of the Hail 0.2 builds.
HASH=$(gsutil cat gs://hail-common/builds/0.2/latest-hash/cloudtools-5-spark-2.4.0.txt)
echo "$HASH"

f3a54b5309797140ecd15811834132d9e1fafedf


In [6]:
def parse_population_keys(pop_list):
    """Select the population names that belong to the gnomAD aggregate sets.

    A name is retained only when it starts with ``gnomad_`` and does not end
    with ``_raw``, ``_male`` or ``_female``.

    Params:
        pop_list (list of str): candidate population names.
    Returns:
        list of str: the retained names, in their original order.
    """
    # str.endswith accepts a tuple and matches when any suffix matches.
    excluded_suffixes = ('_raw', '_male', '_female')
    return [
        name
        for name in pop_list
        if name.startswith('gnomad_') and not name.endswith(excluded_suffixes)
    ]





In [17]:
import hail as hl
import sys

# Args
# Release label; it becomes a path component of both output locations below.
version = '2021.07.19'

# Gnomad hail table:
hail_table = 'gs://gcp-public-data--gnomad/release/3.1.1/ht/genomes/gnomad.genomes.v3.1.1.sites.ht'

# Grch38 to 37 chainfile:
chain_file = 'gs://hail-common/references/grch38_to_grch37.over.chain.gz'

# Output (both files live directly under the same versioned folder; a doubled
# slash would create an extra, empty-named path component on GCS):
out_parquet = f'gs://ot-team/dsuveges/variant_table/{version}/variant-annotation.parquet'
out_sitelist = f'gs://ot-team/dsuveges/variant_table/{version}/variant-annotation.sitelist.tsv.gz'

# Number of partitions used when writing the Spark dataframe.
out_partitions = 256
# Minimum minor allele frequency a variant must exceed in at least one population.
maf_threshold = 0.001 # 0.1%


In [19]:
# Work on a small head of the table while developing.
ht = hl.read_table(hail_table).head(10_000)

# Count the unfiltered table only once and reuse the number for both messages:
# every count() call evaluates the table again.
n_total = ht.count()
print('Total number of rows: ', n_total)

# Keep only the variants whose `filters` set is empty:
print('Variants pre-filtering: ', n_total)
ht = ht.filter(ht.filters.length() == 0)
print('Variants post-quality filter: ', ht.count())


# Population of interest (keys looked up in the table's freq_index_dict global):
populations = {
    'afr-adj',
    'amr-adj',
    'ami-adj',
    'asj-adj',
    'eas-adj',
    'fin-adj',
    'nfe-adj',
    'mid-adj',
    'sas-adj',
    'oth-adj'
}


def af_to_maf(af):
    """Fold an allele frequency onto the minor allele: af if af <= 0.5, else 1 - af."""
    return hl.if_else(af <= 0.5, af, 1 - af)


# Map every selected population to its position in the `freq` array:
population_indices = ht.globals.freq_index_dict.collect()[0]
population_indices = {pop: index for pop, index in population_indices.items() if pop in populations}

# Adding population allele frequency and minor allele frequency:
ht = ht.annotate(
    # Generate struct for alt. allele frequency in selected populations:
    af = hl.struct(**{pop: ht.freq[index].AF for pop, index in population_indices.items()}),

    # Generate struct for minor allele frequency for selected populations:
    maf = hl.struct(**{pop: af_to_maf(ht.freq[index].AF) for pop, index in population_indices.items()}),

    # Generate an _array_ with maf values for further filtering:
    maf_values = hl.array([af_to_maf(ht.freq[index].AF) for index in population_indices.values()])
)

# Applying maf threshold: keep a variant when its largest MAF across the
# selected populations exceeds the threshold.
ht = ht.filter(hl.max(ht.maf_values) > maf_threshold)
print(f'Number of variants after applying MAF filter: {ht.count()}')

# These liftover issues are not yet fully convincing:
grch37 = hl.get_reference('GRCh37')
grch38 = hl.get_reference('GRCh38')
# Register the chain file only once per session so the cell can be re-run
# (and can coexist with other cells registering the same liftover).
if not grch38.has_liftover('GRCh37'):
    grch38.add_liftover(chain_file, grch37)

# Liftover (kept disabled in this cell):
# ht = ht.annotate(
#     locus_GRCh37 = hl.liftover(ht.locus, 'GRCh37')
# )

FatalError: SocketTimeoutException: connect timed out

Java stack trace:
java.io.IOException: Error getting access token from metadata server at: http://169.254.169.254/computeMetadata/v1/instance/service-accounts/default/token
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.util.CredentialFactory.getCredentialFromMetadataServiceAccount(CredentialFactory.java:246)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.util.CredentialFactory.getCredential(CredentialFactory.java:462)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.getCredential(GoogleHadoopFileSystemBase.java:1443)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.createGcsFs(GoogleHadoopFileSystemBase.java:1510)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.configure(GoogleHadoopFileSystemBase.java:1486)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:541)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:494)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
	at is.hail.io.fs.HadoopFS.fileStatus(HadoopFS.scala:164)
	at is.hail.io.fs.FS.isDir(FS.scala:175)
	at is.hail.io.fs.FS.isDir$(FS.scala:173)
	at is.hail.io.fs.HadoopFS.isDir(HadoopFS.scala:70)
	at is.hail.expr.ir.RelationalSpec$.readMetadata(AbstractMatrixTableSpec.scala:30)
	at is.hail.expr.ir.RelationalSpec$.readReferences(AbstractMatrixTableSpec.scala:68)
	at is.hail.variant.ReferenceGenome$.fromHailDataset(ReferenceGenome.scala:596)
	at is.hail.variant.ReferenceGenome.fromHailDataset(ReferenceGenome.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)

java.net.SocketTimeoutException: connect timed out
	at java.net.PlainSocketImpl.socketConnect(Native Method)
	at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
	at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
	at java.net.Socket.connect(Socket.java:589)
	at sun.net.NetworkClient.doConnect(NetworkClient.java:175)
	at sun.net.www.http.HttpClient.openServer(HttpClient.java:432)
	at sun.net.www.http.HttpClient.openServer(HttpClient.java:527)
	at sun.net.www.http.HttpClient.<init>(HttpClient.java:211)
	at sun.net.www.http.HttpClient.New(HttpClient.java:308)
	at sun.net.www.http.HttpClient.New(HttpClient.java:326)
	at sun.net.www.protocol.http.HttpURLConnection.getNewHttpClient(HttpURLConnection.java:1202)
	at sun.net.www.protocol.http.HttpURLConnection.plainConnect0(HttpURLConnection.java:1138)
	at sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:1032)
	at sun.net.www.protocol.http.HttpURLConnection.connect(HttpURLConnection.java:966)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.http.javanet.NetHttpRequest.execute(NetHttpRequest.java:148)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.http.javanet.NetHttpRequest.execute(NetHttpRequest.java:84)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.http.HttpRequest.execute(HttpRequest.java:1012)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.util.CredentialFactory$ComputeCredentialWithRetry.executeRefreshToken(CredentialFactory.java:184)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.auth.oauth2.Credential.refreshToken(Credential.java:494)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.util.CredentialFactory.getCredentialFromMetadataServiceAccount(CredentialFactory.java:243)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.util.CredentialFactory.getCredential(CredentialFactory.java:462)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.getCredential(GoogleHadoopFileSystemBase.java:1443)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.createGcsFs(GoogleHadoopFileSystemBase.java:1510)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.configure(GoogleHadoopFileSystemBase.java:1486)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:541)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:494)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
	at is.hail.io.fs.HadoopFS.fileStatus(HadoopFS.scala:164)
	at is.hail.io.fs.FS.isDir(FS.scala:175)
	at is.hail.io.fs.FS.isDir$(FS.scala:173)
	at is.hail.io.fs.HadoopFS.isDir(HadoopFS.scala:70)
	at is.hail.expr.ir.RelationalSpec$.readMetadata(AbstractMatrixTableSpec.scala:30)
	at is.hail.expr.ir.RelationalSpec$.readReferences(AbstractMatrixTableSpec.scala:68)
	at is.hail.variant.ReferenceGenome$.fromHailDataset(ReferenceGenome.scala:596)
	at is.hail.variant.ReferenceGenome.fromHailDataset(ReferenceGenome.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)




Hail version: 0.2.72-cfce5e858cab
Error summary: SocketTimeoutException: connect timed out

```
Variants pre-filtering:  100000
Variants post-quality filter:  25468=========================>    (13 + 1) / 14]
```

Of the 100,000 variants, only about 25k (25,468) remain after applying the quality filter.

In [14]:
# Register the GRCh38 -> GRCh37 chain file on the GRCh38 reference.
grch37 = hl.get_reference('GRCh37')
grch38 = hl.get_reference('GRCh38')
# Guarded so that re-running the cell, or running it after another cell that
# already registered the liftover in this session, does not fail.
if not grch38.has_liftover('GRCh37'):
    grch38.add_liftover(chain_file, grch37)

# Liftover
ht = ht.annotate(
    locus_GRCh37 = hl.liftover(ht.locus, 'GRCh37')
)

# Adding build specific coordinates to the table:
ht = ht.annotate(
    chrom_b38 = ht.locus.contig,
    pos_b38 = ht.locus.position,
    # GRCh37 coordinates are currently disabled:
#     chrom_b37 = ht.locus_GRCh37.contig.replace('chr', ''),
#     pos_b37 = ht.locus_GRCh37.position,
    ref = ht.alleles[0],
    alt = ht.alleles[1]
)

# Selecting relevant column: expose the CADD raw score as `raw` and drop the
# `has_duplicate` field.
ht = ht.annotate(
    cadd = ht.cadd.rename({'raw_score': 'raw'}).drop('has_duplicate')
)

SyntaxError: invalid syntax (<ipython-input-14-6e3b7cb13571>, line 1)

In [None]:
# Drop all globals
ht = ht.select_globals()

# Drop unnecessary VEP fields and keep the original locus under an explicit name
ht = ht.annotate(
    vep = ht.vep.drop(
        'assembly_name', # OK
        'allele_string',
        'ancestral',
#         'colocated_variants', # <- not OK
        'context',
        'end',
        'id',
        'input',
        'intergenic_consequences',
        'seq_region_name',
        'start',
        'strand',
        'variant_class'
    ),
    locus_GRCh38 = ht.locus
)

# Sort columns
col_order = ['locus_GRCh38',
#              'chrom_b37', 'pos_b37',
             'chrom_b38', 'pos_b38',
             'ref', 'alt', 'allele_info', 'vep', 'rsid', 'af', 'cadd']
ht = ht.select(*col_order)

# Persist as writing twice would cause re-computation
ht = ht.persist()

# Repartition and write parquet file
(
    ht.to_spark(flatten=False)
      .repartition(out_partitions)
      .write.parquet(out_parquet)
)

# Export site list
# The GRCh37 columns (chrom_b37, pos_b37) are not part of col_order while the
# b37 annotation is disabled, so they no longer exist on the table at this
# point; selecting them here would make the export fail.
cols = ['chrom_b38', 'pos_b38', 'ref', 'alt', 'rsid']
(
    ht.select(*cols)
      .export(out_sitelist)
)

In [None]:
# Write the same Spark dataframe as JSON; the target path is the parquet
# path with 'parquet' substituted by 'json'.
ht.to_spark(flatten=False).repartition(out_partitions).write.json(
    out_parquet.replace('parquet', 'json')
)

In [25]:
import json

json_string = '{"header": "Ez valami header", "body": "Valami", "footer": "Footer, amit eldobunk"}'
data = json.loads(json_string)

# Discard the unwanted keys; pop() with a default silently skips absent keys.
for dropped_key in ('footer', 'header'):
    data.pop(dropped_key, None)

print(data)

{'body': 'Valami'}


In [27]:
import socket
import time
 
# Address and port the client connects to.
HOST = "127.0.0.1"
PORT = 9999
# Maximum number of bytes requested from a single recv() call.
PACKET_SIZE = 110000
 
class Client():
    """Small TCP client that fetches one payload and extracts its status field."""

    def __init__(self):
        # Bytes returned by the latest receive_data() call (None until then).
        self.raw_data = None
        # Value extracted by the latest deserialize_data() call (None until then).
        self.data = None

    def receive_data(self):
        """Connect to (HOST, PORT) and store one stripped recv() result in raw_data."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as conn:
            conn.connect((HOST, PORT))
            # NOTE(review): a single recv() may return only part of what the
            # server sent; confirm messages always fit in one read.
            self.raw_data = conn.recv(PACKET_SIZE).strip()

    def deserialize_data(self):
        """Parse raw_data as JSON and store its ["data"]["status"] entry in data.

        The payload arrives as bytes, so it has to be decoded from JSON before
        it can be indexed by key; indexing the bytes directly raises TypeError.

        Raises:
            ValueError: if nothing has been received yet, the payload is empty,
                or the payload is not valid JSON.
            KeyError: if the decoded payload lacks "data" or "status".
        """
        if not self.raw_data:
            raise ValueError("No data received; call receive_data() first.")
        # Assumes the server sends a JSON object shaped like
        # {"data": {"status": ...}} - TODO confirm against the server.
        payload = json.loads(self.raw_data)
        self.data = payload["data"]["status"]

client = Client()
 
# Poll forever: wait two seconds, fetch a payload, decode it and print the
# result. The loop has no exit condition, so it runs until it is interrupted.
while True:
    time.sleep(2)
    client.receive_data()
    client.deserialize_data()
    print(client.data)

KeyboardInterrupt: 