# Step 2: Define Connection Masks

In [None]:
## Connectivity Matrices
# The connections between the layers are defined by the connectivity matrix.

# The matrix has the shape of (N_nodes_layer_1, N_nodes_layer_2).
# It is a sparse matrix with zeros where there is no connection and ones where there is a connection. For example:


#             output
#           1 2 3 4 5
# input 1 | 1 0 0 0 0 |
# input 2 | 1 1 0 0 0 |
# input 3 | 0 1 0 0 0 |
# input 4 | 0 1 0 0 0 |
# input 5 | 0 0 1 0 0 |
# input 6 | 0 0 0 1 0 |
# input 7 | 0 0 0 1 0 |


# This connects the first two inputs (1,2) to the first neuron in the second layer.
# Connects input 2,3 and 4 to output neuron 2.
# Connects input 5 to output neuron 3
# Connects input 6 and 7 to the 4th neuron in the subsequent layer
# Connects nothing to the 5th neuron
#
# Check the alternative define connections notebook for SNPs connecting to multiple genes.
#
#imports & paths

import glob
import itertools
import os
import sys

import h5py
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import tables
import tqdm
from scipy import stats

# Project root: the current working directory with its last four characters removed.
# NOTE(review): presumably this strips a three-letter notebook folder such as "/run" — confirm
# before running the notebook from a differently named directory.
basepath = os.getcwd()[:-4]

# Processed data is read from and written to the same folder.
savepath = f"{basepath}/processed_data/"
hasepath = savepath

# The study name is stored on disk as a numpy array; convert it to a plain string.
studyname = str(np.load(f"{savepath}studyname.npy"))

## Generate Input files for Annovar

In [None]:
# Build the tab-separated Annovar input file from the probe table.
# The probe table stores both alleles as hashed keys; the hash table maps each
# key (column 'keys') back to the allele string (column 'allele').
probes = pd.read_hdf(hasepath + '/probes/'+studyname+'.h5')
print(probes.shape)

# Read the hash table once, preferring the gzipped file when it exists.
hash_table_path = hasepath +'/probes/'+studyname+'_hash_table.csv'
if os.path.exists(hash_table_path + '.gz'):
    hashtable = pd.read_csv(hash_table_path + '.gz', compression="gzip", sep='\t')
else:
    hashtable = pd.read_csv(hash_table_path, sep='\t')


def unhash_allele(frame, hashtable, column):
    """Replace the hashed values in `column` by the matching 'allele' string.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose `column` holds values that occur in ``hashtable['keys']``.
    hashtable : pandas.DataFrame
        Lookup table with at least the columns 'keys' and 'allele'. Not modified.
    column : str
        Name of the column to translate (e.g. 'allele1').

    Returns
    -------
    pandas.DataFrame
        New frame in which `column` contains the allele strings. Rows without a
        matching key get NaN (left merge).
    """
    lookup = hashtable.copy()
    lookup[column] = lookup['keys']
    merged = frame.merge(lookup, on=column, how="left")
    return merged.drop(columns=["keys", column]).rename(columns={'allele': column})


unhashed_probes = unhash_allele(probes, hashtable, 'allele1')
unhashed_probes = unhash_allele(unhashed_probes, hashtable, 'allele2')

# index_col below is the row position of each variant; duplicated keys in the
# hash table would duplicate rows in the merge and silently shift these positions.
assert len(unhashed_probes) == len(probes), "hash table contains duplicate keys: merge changed the number of variants"

# Annovar expects: chromosome, start, end, ref allele, alt allele, (free columns).
# Start and end are identical for SNPs; index_col lets us join the annotation back later.
annovar_input = unhashed_probes.assign(
    bp2=unhashed_probes["bp"],
    index_col=unhashed_probes.index,
)[['CHR', 'bp', "bp2", "allele1", "allele2", "index_col"]]

print('Number of variants',annovar_input.shape)

annovar_input_path = savepath + '/annovar_input_'+studyname+'.csv'
annovar_input.to_csv(annovar_input_path,sep="\t", index=False, header = False)
annovar_input.head()


#### The input should have this structure:
![title](figures/annovar_input_example.png)

## Run Annovar

In [None]:
# Print the manual steps for annotating the variants with Annovar.
# The input/output locations are derived from the notebook's save path and study name.
annovar_in = f"{savepath}/annovar_input_{studyname}.csv"
annovar_out = f"{savepath}/{studyname}_RefGene"

for instruction in (
    "install annovar: https://doc-openbio.readthedocs.io/projects/annovar/en/latest/user-guide/download/",
    "Navigate to annovar, e.g cd /home/charlesdarwin/annovar/",
    "Update annovar: annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene humandb/",
    "Run: perl annotate_variation.pl -geneanno -dbtype refGene -buildver hg19 "
    + annovar_in + " humandb --outfile " + annovar_out,
):
    print(instruction)

## Create mask from gene annotations

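Here we create the mask for the gene layer. Note that each annotated variant is assigned to a single gene here: when Annovar lists several genes for a variant, only the first one is kept.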

In [None]:
# Load the gene annotation written by Annovar (tab-separated, no header row).
variant_function_file = f"{savepath}{studyname}_RefGene.variant_function"
gene_annotation = pd.read_csv(variant_function_file, sep="\t", header=None)
print(gene_annotation.shape)
gene_annotation.head()

In [None]:
# Name the columns of the Annovar output; the last column is the index_col we
# wrote into the Annovar input, which links each annotation back to its variant.
gene_annotation.columns = ['into/exonic', 'gene', 'chr', 'bps', 'bpe', "mutation1" ,"mutation2" ,'index_col']

# Attach region type and gene name to every variant (left merge keeps unannotated variants).
annovar_annotated = annovar_input.merge(gene_annotation[['into/exonic', 'gene',"index_col"]], on='index_col', how = "left")
print("Number of Nulls",annovar_annotated["gene"].isnull().sum())
print("annotated shape:",annovar_annotated["gene"].shape)
print("number of unique genes:",len(gene_annotation["gene"].unique()))

# Reduce the annotation to a single gene name per variant:
#   1. remove parenthesised details, e.g. "GENE(dist=100)" -> "GENE"
#   2. remove everything from the first comma on, e.g. "GENE1,GENE2" -> "GENE1"
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the gene names unchanged.
annovar_annotated['gene'] = (
    annovar_annotated['gene']
    .str.replace(r"\(.*\)", "", regex=True)
    .str.replace(r",.*", "", regex=True)
)
annovar_annotated.head()

In [None]:
# Keep only variants that are annotated with a gene and lie on an autosome
# (CHR < 23), then drop every row that still contains a missing value.
# Adjust the masks below if you want to use other criteria; the optional
# per-SNP standard deviation can be loaded by enabling the next line.

# annovar_annotated["snp_std"] = np.load(savepath + studyname + '_std.npy')
has_gene = annovar_annotated['gene'] != "NONE"
is_autosomal = annovar_annotated['CHR'] < 23
annovar_annotated = annovar_annotated.loc[has_gene & is_autosomal].dropna()
print(annovar_annotated.shape)
annovar_annotated.head()

In [None]:
# Assign a consecutive integer id to every gene and a consecutive row index to
# every remaining variant; these are the coordinates of the SNP -> gene mask.

# Remove the columns created by a previous run of this cell so that re-running
# it does not produce gene_id_x / gene_id_y duplicates.
annovar_annotated = annovar_annotated.drop(columns=["gene_id", "tot_index"], errors="ignore")

# One row per gene (first occurrence), ordered by genomic position BEFORE the
# ids are handed out, so that gene ids increase along the genome and are
# contiguous within each chromosome.
gene_list = (
    annovar_annotated
    .drop_duplicates("gene")
    .sort_values(by=["CHR", "bp"], ascending=[True, True], kind="mergesort")
    .assign(gene_id=lambda genes: np.arange(len(genes)))
    .loc[:, ["gene", "gene_id"]]
)

# Attach the gene id to every variant and restore the original variant order.
annovar_annotated = (
    annovar_annotated
    .merge(gene_list, on="gene")
    .sort_values(by="index_col", ascending=True)
)
# Row position of each variant among the selected variants (0 .. n_variants-1).
annovar_annotated['tot_index'] = np.arange(len(annovar_annotated))
annovar_annotated.tail()

In [None]:
# Highest gene id found on each chromosome, with a leading 0 prepended,
# stored next to the other processed data.
last_gene_per_chr = annovar_annotated.groupby("CHR")["gene_id"].max()
gene_end = np.insert(last_gene_per_chr.values, 0, 0)
print(gene_end)
np.save(savepath + "gene_end", gene_end)

In [None]:
# Write the annotated variant table and a one-row-per-gene overview
# (first variant of each gene) as tab-separated files.
gene_overview = annovar_annotated.drop_duplicates("gene")
for table, filename in (
    (annovar_annotated, 'annovar_annotated.csv'),
    (gene_overview, 'gene_overview.csv'),
):
    table.to_csv(savepath + filename, sep="\t", index=False)

In [None]:
# Mask including all variants based on the earlier criteria.
# Rows are variants (tot_index), columns are genes (gene_id); an entry is True
# where the variant is connected to the gene.

# Use the builtin bool: the np.bool alias was removed in NumPy 1.24.
data = np.ones(len(annovar_annotated), dtype=bool)
coord = ( annovar_annotated["tot_index"].values, annovar_annotated["gene_id"].values )
mask_shape = (len(annovar_annotated), int(annovar_annotated["gene_id"].max()) + 1)
SNP_gene_matrix = scipy.sparse.coo_matrix((data, coord), shape=mask_shape)
scipy.sparse.save_npz(savepath +'/SNP_gene_mask', SNP_gene_matrix)
SNP_gene_matrix