# Step 2: Define Connection Masks

In [1]:
## Connectivity Matrices
# The connections between the layers are defined by the connectivity matrix.

# The matrix has the shape of (N_nodes_layer_1, N_nodes_layer_2).
# It is a sparse matrix with zeros for no connections and ones if there is a connections. For example.


#             output
#           1 2 3 4 5
# input 1 | 1 0 0 0 0 |
# input 2 | 1 1 0 0 0 |
# input 3 | 0 1 0 0 0 |
# input 4 | 0 1 0 0 0 |
# input 5 | 0 0 1 0 0 |
# input 6 | 0 0 0 1 0 |
# input 7 | 0 0 0 1 0 |


# This connects the first two inputs (1,2) to the first neuron in the second layer.
# Connects input 2,3 and 4 to output neuron 2.
# Connects input 5 to output neuron 3
# Connects input 6 and 7 o the 4th neuron in the subsequent layer
# Connects nothing to the 5th neuron

#imports & paths

import numpy as np
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
import h5py
import tables
from scipy import stats
import glob
import itertools
import tables
import tqdm

basepath = os.getcwd()[:-4]
hasepath = basepath + "/hase/"
savepath = basepath + "/processed_data/"
studyname = str(np.load(savepath + "studyname.npy"))

## Generate Input files for Annovar

In [2]:
probes = pd.read_hdf(hasepath + '/probes/'+studyname+'.h5')
print(probes.shape)
probes.head()

if os.path.exists(hasepath +'/probes/'+studyname+'_hash_table.csv.gz'):
    hashtable = pd.read_csv(hasepath +'/probes/'+studyname+'_hash_table.csv.gz', compression="gzip", sep='\t')
else: 
    hashtable = pd.read_csv(hasepath +'/probes/'+studyname+'_hash_table.csv', sep='\t')



hashtable['allele1']  = hashtable['keys']
unhashed_probes = probes.merge(hashtable, on='allele1', how = "left" )
unhashed_probes = unhashed_probes.drop(columns=["keys", "allele1"])
unhashed_probes = unhashed_probes.rename(columns = {'allele':'allele1'})

#reload hashtable for other allele

if os.path.exists(hasepath +'/probes/'+studyname+'_hash_table.csv.gz'):
    hashtable = pd.read_csv(hasepath +'/probes/'+studyname+'_hash_table.csv.gz', compression="gzip", sep='\t')
else: 
    hashtable = pd.read_csv(hasepath +'/probes/'+studyname+'_hash_table.csv', sep='\t')

hashtable['allele2']  = hashtable['keys']
unhashed_probes = unhashed_probes.merge(hashtable, on='allele2', how = "left")
unhashed_probes = unhashed_probes.drop(columns=["keys", "allele2"])
unhashed_probes = unhashed_probes.rename(columns = {'allele':'allele2'})

#clean up
annovar_input = unhashed_probes.drop(columns=["ID","distance"])
annovar_input["bp2"] = annovar_input["bp"]
annovar_input["index_col"] = annovar_input.index
annovar_input = annovar_input[['CHR', 'bp',"bp2","allele1","allele2","index_col"]]

print('Number of variants',annovar_input.shape)

annovar_input_path = savepath + '/annovar_input_'+studyname+'.csv'
annovar_input.to_csv(annovar_input_path,sep="\t", index=False, header = False)
annovar_input.head()


(823497, 6)
Number of variants (8590, 2)
(8590, 2)
(823497, 6)
(823497, 6)
/media/avanhilten/pSSD450/GenNet//processed_data//annovar_input_BulgarianTrio.csv


Unnamed: 0,CHR,bp,bp2,allele1,allele2,index_col
0,1,15903,15903,GC,G,0
1,1,63735,63735,C,CCTA,1
2,1,120994,120994,A,AAT,2
3,1,267227,267227,T,TTAA,3
4,1,267300,267300,TTA,T,4


## Run Annovar

In [3]:
print("install annovar: https://doc-openbio.readthedocs.io/projects/annovar/en/latest/user-guide/download/")
print("Navigate to annovar, e.g cd /home/charlesdarwin/annovar/")
print("Update annovar: annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene humandb/")
print("Run: perl annotate_variation.pl -geneanno -dbtype refGene -buildver hg19 "+str(savepath)+"/annovar_input_"+str(studyname)+".csv humandb --outfile "+str(savepath)+"/"+str(studyname)+"_RefGene")

Navigate to annovar (cd /home/avanhilten/annovar/)
Run: perl annotate_variation.pl -geneanno -dbtype refGene -buildver hg19 /media/avanhilten/pSSD450/GenNet//processed_data//annovar_input_BulgarianTrio.csv humandb --outfile /media/avanhilten/pSSD450/GenNet//processed_data//BulgarianTrio_RefGene


## Create mask from gene annotations

Here we create the mask for the gene layer. Note the layer here assigns all gees

In [4]:
gene_annotation = pd.read_csv(savepath +str(studyname)+"_RefGene.variant_function",sep='\t', header=None)
print(gene_annotation.shape)
gene_annotation.head()

(800465, 8)


Unnamed: 0,0,1,2,3,4,5,6,7
0,intergenic,"FAM138A(dist=27654),OR4F5(dist=5356)",1,63735,63735,C,CCTA,1
1,intergenic,"OR4F5(dist=50986),LOC729737(dist=13779)",1,120994,120994,A,AAT,2
2,intergenic,"LOC729737(dist=126661),LOC100132287(dist=56665)",1,267227,267227,T,TTAA,3
3,intergenic,"LOC100288069(dist=3358),FAM87B(dist=35325)",1,717426,717426,C,CT,5
4,intergenic,"LOC100288069(dist=31409),FAM87B(dist=7274)",1,745477,745477,T,C,6


In [5]:
gene_annotation.columns = ['into/exonic', 'gene', 'chr', 'bps', 'bpe', "mutation1" ,"mutation2" ,'index_col']
annovar_annotated = annovar_input.merge(gene_annotation[['into/exonic', 'gene',"index_col"]], on='index_col', how = "left")
print("Number of Nulls",annovar_annotated["gene"].isnull().sum())
print("annotated shape:",annovar_annotated["gene"].shape)
print("number of unique genes:",len(gene_annotation["gene"].unique()))
annovar_annotated['gene'] = annovar_annotated['gene'].str.replace(r"\(.*\)","",)
annovar_annotated['gene'] = annovar_annotated['gene'].str.replace(r"\,.*","",)
annovar_annotated.head()

Number of Nulls 23032
annotated shape: (823497,)
number of unique genes: 84483


Unnamed: 0,CHR,bp,bp2,allele1,allele2,index_col,into/exonic,gene
0,1,15903,15903,GC,G,0,,
1,1,63735,63735,C,CCTA,1,intergenic,FAM138A
2,1,120994,120994,A,AAT,2,intergenic,OR4F5
3,1,267227,267227,T,TTAA,3,intergenic,LOC729737
4,1,267300,267300,TTA,T,4,,


In [6]:
# Select annotated, autosomal and SNPs with std > 0. Adjust here if you want to use other criteria
annovar_annotated["snp_std"] = np.load(savepath + studyname + '_std.npy')
annovar_annotated  = annovar_annotated[(annovar_annotated['gene'] != "NONE")
                                       & (annovar_annotated['CHR'] < 23)
                                       & (annovar_annotated['snp_std'] > 0)]
annovar_annotated = annovar_annotated.dropna()
print(annovar_annotated.shape)
annovar_annotated.head()

(754132, 9)


Unnamed: 0,CHR,bp,bp2,allele1,allele2,index_col,into/exonic,gene,snp_std
1,1,63735,63735,C,CCTA,1,intergenic,FAM138A,0.352587
2,1,120994,120994,A,AAT,2,intergenic,OR4F5,0.16912
3,1,267227,267227,T,TTAA,3,intergenic,LOC729737,0.114758
5,1,717426,717426,C,CT,5,intergenic,LOC100288069,0.032759
6,1,745477,745477,T,C,6,intergenic,LOC100288069,0.04023


In [7]:
gene_list = annovar_annotated.drop_duplicates("gene")
gene_list["gene_id"] = np.arange(len(gene_list))
gene_list = gene_list[["gene","gene_id"]]

annovar_annotated = annovar_annotated.merge(gene_list, on="gene")
annovar_annotated = annovar_annotated.sort_values(by = "index_col", ascending = True)
annovar_annotated['tot_index'] = np.arange(len(annovar_annotated))
annovar_annotated.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,CHR,bp,bp2,allele1,allele2,index_col,into/exonic,gene,snp_std,gene_id,tot_index
754130,22,51215123,51215123,T,C,804618,exonic,RABL2B,0.100819,21463,754127
754131,22,51215124,51215124,A,G,804619,exonic,RABL2B,0.023271,21463,754128
754127,22,51216327,51216327,C,T,804620,ncRNA_intronic,RPL23AP82,0.046467,21462,754129
754128,22,51216332,51216332,T,G,804621,ncRNA_intronic,RPL23AP82,0.045715,21462,754130
754129,22,51216453,51216453,T,C,804622,ncRNA_intronic,RPL23AP82,0.032749,21462,754131


In [None]:
#Finalize the input by selecting only the relevant inputs and by transposing the data

t = tables.open_file(savepath + studyname + '_genotype_imputed.h5', mode='r')
data = t.root.data
num_pat = data.shape[1]
num_feat = data.shape[0]

used_indices = np.zeros(num_feat)
used_indices[annovar_annotated.index_col.values]  =1 

f = tables.open_file(savepath + studyname + '_genotype_processed.h5', mode='w')
array_c = f.create_earray(f.root, 'data', tables.IntCol(), (0,num_feat ), expectedrows=num_pat,filters=tables.Filters(complib='zlib', complevel=1))
f.close()

f = tables.open_file(savepath + studyname + '_genotype_processed.h5', mode='a')
print("\n Finalizing and transposing data...")
for pat in tqdm.tqdm(range(num_pat)):
    a = np.transpose(data[:,pat])
    a=np.reshape(a, (1,-1))
    f.root.data.append(a)
f.close()
t.close()
print("Completed", studyname)

  0%|          | 0/1826 [00:00<?, ?it/s]


 Finalizing and transposing data...


 18%|█▊        | 322/1826 [11:42<55:58,  2.23s/it]  

In [None]:
# Mask including all variants
data = np.ones(len(annovar_annotated), np.bool)
coord = ( annovar_annotated["tot_index"].values, annovar_annotated["gene_id"].values )
SNP_gene_matrix = scipy.sparse.coo_matrix(((data),coord),  shape = (len(annovar_annotated), annovar_annotated["gene_id"].max()+1 ))
scipy.sparse.save_npz(savepath +'/SNP_gene_mask', SNP_gene_matrix)
SNP_gene_matrix

In [None]:
# Mask including only exonic variants
annovar_annotated_exonic = annovar_annotated[annovar_annotated['into/exonic'] == "exonic"]
data = np.ones(len(annovar_annotated_exonic), np.bool)
coord = ( annovar_annotated_exonic["tot_index"].values, annovar_annotated_exonic["gene_id"].values )
SNP_gene_matrix = scipy.sparse.coo_matrix(((data),coord),  shape = (len(annovar_annotated), annovar_annotated["gene_id"].max()+1 ))
scipy.sparse.save_npz(savepath +'/SNP_gene_matrix_exonic_merged', SNP_gene_matrix)
SNP_gene_matrix