In [1]:
import os
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import random
import sys 

import tensorly as tl
from tensorly.decomposition import non_negative_parafac
import scipy.sparse
from scipy.spatial.distance import pdist, squareform


In [2]:
def Qmat(S,configuration,path):
    """Return the rescaled modularity of memberships ``S`` on one configuration's graph.

    The adjacency matrix ``A`` is read from
    ``<path>/<configuration>/coords.csv_sparse_graph.npz``. With ``k`` the column
    sums of ``A`` and ``w`` the sum of all its entries, the score is::

        n_rows * n_cols * trace(S.T @ (A - outer(k, k) * 0.5 / w) @ S) / (2 * w)

    Parameters
    ----------
    S : array-like, shape (n_beads,) or (n_beads, n_communities)
        Membership weights. A 1-D vector is treated as a single community.
    configuration : str or bytes
        Name of the configuration sub-directory (decoded with ``os.fsdecode``).
    path : str
        Directory that contains the configuration sub-directories.

    Returns
    -------
    float
        The score, or ``0`` when the graph file does not exist.
    """
    dirname = os.fsdecode(configuration)
    filename = os.path.join(path, dirname+'/coords.csv_sparse_graph.npz')
    if not os.path.isfile(filename):
        # Best-effort behaviour kept on purpose: a missing graph scores 0.
        # NOTE(review): such zeros silently enter downstream means/histograms.
        return 0

    A = scipy.sparse.load_npz(filename)

    # Always work with an (n_beads, n_communities) ndarray so that the result
    # does not depend on np.matrix semantics of "sparse - dense".
    members = np.asarray(S, dtype=float)
    if members.ndim == 1:
        members = members.reshape(-1, 1)

    k = np.asarray(A.sum(axis=0)).ravel()
    w = A.sum()
    # NOTE(review): for a symmetric A, w counts every edge twice; confirm the
    # 0.5/w null term and the 1/(2w) prefactor are the intended normalisation.

    # trace(S.T @ M @ S) with M = A - outer(k, k) * 0.5 / w, expanded so the
    # dense n x n matrix M is never materialised:
    #   sum_c s_c.T A s_c  -  sum_c (k . s_c)^2 * 0.5 / w
    AS = np.asarray(A @ members)
    kS = k @ members
    trace = np.sum(members * AS) - np.dot(kS, kS) * 0.5 / w

    n_rows, n_cols = A.shape
    return n_rows * n_cols * trace / (2.0 * w)  # rescale by network size

def sample_modularity(S,cf_samples,path):
    """Score ``S`` with ``Qmat`` on every configuration in ``cf_samples``.

    Returns a list with one ``Qmat(S, configuration, path)`` value per entry,
    in the iteration order of ``cf_samples``.
    """
    return [Qmat(S, cf_name, path) for cf_name in cf_samples]

def random_community(Sp,configuration,path):
    """Score a row-shuffled membership array on one configuration (null model).

    ``Sp`` is shuffled IN PLACE along its first axis by ``np.random.shuffle``
    before being passed to ``Qmat``; the caller's array is therefore permuted
    as well, and repeated calls keep re-shuffling the same array.
    """
    np.random.shuffle(Sp)
    null_score = Qmat(Sp, configuration, path)
    return null_score

def membership(factors):
    """Build the L1-normalised membership matrix from a factorisation result.

    Reads ``factors[1][1]`` and ``factors[1][2]`` (two factor matrices of equal
    shape), averages them column by column and divides every column by its
    L1 norm.
    NOTE(review): presumably ``factors`` is the (weights, [mode0, mode1, mode2])
    output of a non-negative PARAFAC whose modes 1 and 2 should coincide —
    confirm against the code that produced the pickle.

    Returns
    -------
    (S, Sp)
        ``S`` is the membership matrix; ``Sp`` is an independent copy intended
        for shuffling-based significance tests.
    """
    left, right = factors[1][1], factors[1][2]
    S = np.zeros(shape=left.shape)
    for c in range(S.shape[1]):
        vec = 0.5*(left[:,c]+right[:,c]) # average of the 2 factors, that should be identical
        scale = tl.norm(vec,1)
        if scale == 0:
            # An all-zero component has no members: keep the zero column
            # instead of filling it with NaN through a 0/0 division.
            continue
        S[:,c] = vec/scale # normalize membership
    Sp = np.copy(S) # this copy can be used for significance testing
    return S,Sp

def membership_array(path,S,comm,samples,thresh): #gives pairs property for a given community on a sampled dataset
    """Collect memberships and sampled distances for the bead pairs of one community.

    A pair ``(bead1, bead2)`` with ``bead1 < bead2`` is kept when both
    memberships ``S[bead, comm]`` exceed ``thresh`` and their product is
    positive. Distances are then measured in the first ``samples``
    ``coords.csv`` files found under sub-directories of ``path`` whose name
    starts with ``'cf_'`` (``os.walk`` order).

    Parameters
    ----------
    path : str
        Root directory that is walked for ``cf_*`` sub-directories.
    S : ndarray, shape (n_beads, n_communities)
        Membership matrix.
    comm : int
        Column of ``S`` to analyse.
    samples : int
        Maximum number of coordinate files to read.
    thresh : float
        Strict lower bound on the membership of both beads of a pair.

    Returns
    -------
    (listaMem1, listaMem2, distances, listPairs)
        Memberships of the first and second bead of every pair, one tuple of
        distances per pair (one entry per coordinate file read), and the list
        of ``(bead1, bead2)`` index pairs.
    """
    weights = S[:,comm]

    # Only beads above the threshold can be part of a pair, so restrict the
    # pair enumeration to them (same lexicographic order as a full scan).
    candidates = np.flatnonzero(weights > thresh).tolist()
    listPairs = [(bead1, bead2)
                 for pos, bead1 in enumerate(candidates)
                 for bead2 in candidates[pos+1:]
                 if weights[bead1]*weights[bead2] > 0]
    listaMem1 = [weights[bead1] for bead1, _ in listPairs]
    listaMem2 = [weights[bead2] for _, bead2 in listPairs]

    f = []
    for (dirpath, dirnames, filenames) in os.walk(path):
        for dirname in dirnames:
            if dirname.startswith('cf_'):
                f.append(os.path.join(dirpath,dirname,'coords.csv'))

#     sample_list = random.sample(f,samples)
    sample_list = f[:samples]

    # Explicit integer dtype so that an empty pair list still indexes cleanly.
    first = np.array([pair[0] for pair in listPairs], dtype=int)
    second = np.array([pair[1] for pair in listPairs], dtype=int)

    listaD=[]
    for count, coordfile in enumerate(sample_list, start=1):
        print('\r', 'Iteration', count, 'of', str(samples),end='')

        with open(coordfile, newline='') as csvfile:
            # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is the
            # documented replacement. Only the first three columns (x, y, z) are used.
            xyz = np.asarray(list(csv.reader(csvfile)), dtype=float)[:,:3]

        # Euclidean distance of every pair in one vectorised step
        listaD.append(list(np.linalg.norm(xyz[first]-xyz[second], axis=1)))

    # Transpose from "per file" to "per pair"
    return listaMem1, listaMem2, list(zip(*listaD)), listPairs


def Dmat(S,A):
    """Return the rescaled modularity of memberships ``S`` on the weight matrix ``A``.

    With ``k`` the column sums of ``A`` and ``w`` the sum of all its entries::

        n_rows * n_cols * trace(S.T @ (A - outer(k, k) * 0.5 / w) @ S) / (2 * w)

    Parameters
    ----------
    S : array-like, shape (n_beads,) or (n_beads, n_communities)
        Membership weights. A 1-D vector is treated as a single community.
    A : ndarray or scipy sparse matrix, shape (n_beads, n_beads)
        Pairwise weights.

    Returns
    -------
    float
    """
    # Accept a single community given as a 1-D vector (np.trace would fail on
    # the scalar that a 1-D product yields).
    members = np.asarray(S, dtype=float)
    if members.ndim == 1:
        members = members.reshape(-1, 1)

    k = np.asarray(A.sum(axis=0)).ravel()
    w = A.sum(axis=None)
    # NOTE(review): for a symmetric A, w counts every pair twice; confirm the
    # 0.5/w null term and the 1/(2w) prefactor are the intended normalisation.

    # trace(S.T @ M @ S) expanded so the n x n matrix M is never allocated:
    #   sum_c s_c.T A s_c  -  sum_c (k . s_c)^2 * 0.5 / w
    AS = np.asarray(A @ members)
    kS = k @ members
    trace = np.sum(members * AS) - np.dot(kS, kS) * 0.5 / w

    n_rows, n_cols = A.shape
    return n_rows * n_cols * trace / (2.0 * w)  # rescale by network size

def Dmat_2(C,A):
    """Return the rescaled modularity of a co-membership matrix ``C`` on ``A``.

    The matrix ``M = A - outer(k, k) * 0.5 / w`` is built from the column sums
    ``k`` and the total ``w`` of ``A``; the result is
    ``n_rows * n_cols * sum(M * C) / (2 * w)`` with an element-wise product,
    so ``C`` must broadcast against ``M``.
    """
    degrees = A.sum(axis=0)
    w = A.sum(axis=None)
    M = A - np.outer(degrees, degrees)*0.5/w
    n_cells = M.shape[0]*M.shape[1]  # rescale by network size
    weighted_total = np.sum(np.multiply(M, C))
    return n_cells*weighted_total/(2.0*w)

def D_random_community(Sp,A):
    """Score a row-shuffled membership array on the weight matrix ``A`` (null model).

    ``Sp`` is shuffled IN PLACE along its first axis by ``np.random.shuffle``
    and then evaluated with ``Dmat``; the caller's array is permuted too.
    """
    np.random.shuffle(Sp)
    null_score = Dmat(Sp, A)
    return null_score

# Tab-separated bead annotation table, kept as a list of string rows.
# The functions below read column 0 as the chromosome label of each bead.
with open('../csv/chrs.csv', newline='') as chroms_file:
    chroms = [row for row in csv.reader(chroms_file, delimiter='\t')]

def roll_model(S,shift):
    """Null model: cyclically shift the memberships within every chromosome.

    Rows of ``S`` are grouped by the chromosome label ``chroms[row][0]``; each
    group is rolled by ``shift`` positions along the bead axis and written back
    to the row positions it came from. Row ``r`` of the result therefore still
    belongs to the chromosome ``chroms[r][0]``, which is what callers that
    index ``chroms`` by row number rely on.

    Parameters
    ----------
    S : array-like, shape (n_beads, n_communities)
        Membership matrix; must have one row per entry of ``chroms``.
    shift : int
        Number of positions passed to ``np.roll``.

    Returns
    -------
    ndarray of float, same shape as ``S``

    Raises
    ------
    ValueError
        If ``S`` and ``chroms`` do not have the same number of rows.
    """
    S = np.asarray(S, dtype=float)
    labels = [row[0] for row in chroms]
    if len(labels) != S.shape[0]:
        raise ValueError('S has %d rows but chroms has %d entries'
                         % (S.shape[0], len(labels)))

    # Group row indices per chromosome in order of first appearance. Iterating
    # a set of labels instead would reorder the chromosome blocks arbitrarily.
    rows_by_chrom = {}
    for row, label in enumerate(labels):
        rows_by_chrom.setdefault(label, []).append(row)

    rolled = np.empty_like(S)
    for rows in rows_by_chrom.values():
        rolled[rows] = np.roll(S[rows], shift=shift, axis=0)
    return rolled

def interchr_community(S,community): #given modules S and a given community removes the intrachr weights
    """Return the co-membership matrix of one community restricted to inter-chromosomal pairs.

    The full matrix is ``outer(S[:, community], S[:, community])``. Entries
    whose two beads carry the same chromosome label ``chroms[bead][0]``
    (including the diagonal) are set to zero.

    Returns
    -------
    ndarray of float, shape (n_beads, n_beads)
    """
    weights = S[:,community]
    mat = np.outer(weights, weights)

    # One label per row of mat; a boolean mask replaces the N^2 Python loop.
    labels = np.asarray([chroms[bead][0] for bead in range(mat.shape[0])])
    different_chrom = labels[:, None] != labels[None, :]

    intermat = np.zeros(mat.shape)
    intermat[different_chrom] = mat[different_chrom]
    return intermat

In [9]:
import pickle as pkl

samples = 100    # sample size encoded in the pickle file names
rank = 25        # decomposition rank encoded in the pickle file names
numb_comm = 30   # number of communities reported/plotted by the cells below
# NOTE(review): numb_comm is larger than rank; cells that loop over
# range(numb_comm) index columns of S/factors — confirm enough columns exist.
ind = 3          # sample index encoded in the pickle file names
factor_list = []
path = '/media/garner1/hdd1/gpseq/10000G'

# Both pickles share the same run identifier in their file name.
run_tag = '_rank' + str(rank) + '_sample' + str(ind) + '_size' + str(samples) + '.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by a trusted run of this pipeline.
fileName = '/media/garner1/hdd1/gpseq/info_10000G/nnparafac_WOintraChrom' + run_tag
with open(fileName, 'rb') as fileObject:  # closed even if unpickling fails
    factors = pkl.load(fileObject)

fileName = '/media/garner1/hdd1/gpseq/info_10000G/cf-sampled_WOintraChrom' + run_tag
with open(fileName, 'rb') as fileObject:
    config_sample = pkl.load(fileObject)

In [10]:
# S: normalised membership matrix; Sp: spare copy that the null-model helpers
# shuffle in place.
S, Sp = membership(factors)
# Optional pickle export:
# with open('gpseq_rank25_samples100.pkl', 'wb') as f:
#     pkl.dump(S, f)

# Plain-text export of the memberships, one bead per row.
np.savetxt(fname="gpseq.csv", X=S, delimiter=",")
# np.savetxt("hic_rank20_samples1000.csv", S, delimiter=",")

In [None]:
# Distribution of modularity for each community on the training data:
# h1[comm] holds one Qmat score per training configuration (first 100 of config_sample).
h1 = [
    sample_modularity(S[:, comm], config_sample[:100], path)
    for comm in range(S.shape[1])
]

In [None]:
# Distribution of modularity for each community on the test data:
# h2[comm] is a one-element list wrapping the scores on 100 directory entries
# of `path` drawn at random.
# NOTE(review): a fresh random sample is drawn for every community and the
# result is nested one level deeper than h1 — confirm both are intended.
h2 = [
    [sample_modularity(S[:, comm], random.sample(os.listdir(path), k=100), path)]
    for comm in range(S.shape[1])
]

In [None]:
# Rank the communities by their mean modularity on the training data (h1),
# best first, and keep at most numb_comm of them.
hmean = [np.mean(h) for h in h1]
# np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is its replacement.
top = np.asarray(hmean, dtype=float).argsort()[-numb_comm:][::-1]
print(top)
print([hmean[t] for t in top])

In [None]:
# Rank the communities by their mean modularity on the test data (h2),
# best first, and keep at most numb_comm of them.
hmean = [np.mean(h) for h in h2]
# np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is its replacement.
top = np.asarray(hmean, dtype=float).argsort()[-numb_comm:][::-1]
print(top)
print([hmean[t] for t in top])

In [None]:
# One density curve (with rug) per community for the test-data modularities.
sns.set(rc={'figure.figsize': (10, 10)})
labels = list(map(str, range(S.shape[1])))
fig, ax = plt.subplots()
for count, community_label in enumerate(labels):
    sns.distplot(h2[count], rug=True, hist=False, label=community_label, ax=ax)
    
    
# plt.legend()
# plt.title('HiC+GPSeq only with model significance '+str(np.round((mu_test-mu_null)/(sigma_test+sigma_null))))
# print('z-score is: '+str((ref-mu)/sigma))  # z-score for a random S as the null model

In [None]:
# One density curve (with rug) per community for the training-data modularities.
sns.set(rc={'figure.figsize': (10, 10)})
labels = list(map(str, range(S.shape[1])))
fig, ax = plt.subplots()
for count, community_label in enumerate(labels):
    sns.distplot(h1[count], rug=True, hist=False, label=community_label, ax=ax)
    
    
# plt.legend()
# plt.title('HiC+GPSeq only with model significance '+str(np.round((mu_test-mu_null)/(sigma_test+sigma_null))))
# print('z-score is: '+str((ref-mu)/sigma))  # z-score for a random S as the null model

In [None]:
# Modularity of the full membership matrix S on three data sets.
hh1 = sample_modularity(S, config_sample, path)  # training configurations
hh2 = sample_modularity(S, random.sample(os.listdir(path), k=100), path)  # 100 random entries of `path` (test data)
mu_test, sigma_test = np.mean(hh2), np.std(hh2)
# Null model: Sp is re-shuffled in place before scoring each of the first
# 100 training configurations.
hh3 = [random_community(Sp, config_sample[cfg], path) for cfg in range(100)]
mu_null, sigma_null = np.mean(hh3), np.std(hh3)

sns.set(rc={'figure.figsize': (10, 10)})
labels = ['training data','test data', 'null model']
histos = [hh1, hh2, hh3]
with open('with-gpseq-histos.pkl', 'wb') as f:
    pkl.dump(histos, f)

fig, ax = plt.subplots()
for count, curve_label in enumerate(labels):
    sns.distplot(histos[count], rug=True, hist=False, label=curve_label)

plt.legend()
# Separation of the test and null means in units of the summed standard deviations
significance = np.round((mu_test-mu_null)/(sigma_test+sigma_null))
plt.title('HiC+GPSeq only with model significance '+str(significance))
# print('z-score is: '+str((ref-mu)/sigma))  # z-score for a random S as the null model

In [None]:
# For every top-ranked community: print the product of the L2 norms of its
# three factor columns and draw its co-membership matrix as a heatmap.
for i in top:
    a = factors[1][1][:, i]
    b = factors[1][2][:, i]
    c = factors[1][0][:, i]
    column = S[:, i]
    mat = np.outer(column, column)  # symmetric by construction
    component_weight = tl.norm(a, 2)*tl.norm(b, 2)*tl.norm(c, 2)
    print(str(i), str(component_weight))
    plt.figure(figsize=(10, 10))
    sns.heatmap(mat, cmap='Blues', square=True, xticklabels=False, yticklabels=False)
    plt.show()


In [None]:
import pandas as pd
import plotly
import plotly.graph_objects as go

N = S.shape[0]
# One label per bead: "<column 0>.<column 1>" of the annotation table.
# NOTE(review): the label is later parsed as a float, so column-1 values that
# differ only by trailing zeros (e.g. "1.1" vs "1.10") collapse onto the same
# coordinate, and non-numeric labels become NaN — confirm this cannot happen
# with the actual chrs.csv content.
bead_labels = [str(chroms[r][0])+'.'+str(chroms[r][1]) for r in range(N)]

# numb_comm may exceed the number of columns of S; never index past them.
for i in range(min(numb_comm, S.shape[1])):
    a = factors[1][1][:,i]
    b = factors[1][2][:,i]
    c = factors[1][0][:,i]
    mat = N*np.outer(S[:,i],S[:,i]) # symmetric co-membership, rescaled by the number of beads
    weigth = tl.norm(a,2)*tl.norm(b,2)*tl.norm(c,2)
    print(weigth)

    # Off-diagonal entries above the display threshold, in row-major order
    rows, cols = np.nonzero(mat > 1.0e-2)
    lista = [(bead_labels[r], bead_labels[k], mat[r,k])
             for r, k in zip(rows, cols) if r != k]
    df = pd.DataFrame(lista, columns =['bead1', 'bead2', 'Score'])
    df.bead1 = pd.to_numeric(df.bead1, errors='coerce')
    df.bead2 = pd.to_numeric(df.bead2, errors='coerce')
    df = df.sort_values(['bead1','bead2'],ascending=[False, False])
    data = df.pivot_table(index='bead1', columns='bead2', values='Score',fill_value=0)

    # Only a plotly figure is produced here; no matplotlib figure is opened,
    # so nothing accumulates across iterations.
    fig = go.Figure(data=go.Heatmap(
                       z=data.values,
                       x=data.columns,
                       y=data.index)
                   )
    fig.update_layout(
        title='Community '+str(i)+' with weight='+str(weigth),
        showlegend = False,
        width = 1000, height = 1000,
        xaxis_title="bead#1 location on genome",
        yaxis_title="bead#2 location on genome",
    )
    plotly.offline.plot(fig, filename='10000G_graphWOintra_community-'+str(i)+'.html',auto_open=True)

In [None]:
# Element-wise sum of the inter-chromosomal co-membership matrices of the
# first numb_comm communities (capped at the number of columns of S).
# The result must stay an (n_beads, n_beads) matrix because Dmat_2 multiplies
# it element-wise with the modularity matrix; np.sum without an axis would
# collapse the whole stack to a single scalar.
intS = np.zeros((S.shape[0], S.shape[0]))
for community in range(min(numb_comm, S.shape[1])):
    intS += interchr_community(S, community)

In [None]:
# Same inter-chromosomal sum for the rolled (null-model) memberships.
# The rolled matrix is computed once instead of once per community, and the
# sum is accumulated element-wise so the result stays an (n_beads, n_beads)
# matrix (np.sum without an axis would return a single scalar).
S_rolled_10 = roll_model(S, 10)
intS_roll = np.zeros((S_rolled_10.shape[0], S_rolled_10.shape[0]))
for community in range(min(numb_comm, S_rolled_10.shape[1])):
    intS_roll += interchr_community(S_rolled_10, community)

In [None]:
# Co-membership matrix summed over all communities: C[i, j] = sum_c S[i, c] * S[j, c].
# This is a matrix product; np.outer would flatten both 2-D inputs and return
# an (n_beads*n_comm, n_beads*n_comm) array that cannot be combined with the
# (n_beads, n_beads) modularity matrix in Dmat_2.
C = np.dot(S, S.T)

In [None]:
# Co-membership matrix of the rolled (null-model) memberships, computed from a
# single roll_model call. A matrix product is required here: np.outer would
# flatten its 2-D inputs and return an (n_beads*n_comm)^2 array.
S_rolled_1 = roll_model(S, 1)
C_roll = np.dot(S_rolled_1, S_rolled_1.T)

In [None]:
# Evaluate the co-membership matrices against an inverse-distance proximity
# matrix built from 100 randomly chosen configurations.
distance_modularities = []
null_model = []
random_configs = random.sample(os.listdir(path), k=100)
interchr_modularities = []
interchr_null_model = []
for coordfile in random_configs:
    with open(os.path.join(path, coordfile, 'coords.csv'), newline='') as csvfile:
        # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) replaces it.
        # Only the first three columns (x, y, z) are used.
        xyz = np.asarray(list(csv.reader(csvfile)), dtype=float)[:,:3]
    coordinates_array = xyz  # already an ndarray, no extra copy needed
    dist_array = pdist(coordinates_array)
    dist_matrix = squareform(dist_array)
    # Put 1 on the diagonal before inverting, then remove it again, so that
    # proxy_mat is 1/distance off the diagonal and 0 on it.
    # NOTE(review): two distinct beads at identical coordinates still give inf.
    dist_matrix = dist_matrix + np.eye(dist_matrix.shape[0])
    proxy_mat = 1./dist_matrix
    proxy_mat = proxy_mat - np.eye(dist_matrix.shape[0])

#     distance_modularities.append(Dmat(S,proxy_mat))
#     null_model.append(Dmat(roll_model(S,1),proxy_mat))
    distance_modularities.append(Dmat_2(C,proxy_mat))
    null_model.append(Dmat_2(C_roll,proxy_mat))
    interchr_modularities.append(Dmat_2(intS,proxy_mat))
    interchr_null_model.append(Dmat_2(intS_roll,proxy_mat))

In [None]:
# Compare the inter-chromosomal distance modularities with their rolled null
# model; both are rescaled by the squared number of beads before plotting.
sns.set(rc={'figure.figsize': (10, 10)})
fig, ax = plt.subplots()
bead_count_sq = S.shape[0]**2
# sns.distplot(distance_modularities, rug=True, hist=False,label='distance modularities')
observed_scaled = [bead_count_sq*x for x in interchr_modularities]
sns.distplot(observed_scaled, rug=True, hist=False, label='interchr distance modularities')
# sns.distplot(null_model, rug=True, hist=False,label='null_model')
null_scaled = [bead_count_sq*x for x in interchr_null_model]
sns.distplot(null_scaled, rug=True, hist=False, label='interchr null modularities')
plt.legend()
plt.title('distance_modularity')