In [1]:
import os
import torch
import time
import torch.nn as nn
from cell import utils, analysis, plot_utils
from torch.autograd import Variable
from torch.nn import functional as F
import matplotlib.pyplot as plt

from cell.Word2vec import prepare_vocab, dataloader, wv

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
length = 10000
p = 1
q = 1
N = 1
batch_size = 2000
walk_filename = "walk_node21_32_removed.csv"
roi = "VISp"
project_name = "NPP_GNN_project"
layer_class = "single_layer"
layer = "base_unnormalized_allcombined"
walk_type= "Directed_Weighted_node2vec"
window = 2

In [3]:
datasets = {}

for (layer, walk_filename) in [("base_unnormalized_allcombined", "walk_node21_32_removed.csv"),
                               ("Sst-Sstr1", "walk_0.csv"),
                               ("Sst-Sstr2", "walk_0.csv"),
                               ("Vip-Vipr1", "walk_0.csv"),
                               ("Vip-Vipr2", "walk_0.csv")]:
    
    walk_dir = utils.get_walk_dir(roi,
                                  project_name, 
                                  N, 
                                  length, 
                                  p, 
                                  q, 
                                  layer_class, 
                                  layer, 
                                  walk_type) 
    path = os.path.join(walk_dir, walk_filename)
    corpus = utils.read_list_of_lists_from_csv(path)
    vocabulary = prepare_vocab.get_vocabulary(corpus)
    
    print(f'lenght of vocabulary: {len(vocabulary)}')
    
    word_2_index = prepare_vocab.get_word2idx(vocabulary, padding=True)
    index_2_word = prepare_vocab.get_idx2word(vocabulary, padding=True)
    datasets[layer] = [word_2_index]
    datasets[layer].append(index_2_word)
    
    tuples = prepare_vocab.MCBOW_get_word_context_tuples(corpus, window=window)
    dataset = dataloader.MCBOW_WalkDataset(tuples, word_2_index)
    datasets[layer].append(dataset)
    
    datasets[layer].append(len(vocabulary))

lenght of vocabulary: 91
a node called pad is added for padding and its index is zero
a node called pad is added for padding and its index is zero
MCBOW by default adds a padding node called pad with index zero
There are 910000 pairs of target and context words
lenght of vocabulary: 89
a node called pad is added for padding and its index is zero
a node called pad is added for padding and its index is zero
MCBOW by default adds a padding node called pad with index zero
There are 890000 pairs of target and context words
lenght of vocabulary: 91
a node called pad is added for padding and its index is zero
a node called pad is added for padding and its index is zero
MCBOW by default adds a padding node called pad with index zero
There are 910000 pairs of target and context words
lenght of vocabulary: 91
a node called pad is added for padding and its index is zero
a node called pad is added for padding and its index is zero
MCBOW by default adds a padding node called pad with index zero
The

In [4]:
def get_node_intersections(datasets, base_layer_name):
    node_intersections = {}
    for k, v in datasets.items():
        l1 = set(datasets[k][0])
        l2 = set(datasets[base_layer_name][0])
        node_intersections[k] = set(l1).intersection(l2)
    
    return node_intersections

In [6]:
base_layer_name = "base_unnormalized_allcombined"
layers = ["Sst-Sstr1", "Sst-Sstr2", "Vip-Vipr1", "Vip-Vipr2"]

node_intersections = get_node_intersections(datasets, base_layer_name)
node_intersections.keys()

dict_keys(['base_unnormalized_allcombined', 'Sst-Sstr1', 'Sst-Sstr2', 'Vip-Vipr1', 'Vip-Vipr2'])

In [7]:
class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        return tuple(d[i] for d in self.datasets)

    def __len__(self):
        return min(len(d) for d in self.datasets)

In [8]:
datasets.keys()

dict_keys(['base_unnormalized_allcombined', 'Sst-Sstr1', 'Sst-Sstr2', 'Vip-Vipr1', 'Vip-Vipr2'])

In [9]:
def build_data_loader(datasets, batch_size, shuffle=True, drop_last=True, num_workers=1):
    data_loader = torch.utils.data.DataLoader(
        ConcatDataset(*[datasets[k][2] for k in datasets.keys()]),
        batch_size=batch_size, 
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers)
    return {k:i for i,k in enumerate(datasets.keys())}, data_loader

In [10]:
arm_keys, data_loader = build_data_loader(datasets, batch_size=2000, shuffle=False)

In [13]:
for batch_idx, (data1, data2, data3, data4, data4) in enumerate(data_loader):
    print(data4)
    break

[tensor([ 5, 20, 41,  ..., 65, 70, 65]), tensor([[20, 41,  0,  0],
        [ 5, 41, 87,  0],
        [ 5, 20, 87, 65],
        ...,
        [65, 65, 70, 65],
        [65, 65, 65, 70],
        [65, 70, 70, 65]])]


### Take care of index in different arms and different number of nodes in different arms

In [14]:
arm_keys

{'base_unnormalized_allcombined': 0,
 'Sst-Sstr1': 1,
 'Sst-Sstr2': 2,
 'Vip-Vipr1': 3,
 'Vip-Vipr2': 4}

In [115]:
v_0 = pd.DataFrame(torch.stack(emb[0]).detach().numpy(), 
                   index=datasets['base_unnormalized_allcombined'][1].values())

v_1 = pd.DataFrame(torch.stack(emb[1]).detach().numpy(), 
                   index=datasets['Sst-Sstr1'][1].values())

v_0.index.name = "cluster_id"
v_1.index.name = "cluster_id"

merged = v_1.merge(v_0, on='cluster_id')
v_0 = merged[['0_x', '1_x']]
v_1 = merged[['0_y', '1_y']]

v_0 = torch.tensor(np.array(v_0))
v_1 = torch.tensor(np.array(v_1))
F.mse_loss(v_0, v_1)

tensor(1.9469)

In [124]:
loss_joint = 0 

base_arm = arm_keys[base_layer_name]
for arm, (k, v) in enumerate(arm_keys.items()):
    print(arm, k, v)
    idx0 = [datasets[base_layer_name][0][i] for i in node_intersections[k]]
    idx1 = [datasets[k][0][i] for i in node_intersections[k]]
    loss_joint += F.mse_loss(torch.index_select(input=torch.stack(emb[v]), 
                                                dim=0, 
                                                index=torch.tensor(idx1), 
                                                out=None),
                             torch.index_select(input=torch.stack(emb[base_arm]), 
                                                dim=0, 
                                                index=torch.tensor(idx0), 
                                                out=None))
print(loss_joint)

0 base_unnormalized_allcombined 0
1 Sst-Sstr1 1
2 Vip-Vipr1 2


In [15]:
def loss_CMCBOW(prediction, target, emb, arm_keys, base_layer_name, node_intersections, n_arm=2):
    
    base_arm = arm_keys[base_layer_name]
    loss_indep = [None] * n_arm
    loss_joint = [None] * n_arm
    
    for arm, (k, v) in enumerate(arm_keys.items()):
        
        loss_indep[arm] = F.cross_entropy(prediction[arm], target[arm])
        
        idx0 = [datasets[base_layer_name][0][i] for i in node_intersections[k]]
        idx1 = [datasets[k][0][i] for i in node_intersections[k]]
        loss_joint[arm] = F.mse_loss(torch.index_select(input=torch.stack(emb[v]), 
                                                    dim=0, 
                                                    index=torch.tensor(idx1),
                                                    out=None),
                                 torch.index_select(input=torch.stack(emb[base_arm]), 
                                                    dim=0, 
                                                    index=torch.tensor(idx0), 
                                                    out=None))
    loss = sum(loss_indep) + sum(loss_joint)

    return loss

In [239]:
from torch.nn import functional as F

i = 10
arm = 2
print(F.cross_entropy(predict[arm][[i]], target_data[arm][[i]]))

sf = F.softmax(predict[arm][i], dim=0)
loss = -1 * torch.log(sf)
print(loss[target_data[arm][i]])

tensor(0.2999, grad_fn=<NllLossBackward>)
tensor(0.2999, grad_fn=<SelectBackward>)


### Coupled MCBOW_Word2Vec

In [16]:
class CMCBOW_Word2Vec(nn.Module):
    """
    """
    def __init__(self, vocab_size=[93], embedding_size=2, n_arm=1, padding_idx=0):
        """
        """
        super(CMCBOW_Word2Vec, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.n_arm = n_arm
        
        self.embeddings = nn.ModuleList([nn.Embedding(vocab_size[i],
                                                      embedding_size, 
                                                      padding_idx=padding_idx) 
                                         for i in range(n_arm)])
        
        self.linear = nn.ModuleList([nn.Linear(embedding_size,
                                               vocab_size[i]) 
                                     for i in range(n_arm)])
        
        self.batch_norm = nn.ModuleList([nn.BatchNorm1d(num_features=embedding_size,
                                                        eps=1e-10, 
                                                        momentum=0.1, 
                                                        affine=False) 
                                         for i in range(n_arm)])
                        

    def encoder(self, context_words, arm):
        h1 = torch.mean(self.embeddings[arm](context_words), dim=1)
        node_embeddings = [self.embeddings[arm](torch.tensor(i)) for i 
                           in range(self.vocab_size[arm])]
        return node_embeddings ,h1

    def decoder(self, mean_context, arm):
        h2 = self.linear[arm](self.batch_norm[arm](mean_context))
        return h2

    def forward(self, context_words):
        emb = [None] * self.n_arm
        predictions = [None] * self.n_arm

        for arm in range(self.n_arm):
            node_embeddings , mean_context = self.encoder(context_words[arm], arm)
            emb[arm] = node_embeddings
            predictions[arm] = self.decoder(mean_context, arm)
            
        return emb, predictions


In [17]:
embedding_size = 2
learning_rate = 0.001
n_epochs = 10
n_arm=5

In [18]:
model = CMCBOW_Word2Vec(embedding_size=embedding_size, 
                        vocab_size=[v[3] + 1 for (k, v) in datasets.items()],
                        n_arm=n_arm, 
                        padding_idx=0).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
training_loss = []


for epoch in range(n_epochs):
    losses = []
    t0 = time.time()
    for batch_idx, all_data in enumerate(data_loader):
        target_data = [data[0].to(device) for data in all_data]
        context_data = [data[1].to(device) for data in all_data]
        optimizer.zero_grad()
        emb, predict = model(context_data)
        loss = loss_CMCBOW(prediction=predict, 
                           target=target_data, 
                           arm_keys=arm_keys, 
                           emb=emb, 
                           n_arm=n_arm, 
                           base_layer_name=base_layer_name, 
                           node_intersections=node_intersections) 
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        
    t1 = time.time()
    print('time is %.2f' % (t1 - t0))
        
    training_loss.append(np.mean(losses)) 
    print(f'epoch: {epoch+1}/{n_epochs}, loss:{np.mean(losses):.4f}')

time is 86.23
epoch: 1/10, loss:26.2459
time is 83.36
epoch: 2/10, loss:20.0118
time is 88.05
epoch: 3/10, loss:16.0191
time is 86.11
epoch: 4/10, loss:13.2241
time is 86.24
epoch: 5/10, loss:11.2717
time is 83.98
epoch: 6/10, loss:9.9709
time is 83.05
epoch: 7/10, loss:9.1404
time is 85.70
epoch: 8/10, loss:8.6181
time is 85.64
epoch: 9/10, loss:8.2840
time is 92.73
epoch: 10/10, loss:8.0616


In [227]:
target_data

[tensor([91, 46, 56,  ..., 84, 85, 86]),
 tensor([30, 30, 30,  ..., 30, 30, 30]),
 tensor([60, 60, 60,  ..., 60, 60, 60])]

In [221]:
predict[0][0]

tensor([-3.2212e+00,  1.4648e-01,  4.6296e-01, -8.1406e-01, -7.9877e-01,
         7.9398e-02, -1.7338e-02, -5.3285e-02,  4.0035e-01, -4.6990e-01,
        -9.0508e-02,  2.4932e-02, -4.7801e-01,  3.3176e-01, -2.6811e-01,
         1.6318e-01, -3.5090e-01,  2.2714e-01, -1.0808e-01,  3.8041e-01,
        -4.2286e-01,  1.0568e-01,  4.3074e-01, -2.8477e-01, -6.2946e-02,
         4.5789e-01,  1.0429e-01,  5.0718e-01,  4.7870e-01, -6.0068e-01,
        -3.2283e-01,  6.0317e-02, -7.5750e-02,  3.2986e-01, -8.0745e-01,
        -1.2288e-01, -1.6709e-01,  1.2254e-01, -5.9767e-02,  6.6194e-01,
        -2.5543e-01, -1.3109e-01, -7.4612e-01, -3.3035e-01, -4.0580e-01,
        -6.0866e-02,  4.6160e-01,  1.0293e-01, -9.0680e-02,  3.8162e-01,
        -3.0542e-01,  3.0129e-01, -5.2409e-01, -1.0371e+00, -5.6274e-01,
        -4.0831e-01, -5.5897e-01,  3.0685e-01,  1.1984e-02, -1.3241e-01,
        -1.4466e-01, -4.0001e-01,  2.8036e-01,  6.7843e-01,  6.2587e-01,
         1.6307e-01, -5.7426e-01, -4.5189e-01, -3.1

In [132]:
cldf = utils.read_visp_npp_cldf()
vectors = model.embeddings[0].weight.detach().numpy()

data = analysis.summarize_walk_embedding_results(gensim_dict={"model": vectors},
                                                 index=index_2_word.values(),
                                                 ndim=2, 
                                                 cl_df=cldf, 
                                                 padding_label="pad")


Reading cldf from: //Users/fahimehb/Documents/NPP_GNN_project/dat/cl_df_VISp_annotation.csv


In [136]:
model_dir = utils.get_model_dir(project_name, 
                                roi, 
                                N, 
                                length, 
                                p, 
                                q, 
                                layer_class, 
                                layer, 
                                walk_type)

model_name = utils.get_model_name(size=embedding_size, 
                                  iter=n_epochs, 
                                  window=2, 
                                  lr=learning_rate, 
                                  batch_size=batch_size,
                                  opt_add="test")

In [137]:
data.to_csv(os.path.join(model_dir, model_name))

FileNotFoundError: [Errno 2] No such file or directory: '//Users/fahimehb/Documents/NPP_GNN_project/models/VISp/single_layer/Directed_Weighted_node2vec/N_1_l_10000_p_1_q_1/Vip-Vipr1/model_size_2_iter_10_window_2_lr_0.001_bs_2000_test.csv'

In [134]:
model_name

'model_size_2_iter_10_window_2_lr_0.001_bs_2000_test.csv'