In [11]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from neo4j import GraphDatabase
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from gensim.models import Word2Vec

In [2]:
# DBN architecture
class DBN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(DBN, self).__init__()
        self.rbm1 = RBM(input_dim, hidden_dim)
        self.rbm2 = RBM(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = torch.sigmoid(self.rbm1(x))
        x = torch.sigmoid(self.rbm2(x))
        x = self.fc(x)
        return x

# RBM layer
class RBM(nn.Module):
    def __init__(self, visible_dim, hidden_dim):
        super(RBM, self).__init__()
        self.W = nn.Parameter(torch.randn(visible_dim, hidden_dim))
        self.visible_bias = nn.Parameter(torch.randn(visible_dim))
        self.hidden_bias = nn.Parameter(torch.randn(hidden_dim))
    
    def forward(self, x):
        p_hidden_given_visible = torch.sigmoid(torch.matmul(x, self.W) + self.hidden_bias)
        sampled_hidden = torch.bernoulli(p_hidden_given_visible)
        p_visible_given_hidden = torch.sigmoid(torch.matmul(sampled_hidden, self.W.t()) + self.visible_bias)
        return p_visible_given_hidden

In [108]:
# Connect to Neo4j and retrieve knowledge graph vectors
class Neo4jDataLoader:
    def __init__(self, uri, user, password):
        self._driver = GraphDatabase.driver(uri, auth=(user, password))
    
    def get_vectors(self):
        with self._driver.session() as session:
            # query = "MATCH (node:Entity) RETURN node.vector AS vector"
            query = "MATCH (n) RETURN n.Vector AS vector"
            result = session.run(query)
            vectors = [record['vector'] for record in result]
        return torch.tensor(vectors)
neo4j_loader = Neo4jDataLoader(uri="neo4j://localhost:7687", user="neo4j", password="12345678")
data = neo4j_loader.get_vectors()


In [None]:
uri = "bolt://localhost:7687"
username = "neo4j"
password = "12345678"

driver = GraphDatabase.driver(uri, auth=(username, password))
with driver.session() as session:
    result = session.run("MATCH (n) RETURN n.label AS label")
    label = pd.DataFrame([record.values() for record in result], columns=result.keys())
    
# Extract the values from the 'label' column and convert to a one-dimensional list
label_values = label['label'].tolist()

In [30]:
data

tensor([[-0.0035,  0.0155,  0.0005,  ..., -0.0087, -0.0016, -0.0086],
        [-0.0035,  0.0107, -0.0042,  ...,  0.0103,  0.0054,  0.0011],
        [-0.0031, -0.0156,  0.0136,  ...,  0.0139, -0.0053,  0.0154],
        ...,
        [-0.0033, -0.0010,  0.0074,  ...,  0.0136,  0.0032,  0.0089],
        [-0.0031,  0.0121, -0.0047,  ...,  0.0047, -0.0100, -0.0060],
        [-0.0033,  0.0016, -0.0095,  ..., -0.0031,  0.0126, -0.0039]])

In [149]:
# Define a Word2Vec model (you need to define this as you did before)
sentences = [str(text).split() for text in label_values]
model = Word2Vec(sentences, vector_size=1, window=5, min_count=1, sg=0)

# Function to get embeddings for a list of words
def get_sentence_embedding(word_list):
    word_vectors = [model.wv[word] for word in word_list if word in model.wv.key_to_index]
    
    if word_vectors:
        sentence_embedding = sum(word_vectors)
        return sentence_embedding
    else:
        return None

# Sample DataFrame
data = {'text_data': label_values}
df = pd.DataFrame(data)

# Split the text_data column into lists of words and apply the function to each row
df['text_data'] = df['text_data'].apply(lambda x: x.split() if x is not None else [])
df['embeddings'] = df['text_data'].apply(lambda x: get_sentence_embedding(x) if x else None)

print(df['embeddings'])

# # Filter out rows where embeddings are not available
# df = df.dropna(subset=['embeddings'])

# Replace rows where embeddings are not available with a default value (e.g., zeros)
default_embedding = np.zeros(1)  # Replace with your desired default value
df['embeddings'] = df['embeddings'].apply(lambda x: x if x is not None else default_embedding)

# Convert embeddings to a PyTorch tensor
embeddings_tensor = torch.tensor(df['embeddings'].to_list())

print(embeddings_tensor)


0         [0.4884479]
1        [0.87393254]
2        [0.21472734]
3        [0.55303454]
4       [-0.42872798]
            ...      
151      [0.20372128]
152    [-0.071162105]
153      [0.93546486]
154      [-1.2053802]
155      [-1.5844753]
Name: embeddings, Length: 156, dtype: object
tensor([[ 4.8845e-01],
        [ 8.7393e-01],
        [ 2.1473e-01],
        [ 5.5303e-01],
        [-4.2873e-01],
        [ 1.9503e-01],
        [-8.9595e-01],
        [ 9.1032e-01],
        [-4.1371e-01],
        [-3.5485e-01],
        [ 4.4958e-01],
        [-8.1579e-01],
        [-9.2834e-01],
        [ 1.2550e+00],
        [ 9.0775e-01],
        [ 3.2137e-02],
        [-1.4537e-01],
        [ 5.0606e-01],
        [ 1.1201e-01],
        [ 6.7674e-01],
        [ 1.3520e-01],
        [ 1.3897e+00],
        [ 9.5477e-01],
        [-8.8707e-01],
        [ 8.8868e-01],
        [ 3.1072e-01],
        [ 3.4036e-02],
        [ 2.8477e-01],
        [-1.8087e-01],
        [-8.1328e-02],
        [ 3.0743e-01],


In [150]:
embeddings_tensor.shape

torch.Size([156, 1])

In [154]:
# Main training loop
def train_dbn():
    # Initialize DBN and other hyperparameters
    input_dim = 32  # Adjust based on the dimensionality of your knowledge graph vectors
    hidden_dim = 32
    output_dim = 2  # Adjust based on your task (e.g., classification)

    dbn = DBN(input_dim, hidden_dim, output_dim)
    optimizer = optim.Adam(dbn.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    # Connect to Neo4j and retrieve data
    neo4j_loader = Neo4jDataLoader(uri="neo4j://localhost:7687", user="neo4j", password="12345678")
    data = neo4j_loader.get_vectors()

    # Load labels for your data
    labels = embeddings_tensor

    # Create a DataLoader to handle batching (if needed)
    batch_size = 32  # Adjust based on your dataset size
    dataset = TensorDataset(data, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Training loop
    for epoch in range(100):  # Adjust the number of epochs
        total_loss = 0.0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            output = dbn(inputs)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/100], Loss: {total_loss / len(dataloader)}")

if __name__ == "__main__":
    train_dbn()

RuntimeError: 0D or 1D target tensor expected, multi-target not supported