In [30]:
import getpass
import os

import numpy as np
import pandas as pd
import torch
from gensim.models import Word2Vec
from neo4j import GraphDatabase

In [4]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook; URI and user fall back to the local development values.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USER", "neo4j")
# Prompt interactively when NEO4J_PASSWORD is not set.
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

# The driver is used as a context manager so that it is closed once the query
# has been read, even if the query raises.
with GraphDatabase.driver(uri, auth=(username, password)) as driver:
    with driver.session() as session:
        # One row per node, holding that node's `label` property.
        result = session.run("MATCH (n) RETURN n.label AS label")
        label = pd.DataFrame([record.values() for record in result], columns=result.keys())

# Extract the values from the 'label' column and convert to a one-dimensional list
label_values = label['label'].tolist()

In [28]:
# Train a small CBOW (sg=0) Word2Vec model on the node labels, treating each
# whitespace-tokenised label as one sentence. Labels that are None are skipped:
# str(None) would otherwise add the literal token "None" to the vocabulary.
sentences = [str(text).split() for text in label_values if text is not None]
# A fixed seed plus a single worker thread removes thread-scheduling jitter between
# runs (across interpreter restarts PYTHONHASHSEED must be fixed as well).
model = Word2Vec(sentences, vector_size=32, window=5, min_count=1, sg=0, seed=1, workers=1)

# Function to get embeddings for a list of words
def get_sentence_embedding(word_list):
    """Sum the Word2Vec vectors of the in-vocabulary words of ``word_list``.

    Words are looked up in the module-level ``model``; words that are not in the
    model's vocabulary are ignored.

    Returns the element-wise sum of the vectors that were found, or ``None`` when
    no word of ``word_list`` is in the vocabulary (including an empty list).
    """
    known_vectors = []
    for token in word_list:
        if token in model.wv.key_to_index:
            known_vectors.append(model.wv[token])

    # Guard clause: nothing to sum.
    if not known_vectors:
        return None
    return sum(known_vectors)

# Wrap the raw labels in a single-column DataFrame.
data = {'text_data': label_values}
df = pd.DataFrame(data)

# Tokenise each label on whitespace; a None label becomes an empty token list.
df['text_data'] = df['text_data'].apply(
    lambda label_text: [] if label_text is None else label_text.split()
)
# Embed every non-empty token list; empty lists get None instead of a vector.
df['embeddings'] = df['text_data'].apply(
    lambda tokens: None if not tokens else get_sentence_embedding(tokens)
)

print(df)


                          text_data  \
0                       [Fluazinam]   
1    [Batang, mengalami, kerusakan]   
2            [diberikan, Pestisida]   
3               [Pelepah, membusuk]   
4              [Pengendalian, Hama]   
..                              ...   
151           [Magnaporthe, oryzae]   
152                         [Rayap]   
153                [Hawar, Pelepah]   
154         [menyebabkan, Penyakit]   
155          [diberikan, Perawatan]   

                                            embeddings  
0    [-0.004276715, -0.024777137, -0.024579354, -0....  
1    [0.026599519, 0.0012071382, 0.012941792, 0.064...  
2    [-0.015871601, 0.028990656, -0.039583363, -0.0...  
3    [0.024170345, 0.0061104465, -0.008968892, 0.02...  
4    [0.021221515, -0.0010948945, 0.0033207312, 0.0...  
..                                                 ...  
151  [-0.013065944, 0.035813827, 0.016996957, -0.02...  
152  [-0.0035325252, 0.030013237, 0.02991436, 0.025...  
153  [-0.015848285

In [29]:
# Rich display of the tokenised labels alongside their summed embeddings.
df

Unnamed: 0,text_data,embeddings
0,[Fluazinam],"[-0.004276715, -0.024777137, -0.024579354, -0...."
1,"[Batang, mengalami, kerusakan]","[0.026599519, 0.0012071382, 0.012941792, 0.064..."
2,"[diberikan, Pestisida]","[-0.015871601, 0.028990656, -0.039583363, -0.0..."
3,"[Pelepah, membusuk]","[0.024170345, 0.0061104465, -0.008968892, 0.02..."
4,"[Pengendalian, Hama]","[0.021221515, -0.0010948945, 0.0033207312, 0.0..."
...,...,...
151,"[Magnaporthe, oryzae]","[-0.013065944, 0.035813827, 0.016996957, -0.02..."
152,[Rayap],"[-0.0035325252, 0.030013237, 0.02991436, 0.025..."
153,"[Hawar, Pelepah]","[-0.015848285, -0.054612987, -0.017036611, -0...."
154,"[menyebabkan, Penyakit]","[-0.0093873385, -0.03071597, -0.046768893, -0...."


In [46]:
# Train the CBOW (sg=0) Word2Vec model on the node labels, treating each
# whitespace-tokenised label as one sentence. Labels that are None are skipped:
# str(None) would otherwise add the literal token "None" to the vocabulary.
sentences = [str(text).split() for text in label_values if text is not None]
# A fixed seed plus a single worker thread removes thread-scheduling jitter between
# runs (across interpreter restarts PYTHONHASHSEED must be fixed as well).
model = Word2Vec(sentences, vector_size=32, window=5, min_count=1, sg=0, seed=1, workers=1)

# Function to get embeddings for a list of words
def get_sentence_embedding(word_list):
    """Return the summed Word2Vec vector for the known words in ``word_list``.

    Each word is looked up in the module-level ``model``; out-of-vocabulary words
    are skipped. The result is the element-wise sum of the vectors found, or
    ``None`` if none of the words is in the vocabulary (or the list is empty).

    NOTE(review): an identical definition exists in an earlier cell of this
    notebook; consider defining the function once.
    """
    collected = []
    for candidate in word_list:
        if candidate in model.wv.key_to_index:
            collected.append(model.wv[candidate])

    if collected:
        return sum(collected)
    return None

# Wrap the raw labels in a single-column DataFrame.
data = {'text_data': label_values}
df = pd.DataFrame(data)

# Split the text_data column into lists of words and apply the function to each row
df['text_data'] = df['text_data'].apply(lambda x: x.split() if x is not None else [])
df['embeddings'] = df['text_data'].apply(lambda x: get_sentence_embedding(x) if x else None)

print(df['embeddings'])

# Rows without an embedding are zero-filled rather than dropped, so the tensor
# keeps one row per label. The filler takes its length from the model instead of
# repeating the literal 32, and is float32 so that a missing row cannot change
# the dtype of the resulting tensor.
embedding_dim = model.wv.vector_size
default_embedding = np.zeros(embedding_dim, dtype=np.float32)
# Each missing row gets its own copy so rows never alias the same array.
df['embeddings'] = df['embeddings'].apply(lambda x: x if x is not None else default_embedding.copy())

# Stack the rows into one 2-D array before handing them to PyTorch: building a
# tensor directly from a list of ndarrays is slow and triggers a UserWarning.
embeddings_tensor = torch.as_tensor(np.stack(df['embeddings'].to_list()), dtype=torch.float32)

print(embeddings_tensor)


0      [-0.004276715, -0.024777137, -0.024579354, -0....
1      [0.026599519, 0.0012071382, 0.012941792, 0.064...
2      [-0.015871601, 0.028990656, -0.039583363, -0.0...
3      [0.024170345, 0.0061104465, -0.008968892, 0.02...
4      [0.021221515, -0.0010948945, 0.0033207312, 0.0...
                             ...                        
151    [-0.013065944, 0.035813827, 0.016996957, -0.02...
152    [-0.0035325252, 0.030013237, 0.02991436, 0.025...
153    [-0.015848285, -0.054612987, -0.017036611, -0....
154    [-0.0093873385, -0.03071597, -0.046768893, -0....
155    [-0.034969933, -0.01293486, -0.025607161, -0.0...
Name: embeddings, Length: 156, dtype: object
tensor([[-0.0043, -0.0248, -0.0246,  ..., -0.0244, -0.0279,  0.0063],
        [ 0.0266,  0.0012,  0.0129,  ..., -0.0236,  0.0179,  0.0374],
        [-0.0159,  0.0290, -0.0396,  ...,  0.0146,  0.0254,  0.0162],
        ...,
        [-0.0158, -0.0546, -0.0170,  ..., -0.0383, -0.0345, -0.0338],
        [-0.0094, -0.0307, -0.0468,

In [47]:
# Sanity check: one row per label, one column per embedding dimension.
embeddings_tensor.shape

torch.Size([156, 32])