In [None]:
!pip install langchain langchain-community langchain-experimental gradio neo4j pandas requests
!pip install -U langchain-ollama

In [None]:
!pip install -U langchain-neo4j

In [None]:
from torch_geometric.data import HeteroData
from collections import defaultdict
import random
import numpy as np
from datetime import datetime
import ast
import re
from langchain.chains import GraphCypherQAChain
import torch.nn.functional as F
from torch_geometric.nn import (to_hetero, GraphConv, GATConv, GCNConv, SAGEConv, GATv2Conv, Linear, HeteroConv, HGTConv, RGCNConv, RGATConv, MessagePassing, global_add_pool)
from torch_geometric.loader import NeighborLoader
import torch_geometric.transforms as T
from torch_geometric.explain import GNNExplainer
import torch_geometric
import pyg_lib
import torch_sparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch_geometric.utils import degree


import os
import requests
import pandas as pd
from neo4j import GraphDatabase

from langchain_neo4j import Neo4jGraph
from langchain.docstore.document import Document

from langchain_text_splitters import TokenTextSplitter
from langchain_ollama import OllamaLLM
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Neo4jVector

from langchain.chains import RetrievalQA
import gradio as gr
from dotenv import load_dotenv
import torch
import networkx as nx
# Sanity check that the deep-learning stack imported correctly.
print(f"PyTorch Version: {torch.__version__}")
print("PyG Loaded Successfully!")

In [2]:
# Load environment variables from a .env file so the os.getenv() calls in the
# configuration cell below can read them.
load_dotenv()

# Reproducibility: seed every random number generator used by the notebook.
def set_seed(seed_value=24):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed_value: Integer seed passed to every generator (default 24).
    """
    seeders = (
        torch.manual_seed,
        random.seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

    # Ask cuDNN for deterministic kernels and disable its autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(24)

In [3]:
# Directory that holds the input CSV and receives the exported edge lists.
PTH = os.getenv("PTH")

# Profile suffix selecting which NEO4J_URI_<profile> / NEO4J_PASSWORD_<profile>
# pair is read from the environment.
x = os.getenv("user_profile")
if not x:
    # Without this check the string concatenation below fails with an opaque
    # "can only concatenate str (not NoneType)" TypeError.
    raise RuntimeError(
        "Environment variable 'user_profile' is not set; it selects which "
        "NEO4J_URI_<profile> / NEO4J_PASSWORD_<profile> pair to use."
    )

NEO4J_USERNAME = os.getenv("NEO4J_USER")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")

NEO4J_URI = os.getenv("NEO4J_URI_" + x)
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD_" + x)
if NEO4J_URI is None or NEO4J_PASSWORD is None:
    raise RuntimeError(
        f"NEO4J_URI_{x} and NEO4J_PASSWORD_{x} must both be set "
        "in the environment or the .env file."
    )

print(NEO4J_URI)

neo4j+s://5f5434fc.databases.neo4j.io


In [4]:
# LangChain handle on the Neo4j database; used below for schema setup and for
# storing the extracted graph documents.
graph = Neo4jGraph(
    url=NEO4J_URI,
    database=NEO4J_DATABASE,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

In [5]:

# Each Document node must have a unique `id`.
graph.query("""
CREATE CONSTRAINT unique_document IF NOT EXISTS
FOR (d:Document) REQUIRE d.id IS UNIQUE
""")

# Vector index over Document embeddings. The property name and dimension must
# match what the embedding cell writes: it stores BAAI/bge-base-en-v1.5 vectors
# (768 dimensions) in the `embedding` property. The previous definition
# (`textEmbedding`, 1536 dimensions) pointed at a property that is never
# populated, so the index was never used.
graph.query("""
CREATE VECTOR INDEX document_embedding_index IF NOT EXISTS
FOR (d:Document) ON (d.embedding)
OPTIONS {
  indexConfig: {
    `vector.dimensions`: 768,
    `vector.similarity_function`: 'cosine'
  }
}
""")

[]

In [None]:
# Read the raw export, give every row a 1-based sequential id, and keep only
# the rows whose 'posts.comments.text' column is populated.
csv_path = os.path.join(PTH, 'AIVI_HI.csv')
df = pd.read_csv(csv_path)
df['id'] = range(1, len(df) + 1)
has_post_text = df['posts.comments.text'].notna()
df = df[has_post_text]
df

In [15]:
# Rows with a value in 'text.comments.text' are tagged as USER rows.
is_user_comment = df['text.comments.text'].notna()
user_comments = df[is_user_comment].assign(user_or_influencer='USER')

# Rows with 'posts.comments.text' but no 'text.comments.text' are tagged as
# INFLUENCER rows.
is_influencer_post = df['posts.comments.text'].notna() & ~is_user_comment
influencers_posts = df[is_influencer_post].assign(user_or_influencer='INFLUENCER')

In [18]:
# Map each frame's source columns onto one shared schema
# (source column -> unified name); dict order defines the column order.
_USER_COLUMN_MAP = {
    'id': 'id',
    'PROFILE.url': 'PROFILE.url',
    'user_type': 'user_type',
    'pot.comment.likes_count': 'likes_count',
    'posts.time': 'posting_time',
    'posts.comments.user': 'posting_user',
    'text.comments.text': 'text',
    'user_or_influencer': 'user_or_influencer',
}

_INFLUENCER_COLUMN_MAP = {
    'id': 'id',
    'PROFILE.url': 'PROFILE.url',
    'user_type': 'user_type',
    'posts.likes_count': 'likes_count',
    'posts.time': 'posting_time',
    'posts.comments.user': 'posting_user',
    'posts.comments.text': 'text',
    'user_or_influencer': 'user_or_influencer',
}

# Select the source columns, then relabel them with the unified names.
user_comments = user_comments[list(_USER_COLUMN_MAP)]
user_comments.columns = list(_USER_COLUMN_MAP.values())

influencers_posts = influencers_posts[list(_INFLUENCER_COLUMN_MAP)]
influencers_posts.columns = list(_INFLUENCER_COLUMN_MAP.values())


In [None]:
# Stack both frames into one; ignore_index=True renumbers the rows 0..n-1.
combined_df = pd.concat([user_comments, influencers_posts], ignore_index=True)

# Keep only rows whose text is present and not blank after stripping whitespace.
text_present = combined_df['text'].notna()
text_not_blank = combined_df['text'].str.strip() != ''
combined_df = combined_df[text_present & text_not_blank]

In [None]:
# Row fields copied verbatim into each Document's metadata.
_METADATA_FIELDS = (
    'id',
    'user_type',
    'likes_count',
    'posting_time',
    'posting_user',
    'user_or_influencer',
)

# One LangChain Document per row: the text becomes the page content.
documents = [
    Document(
        page_content=row['text'],
        metadata={field: row[field] for field in _METADATA_FIELDS},
    )
    for _, row in combined_df.iterrows()
]

In [None]:
def sanitize_llm_input(documents):
    """Strip square brackets from the text of every document, in place.

    Args:
        documents: Iterable of objects exposing a ``page_content`` attribute.
            Entries whose ``page_content`` is not a ``str`` are left untouched.

    Returns:
        The same ``documents`` object that was passed in.
    """
    drop_brackets = str.maketrans("", "", "[]")
    for doc in documents:
        content = doc.page_content
        if not isinstance(content, str):
            continue
        doc.page_content = content.translate(drop_brackets)
    return documents

In [None]:
# Initialize the LLM once. The notebook imports `OllamaLLM` from
# langchain_ollama; the bare name `Ollama` is never imported and raised a
# NameError here.
llm = OllamaLLM(model="llama3")

# Extra prompt instructions that force the model to emit flat, string-only
# JSON triplets. Braces in the example are doubled so they survive prompt
# templating as literal characters.
no_none_types = (
    "IMPORTANT INSTRUCTIONS:\n"
    "- Output must be a JSON array of flat dictionaries.\n"
    "- Each dictionary MUST contain: 'head', 'head_type', 'tail', 'tail_type', and 'relation'.\n"
    "- All values MUST be plain strings — no lists, dictionaries, or nested structures.\n"
    "- If a field includes multiple values, return only one OR split them into multiple triplets.\n"
    "- Use 'Entity' as the type if unknown.\n"
    "- DO NOT include comments or explanations.\n"
    "- DO NOT hallucinate facts or make up people/places not mentioned.\n"
    "- If the input contains only emojis, punctuation, or non-language symbols, return an empty list (`[]`) and nothing else.\n"
    "- Example:\n"
    "[{{\"head\": \"user\", \"head_type\": \"Person\", \"tail\": \"Bar-Ilan\", \"tail_type\": \"Organization\", \"relation\": \"affiliated_with\"}}]"
)

# Empty allow-lists leave node and relationship types unrestricted.
llm_transformer = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=[],
    allowed_relationships=[],
    additional_instructions=no_none_types,
)

# Extract entities and relations from every document (slow: one LLM call each).
graph_documents = llm_transformer.convert_to_graph_documents(documents)

In [None]:
# Persist the extracted graph documents to Neo4j, with the base-entity label
# and the link back to each source document both enabled.
graph.add_graph_documents(
    graph_documents,
    include_source=True,
    baseEntityLabel=True,
)

In [6]:
# Sentence-embedding model used to vectorise the Document text.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Build a hybrid-search vector store over the Document nodes already in the
# graph: the `text` property is embedded and stored in `embedding`.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)

# Connect to the graph

In [None]:
import os
import pandas as pd
from neo4j import GraphDatabase
import networkx as nx

# This cell is a restart point for the analysis half of the notebook, so load
# the .env file here as well; otherwise the lookups below return None on a
# fresh kernel. load_dotenv() does not override variables that are already set.
from dotenv import load_dotenv

load_dotenv()

# Directory that receives the exported edge lists.
PTH = os.getenv("PTH")

# Profile suffix selecting which NEO4J_URI_<profile> / NEO4J_PASSWORD_<profile>
# pair is read from the environment.
x = os.getenv("user_profile")
if not x:
    # Without this check the string concatenation below fails with an opaque
    # "can only concatenate str (not NoneType)" TypeError.
    raise RuntimeError(
        "Environment variable 'user_profile' is not set; it selects which "
        "NEO4J_URI_<profile> / NEO4J_PASSWORD_<profile> pair to use."
    )

NEO4J_USERNAME = os.getenv("NEO4J_USER")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")

NEO4J_URI = os.getenv("NEO4J_URI_" + x)
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD_" + x)
if NEO4J_URI is None or NEO4J_PASSWORD is None:
    raise RuntimeError(
        f"NEO4J_URI_{x} and NEO4J_PASSWORD_{x} must both be set "
        "in the environment or the .env file."
    )

print(NEO4J_URI)

In [None]:
# Cypher queries returning the internal edges of the subgraph made of the
# Document nodes of one user type plus their direct neighbours. Both queries
# differ only in the opening MATCH; everything after it is shared.
_SUBGRAPH_QUERY_TAIL = """
WITH collect(d) AS docs
UNWIND docs AS d
MATCH (d)--(n)  // collect connected nodes
WITH collect(DISTINCT d) + collect(DISTINCT n) AS S
UNWIND S AS node
MATCH (node)-[r]-(other)
WHERE other IN S
RETURN DISTINCT startNode(r).id AS source, endNode(r).id AS target, type(r) AS relation
"""

# Documents with user_type 'HUMAN'.
cypher_query_HI = "\nMATCH (d:Document {user_type: 'HUMAN'})" + _SUBGRAPH_QUERY_TAIL

# Documents with user_type 'AI'.
cypher_query_AIVI = "\nMATCH (d:Document {user_type: 'AI'})" + _SUBGRAPH_QUERY_TAIL

# Function to extract data
def extract_edgelist(uri, user, password, database, query):
    """Run a read-only Cypher query and return its rows as a DataFrame.

    Args:
        uri: Neo4j connection URI.
        user: Neo4j user name.
        password: Password for ``user``.
        database: Name of the database to open the session on.
        query: Cypher query text to execute.

    Returns:
        A ``pandas.DataFrame`` with one row per returned record and one column
        per key in the query's RETURN clause. The columns are present even
        when the query returns no rows.
    """
    def _fetch(tx):
        result = tx.run(query)
        # Read the keys alongside the records so an empty result still
        # carries its column names.
        return result.keys(), result.data()

    # Context managers close the session and the driver even if the query
    # raises; the previous explicit driver.close() was skipped on error.
    with GraphDatabase.driver(uri, auth=(user, password)) as driver:
        with driver.session(database=database) as session:
            columns, records = session.execute_read(_fetch)
    return pd.DataFrame(records, columns=list(columns))

# Pull both edge lists from Neo4j using the same connection settings.
_neo4j_connection = (NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_DATABASE)
df_edgelist_HI = extract_edgelist(*_neo4j_connection, cypher_query_HI)
df_edgelist_AIVI = extract_edgelist(*_neo4j_connection, cypher_query_AIVI)

# Write each edge list to a CSV file under PTH (row index omitted).
hi_csv_path = os.path.join(PTH, "HI_subgraph_edgelist.csv")
aivi_csv_path = os.path.join(PTH, "AIVI_subgraph_edgelist.csv")
df_edgelist_HI.to_csv(hi_csv_path, index=False)
df_edgelist_AIVI.to_csv(aivi_csv_path, index=False)

In [None]:
# Directed graph from the HUMAN edge list; each edge keeps its relation type.
G_HI = nx.from_pandas_edgelist(
    df_edgelist_HI,
    source='source',
    target='target',
    edge_attr='relation',
    create_using=nx.DiGraph(),
)

# Size summary.
node_count = G_HI.number_of_nodes()
edge_count = G_HI.number_of_edges()
print(f"Graph has {node_count} nodes and {edge_count} edges.")

# Per-node centrality scores.
degree_centrality = nx.degree_centrality(G_HI)
betweenness = nx.betweenness_centrality(G_HI)

# One row per node, in degree_centrality's iteration order.
ordered_nodes = list(degree_centrality.keys())
centrality_df = pd.DataFrame({
    "node": ordered_nodes,
    "degree_centrality": [degree_centrality[node] for node in ordered_nodes],
    "betweenness": [betweenness[node] for node in ordered_nodes],
})

# The ten nodes with the highest degree centrality.
top_by_degree = centrality_df.sort_values(by="degree_centrality", ascending=False).head(10)
print(top_by_degree)
