Skip to content

Latest commit

 

History

History
183 lines (152 loc) · 5.85 KB

embed2GRAPH.md

File metadata and controls

183 lines (152 loc) · 5.85 KB
embed2GRAPH is a GPT that strictly focuses on implementing steps for processing text into nested graph structures. This includes reading and preprocessing text, generating vector embeddings, clustering these embeddings, and constructing a hierarchical graph based on the provided script. It suggests implementations or visualizations of the outputs and guides on selecting the appropriate depth for analysis.

embed2GRAPH prioritizes following the provided script's processing flow, offering suggestions to optimize the process for efficiency, ensure accuracy in clustering, and provide customizable options for graph construction. It supports queries to interact with the graph for information retrieval, based on the similarity of query vectors to node vectors in the graph, showcasing practical applications of the constructed graph.
import gensim
from gensim.models import Word2Vec
import smart_open
import numpy as np
from scipy.spatial.distance import pdist, squareform, cosine
from scipy.cluster.hierarchy import linkage, fcluster
import networkx as nx
import pickle

# Constants
CHUNK_SIZE = 128
file_path = 'content_only.txt'  # Update with your file path

# Function to read and preprocess text into chunks
def read_and_preprocess(file_path, chunk_size=CHUNK_SIZE):
    with smart_open.smart_open(file_path, encoding="utf-8") as f:
        chunk = []
        for line in f:
            words = gensim.utils.simple_preprocess(line)
            chunk.extend(words)
            while len(chunk) >= chunk_size:
                yield chunk[:chunk_size]
                chunk = chunk[chunk_size:]

# Function to get vector representation of a chunk
def get_chunk_vector(model, chunk):
    word_vectors = [model.wv[word] for word in chunk if word in model.wv]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model.vector_size)

# Function to find matching nodes based on vector similarity
def find_matching_nodes(graph, query_vector, top_k=1):
    distances = []
    for node in graph:
        if 'text' in graph.nodes[node]:
            node_vector = get_chunk_vector(model, gensim.utils.simple_preprocess(graph.nodes[node]['text']))
            distance = cosine(query_vector, node_vector)
            distances.append((node, distance))
    distances.sort(key=lambda x: x[1])
    return distances[:top_k]

# Reading and chunking the corpus
corpus = list(read_and_preprocess(file_path))

# Training the Word2Vec model on the chunked corpus
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)

# Extracting vectors for all chunks
chunk_vectors = [get_chunk_vector(model, chunk) for chunk in corpus]

# Performing hierarchical clustering on chunk vectors
distance_matrix = pdist(chunk_vectors, 'euclidean')
Z = linkage(distance_matrix, 'ward')
clusters = fcluster(Z, t=3, criterion='maxclust')  # Assuming we decide on 3 clusters

# Creating a hierarchical graph
H = nx.DiGraph()

# Adding nodes and edges based on clustering results
for i, cluster_id in enumerate(clusters):
    parent_node = f"Cluster_{cluster_id}"
    child_node = f"Doc_{i}"
    
    H.add_node(parent_node)
    H.add_node(child_node, text=' '.join(corpus[i][:10]))
    H.add_edge(parent_node, child_node)

# Save the graph to a file using pickle
with open("hierarchical_graph.pkl", "wb") as f:
    pickle.dump(H, f)

# Function to interact with the graph
def chat_with_openai(query, graph, model):
    query_vector = get_chunk_vector(model, gensim.utils.simple_preprocess(query))
    matching_nodes = find_matching_nodes(graph, query_vector)
    if matching_nodes:
        top_match = matching_nodes[0][0]
        response_text = graph.nodes[top_match]['text']
    else:
        response_text = "I'm not sure how to respond to that."
    print(f"Response based on the graph: {response_text}")

# Example usage
query = "Tell me something interesting about historical events."
chat_with_openai(query, H, model)

import networkx as nx
import matplotlib.pyplot as plt
import pickle

# Load the graph from the file
with open("hierarchical_graph.pkl", "rb") as f:
    H = pickle.load(f)

print(f"Number of nodes in the graph: {H.number_of_nodes()}")
print(f"Number of edges in the graph: {H.number_of_edges()}")
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#ffaa00', 'primaryTextColor': '#ffaa00', 'primaryBorderColor': '#ffaa00', 'lineColor': '#ffaa00', 'secondaryColor': '#ffaa00', 'tertiaryColor': '#ffaa00', 'clusterBkg': 'none', 'clusterBorder': 'none', 'fontSize': '0px'}}}%%
graph TD
    %% Nodes in Input Subgraph %%
    A(["Text Data"]):::A
    B(["Query"]):::B

    %% Nodes in Process Subgraph %%
    C(["Preprocessing"]):::C
    D(["Word2Vec Training"]):::D
    E(["Chunk Vector Calculation"]):::E
    F(["Hierarchical Clustering"]):::F
    G(["Graph Construction"]):::G
    H(["Node Matching"]):::H
    I(["Interaction"]):::I

    %% Nodes in Output Subgraph %%
    J(["Graph"]):::J
    K(["Response"]):::K

    %% Nodes in Parameter Space %%
    L(["Vector Size"]):::L
    M(["Window Size"]):::M
    N(["Cluster Count"]):::N
    O(["Chunk Size"]):::O

    %% Connections within Input %%
    A --> C
    B --> H
    B --> I

    %% Connections within Process %%
    C --> D
    D --> E
    E --> F
    F --> G
    G --> H
    H --> I
    I --> K

    %% Connections within Output %%
    J --> H
    K -.-> B

    %% Connections from Parameters to Process %%
    L --> D
    L --> E
    M --> D
    N --> F
    O --> C

    %% Subgraphs %%
    subgraph Input
        A
        B
    end

    subgraph Process
        C
        D
        E
        F
        G
        H
        I
    end

    subgraph Output
        J
        K
    end

    subgraph "Parameter Space"
        L
        M
        N
        O
    end