In [6]:
import pandas as pd
import numpy as np
import os
from langchain_community.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain_community.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
import random

### First experiment - load only one document


In [19]:
# First experiment - load only one document

from langchain_community.document_loaders import TextLoader
splitter = RecursiveCharacterTextSplitter(
    chunk_size=4500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

from df_helpers import *



In [None]:
import uuid
import pandas as pd
import numpy as np

import requests
import json

#pip install -q -U google-generativeai

import google.generativeai as genai


def documents2Dataframe(documents) -> pd.DataFrame:
    rows = []
    for chunk in documents:
        row = {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        rows = rows + [row]

    df = pd.DataFrame(rows)
    return df

def graphPrompt3(input: str, metadata={}):
    
    GOOGLE_API_KEY='AIzaSyC-Oev_8ZghBDAk-1INW9WuaGf7W86oKS8'

    genai.configure(api_key=GOOGLE_API_KEY)
    
    prompt = f'''
        **Instructions:**

        You are a network graph maker who extracts entities and relationships from a provided text (input) and use them to build a Knowledge Graph.

        1. Carefully read the provided text to identify mentioned entities. Entities can be people, places, organizations, events, concepts, etc.Only consider the name of the most semantically relevant identified entities. Ignore the referencies/citations along the text.

        2. Identify concise, meaningfull relationships between the mentioned entities in the text. Relationships can be actions, associations, properties, etc. Only consider the most semantically relevant relationships between entities.

        3. Based on the identified entities and relationships, construct a Knowledge Graph representing entities as nodes and relationships as edges.

        4. Take care of your output token limit. Be concise in your output and strictly comply with the expected output structure (list of JSON objects).
        
        **Example:**
        
        *Context (input):*
            "Albert Einstein was born in Ulm, in the Kingdom of Württemberg, in the German Empire, on March 14, 1879. He was a theoretical physicist, best known for developing the theory of relativity (Silva et al., Rodrigues et al), one of the two pillars of modern physics."

        *Expected Output (list of JSON objects):*
            [
                {{"node_1": "Albert Einstein", "node_2": "Ulm", "edge": "born in"}},
                {{"node_1": "Ulm", "node_2": "Kingdom of Württemberg", "edge": "located in"}},
                {{"node_1": "Kingdom of Württemberg", "node_2": "German Empire", "edge": "located in"}},
                {{"node_1": "Albert Einstein", "node_2": "March 14, 1879", "edge": "date of"}},
                {{"node_1": "Albert Einstein", "node_2": "Theory of relativity", "edge": "known for"}},
                {{"node_1": "Theory of relativity", "node_2": "Modern physics", "edge": "one of the two pillars of"}}
            ]

        *Context:*
            "{input}"

        *Expected Output (list of JSON objects):*

    '''
    
    model = genai.GenerativeModel()
    response = model.generate_content(prompt)
    try:
        result = json.loads(response.text)
        result = [dict(item) for item in result]
    except:
        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        result = None
        
    return result

def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    # dataframe.reset_index(inplace=True)
    results = dataframe.apply(
        lambda row: graphPrompt3(row.text, {"chunk_id": row.chunk_id}), axis=1
    )
    # invalid json results in NaN
    results = results.dropna()
    results = results.reset_index(drop=True)

    ## Flatten the list of lists to one single list of entities.
    concept_list = np.concatenate(results).ravel().tolist()
    return concept_list

def graph2Df(nodes_list) -> pd.DataFrame:
    ## Remove all NaN entities
    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
    graph_dataframe["node_1"] = graph_dataframe["node_1"].apply(lambda x: x.lower())
    graph_dataframe["node_2"] = graph_dataframe["node_2"].apply(lambda x: x.lower())

    return graph_dataframe



In [20]:
loader = TextLoader("semantic/f2422c065255c413e57b04fc2ef215f1d6ac841c.txt", encoding="utf-8")
load_doc = loader.load()
texts = splitter.split_documents(load_doc)
df = documents2Dataframe(texts)
df_copy = df.copy()

concepts_list = df2Graph(df_copy)
dfg1 = graph2Df(concepts_list)
dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1.to_csv("out/"+"test.csv",sep="|", index=False)
#df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)


In [30]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    concepts_list = df2Graph(df_copy)
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv("out/"+"test.csv",sep="|", index=False)
    #df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
#dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

PermissionError: [Errno 13] Permission denied: 'out/test.csv'

In [249]:
dfg1.tail(15)

Unnamed: 0,node_1,node_2,edge
309,lucia rotaris,formal analysis,contribution to
310,lucia rotaris,investigation,contribution to
311,lucia rotaris,writing - original draft,contribution to
312,marco giansoldati,investigation,contribution to
313,marco giansoldati,data curation,contribution to
314,marco giansoldati,writing - review & editing,contribution to
315,mariangela scorrano,investigation,contribution to
316,mariangela scorrano,data curation,contribution to
317,mariangela scorrano,writing - review & editing,contribution to
318,albert einstein,ulm,born in


### Second Experiment - Load a sample of documents (5)

In [None]:
## Input data directory
data_dir = "documents_sampling"
inputdirectory = Path(f"./data_input/{data_dir}")

## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output")

In [274]:
# Directory containing the .txt files
directory = "./data_input/documents_sampling"

# List to store loaded documents
documents = []

# Iterate over each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        loader = TextLoader(file_path)
        documents.extend(loader.load())

# Now, the 'documents' list contains all the documents loaded from the .txt files in the directory.

In [276]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=4500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)


In [280]:
texts = splitter.split_documents(documents)


In [283]:
texts

[Document(page_content='Vol.:(0123456789) Artificial Intelligence Review (2023) 56:4667–4709 https://doi.org/10.1007/s10462-022-10293-3 1 3 Recent advances in the application of deep learning for fault diagnosis of rotating machinery using vibration signals Bayu Adhi Tama1 · Malinda Vania2,3 · Seungchul Lee4,5 · Sunghoon Lim2,3 Published online: 9 October 2022 © The Author(s) 2022 Abstract Vibration measurement and monitoring are essential in a wide variety of applications. Vibration measurements are critical for diagnosing industrial machinery malfunctions because they provide information about the condition of the rotating equipment. Vibra- tion analysis is considered the most effective method for predictive maintenance because it is used to troubleshoot instantaneous faults as well as periodic maintenance. Numer- ous studies conducted in this vein have been published in a variety of outlets. This review documents data-driven and recently published deep learning techniques for vibrat

In [285]:
df_2 = documents2Dataframe(texts)
print(df_2.shape)


(99, 3)


In [288]:
df_2_copy = df_2.copy()

In [290]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    concepts_list = df2Graph(df_2_copy)
    print('=========== Extraction done ===============')
    dfg2 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg2.to_csv(outputdirectory/"2_graph.csv", sep=",", index=False)
    df_2.to_csv(outputdirectory/"2_chunks.csv", sep=",", index=False)
else:
    dfg2 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

dfg2.replace("", np.nan, inplace=True)
dfg2.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
#dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg2.shape)
dfg2.head(20)



ERROR ### Here is the buggy response:  response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=glm.GenerateContentResponse({'candidates': [{'content': {'parts': [{'text': '**Expected Output (list of JSON objects):**\n\n[\n  {\n    "node_1": "Convolutional neural network (CNN)",\n    "node_2": "Data with a grid-like topology",\n    "edge": "specializes in processing"\n  },\n  {\n    "node_1": "CNN",\n    "node_2": "LeNet-5 architecture",\n    "edge": "first introduced in"\n  },\n  {\n    "node_1": "CNN",\n    "node_2": "Image data",\n    "edge": "typically utilized on"\n  },\n  {\n    "node_1": "CNN",\n    "node_2": "Two-dimensional CNN",\n    "edge": "known as"\n  },\n  {\n    "node_1": "CNN",\n    "node_2": "Time series data",\n    "edge": "used for"\n  },\n  {\n    "node_1": "1D CNN",\n    "node_2": "Kernels and feature maps",\n    "edge": "represented by 1D arrays"\n  },\n  {\n    "node_1": "1D CNN",\n    "node_2": "Computational complexity",\n    "edge": "

Unnamed: 0,node_1,node_2,edge,weight
0,vibration measurement,vibration signal,associated with,
1,vibration measurement,fault diagnosis,used for,
2,vibration measurement,rotating machinery,of,
3,vibration analysis,vibration-based condition monitoring,type of,
4,vibration analysis,predictive maintenance,used for,
5,deep learning,fault diagnosis,used for,
6,deep learning,vibration signal,based on,
7,department of information systems,university of maryland,located at,
8,department of industrial engineering,ulsan national institute of science and techno...,located at,
9,industry intelligentization institute,ulsan national institute of science and techno...,located at,
