# **Setting up the Environment**

In [12]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_openai import ChatOpenAI
import os
import warnings
from dotenv import load_dotenv
from typing import Dict, Any, Optional, List
from pathlib import Path

warnings.filterwarnings('ignore')
load_dotenv()

# Set environment variables
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")

# **Load and Process Documents**

In [13]:
def load_and_process_documents(
    directory: str, 
    embeddings: OpenAIEmbeddings, 
    text_splitter: RecursiveCharacterTextSplitter
) -> Optional[FAISS]:
    """
    Load and process documents from the specified directory and create a FAISS instance.

    Args:
        directory (str): The directory containing the PDF documents.
        embeddings (OpenAIEmbeddings): The embeddings model.
        text_splitter (RecursiveCharacterTextSplitter): The text splitter.

    Returns:
        Optional[FAISS]: The FAISS instance or None if no documents were found.
    """
    loader = PyPDFDirectoryLoader(directory)
    documents = loader.load()
    final_documents = text_splitter.split_documents(documents)

    if not final_documents:
        print(f"No documents found in the '{directory}' directory.")
        return None

    faiss_instance = FAISS.from_documents(final_documents, embeddings)
    return faiss_instance

# **Define the Prompt for Literature Review**

In [14]:
literature_review_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    Based on the following research papers:
    {context}
    
    Write a comprehensive literature review addressing the research question:
    {question}
    
    The literature review should:
    1. Summarize key findings and methodologies
    2. Identify trends and patterns in the research
    3. Highlight gaps or contradictions in the existing literature
    4. Explain how the current research relates to previous studies
    """
)

# **Setup the Chain**

In [15]:
def setup_chain(
    model_name: str = "gpt-3.5-turbo", 
    temperature: float = 0.7
) -> StuffDocumentsChain:
    """
    Set up the LLM chain with the specified model and temperature.

    Args:
        model_name (str): The model name.
        temperature (float): The temperature for the model.

    Returns:
        StuffDocumentsChain: The configured chain.
    """
    llm = ChatOpenAI(model_name=model_name, temperature=temperature)
    llm_chain = LLMChain(llm=llm, prompt=literature_review_template)
    return StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name="context",
    )

# **Generate Literature Review**

In [16]:
def generate_literature_review(
    research_question: str,
    db: FAISS,
    chain: StuffDocumentsChain,
    k: int = 5
) -> Dict[str, Any]:
    """
    Generate a literature review based on the research question using the FAISS database and the chain.

    Args:
        research_question (str): The research question.
        db (FAISS): The FAISS instance.
        chain (StuffDocumentsChain): The chain to process the documents.
        k (int): The number of documents to retrieve.

    Returns:
        Dict[str, Any]: The result containing the literature review, number of source documents, and source document metadata.
    """
    try:
        retriever = db.as_retriever(search_kwargs={"k": k})
        docs = retriever.get_relevant_documents(research_question)
        result = chain.run(input_documents=docs, question=research_question)

        return {
            "literature_review": result,
            "num_source_documents": len(docs),
            "source_documents": [doc.metadata for doc in docs]
        }
    except Exception as e:
        return {"error": str(e)}


# **Main Function**

In [17]:
# Usage
if __name__ == "__main__":
    # Define embeddings and text splitter
    # embeddings = OpenAIEmbeddings()
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    
    # # Load and process documents (with caching)
    # db = load_and_process_documents("Research-Papers", embeddings, text_splitter)

    # if db is None:
    #     print("No documents found. Exiting...")
    #     exit()

    # # Setup chain
    # chain = setup_chain(model_name="gpt-3.5-turbo", temperature=0.7)

    # Generate literature review
    research_question = "Tell me about these research papers and how they relate to bearing?"
    result = generate_literature_review(research_question, db, chain, k=30)
    
    if "error" in result:
        print(f"An error occurred: {result['error']}")
    else:
        print("Literature Review:")
        print(result["literature_review"])
        print(f"\nNumber of source documents: {result['num_source_documents']}")
        print("\nSource document metadata:")
        for i, metadata in enumerate(result["source_documents"], 1):
            print(f"Document {i}: {metadata}")

Literature Review:
The research papers discussed in this literature review focus on the fault diagnosis and analysis of bearings, particularly in the context of rotating machinery such as induction motors. The papers employ various methods, including machine learning algorithms, feature extraction, and statistical analysis of vibration signals, to predict and diagnose bearing faults. 

Patel and Giri (2016) and Patel and Giri (2018) present studies on the condition monitoring of induction motor bearings using Artificial Neural Networks (ANN) and feature extraction for bearing fault analysis. These papers emphasize the importance of monitoring and analyzing bearing conditions to prevent breakdowns in machinery.

Pham, Kim, and Kim (2020) and Plakias and Boutalis (2020) introduce deep learning-based methods for bearing fault diagnosis, specifically focusing on embedded systems and attentive fault detection and identification in rolling element bearings. These papers highlight the advance

In [24]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_openai import ChatOpenAI
import os
import warnings
from dotenv import load_dotenv
from typing import Dict, Any, Optional, List
from pathlib import Path

warnings.filterwarnings('ignore')
load_dotenv()

# Set environment variables
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")

def load_and_process_document(
    file_path: str, 
    embeddings: OpenAIEmbeddings, 
    text_splitter: RecursiveCharacterTextSplitter
) -> Optional[FAISS]:
    """
    Load and process a single PDF document and create a FAISS instance.

    Args:
        file_path (str): The file path of the PDF document.
        embeddings (OpenAIEmbeddings): The embeddings model.
        text_splitter (RecursiveCharacterTextSplitter): The text splitter.

    Returns:
        Optional[FAISS]: The FAISS instance or None if no documents were found.
    """
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    final_documents = text_splitter.split_documents(documents)

    if not final_documents:
        print(f"No documents found in the file '{file_path}'.")
        return None

    faiss_instance = FAISS.from_documents(final_documents, embeddings)
    return faiss_instance

def create_literature_review_prompt(questions: List[str]) -> PromptTemplate:
    """
    Create a PromptTemplate for the literature review using user-defined questions.

    Args:
        questions (List[str]): A list of questions to include in the prompt.

    Returns:
        PromptTemplate: The generated prompt template.
    """
    questions_str = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
    template_str = f"""
    Based on the following research papers:
    {{context}}
    
    Answer the following questions comprehensively, providing detailed summaries and analyses where applicable:
    {questions_str}
    
    The answers should be detailed and integrate information across all the provided papers.
    """
    return PromptTemplate(input_variables=["context", "question"], template=template_str)

def setup_chain(
    questions: List[str],
    model_name: str = "gpt-3.5-turbo", 
    temperature: float = 0.7,
    max_tokens: int = 3000
) -> StuffDocumentsChain:
    """
    Set up the LLM chain with the specified model and temperature.

    Args:
        questions (List[str]): A list of questions to include in the prompt.
        model_name (str): The model name.
        temperature (float): The temperature for the model.
        max_tokens (int): The maximum number of tokens for the model response.

    Returns:
        StuffDocumentsChain: The configured chain.
    """
    prompt_template = create_literature_review_prompt(questions)
    llm = ChatOpenAI(model_name=model_name, temperature=temperature, max_tokens=max_tokens)
    llm_chain = LLMChain(llm=llm, prompt=prompt_template)
    return StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name="context",
    )

def generate_literature_review(
    research_question: str,
    user_questions: List[str],
    db: FAISS,
    chain: StuffDocumentsChain,
    k: int = 5
) -> Dict[str, Any]:
    """
    Generate a literature review based on the research question using the FAISS database and the chain.

    Args:
        research_question (str): The research question.
        user_questions (List[str]): A list of user-defined questions.
        db (FAISS): The FAISS instance.
        chain (StuffDocumentsChain): The chain to process the documents.
        k (int): The number of documents to retrieve.

    Returns:
        Dict[str, Any]: The result containing the literature review, number of source documents, and source document metadata.
    """
    try:
        retriever = db.as_retriever(search_kwargs={"k": k})
        docs = retriever.get_relevant_documents(research_question)
        result = chain.run(input_documents=docs, question=research_question)

        return {
            "literature_review": result,
            "num_source_documents": len(docs),
            "source_documents": [doc.metadata for doc in docs]
        }
    except Exception as e:
        return {"error": str(e)}

def write_results_to_file(file_path: Path, content: str) -> None:
    """
    Write content to a text file.

    Args:
        file_path (Path): The path of the text file.
        content (str): The content to write.
    """
    with open(file_path, 'w') as file:
        file.write(content)

# Usage
if __name__ == "__main__":
    # Define embeddings and text splitter
    embeddings = OpenAIEmbeddings()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
    
    # Define user questions
    user_questions = [
        "What system is being monitored by the researcher (bearing, axel, some other physical system)?",
        "How are they monitoring that system? (Accelerometer, current sensor etc.)",
        "What failures/operating conditions are they trying to identify?",
        "What pre-processing techniques are they using? (e.g. Fourier transform)",
        "What models are they using to determine machine/operating condition/failure?",
        "How are they evaluating the performance of their model?",
        "What are the conclusions or findings of the paper?"
    ]

    # Setup chain
    chain = setup_chain(user_questions, model_name="gpt-3.5-turbo", temperature=0.7, max_tokens=1600)

    # Get list of all PDF files in the directory
    directory_path = "Research-Papers"
    pdf_files = [f for f in Path(directory_path).glob("*.pdf")]

    # Initialize content accumulator
    accumulated_content = ""

    # Iterate over each PDF file, process it, and generate literature review
    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file}")

        # Load and process document
        db = load_and_process_document(pdf_file, embeddings, text_splitter)

        if db is None:
            accumulated_content += f"No documents found in the file '{pdf_file}'.\n"
            continue

        # Generate literature review
        research_question = "Provide a comprehensive literature review based on the research papers."
        result = generate_literature_review(research_question, user_questions, db, chain, k=30)

        accumulated_content += f"Literature Review for file '{pdf_file}':\n"
        if "error" in result:
            accumulated_content += f"An error occurred: {result['error']}\n"
        else:
            accumulated_content += f"{result['literature_review']}\n"
            accumulated_content += f"\nNumber of source documents: {result['num_source_documents']}\n"
            accumulated_content += "\nSource document metadata:\n"
            for i, metadata in enumerate(result["source_documents"], 1):
                accumulated_content += f"Document {i}: {metadata}\n"

    # Write all accumulated results to a single text file
    output_file_path = Path(directory_path) / "Consolidated_Literature_Review.txt"
    write_results_to_file(output_file_path, accumulated_content)

    print(f"All results written to '{output_file_path}'")


Processing file: Research-Papers/The quality management ecosystem for predictive maintenance in the Industry 4.0 era_.pdf
Processing file: Research-Papers/From knowledge-based to big data analytic model_ a novel IoT and machine learning based decision support system for predictive maintenance in Industry 4.0.pdf
Processing file: Research-Papers/Application of data-driven models to predictive maintenance_ Bearing wear.pdf
Processing file: Research-Papers/Challenges and Opportunities of Condition-based Predictive.pdf
Processing file: Research-Papers/A systematic literature review of machine learning methods applied to_predictive maintenance.pdf
Processing file: Research-Papers/An integrated fault diagnosis and prognosis approach for predictive_maintenance of wind turbine bearing with limited samples.pdf
Processing file: Research-Papers/Predictive Maintenance in Building Facilities_ A Machine Learning-Based Approach_.pdf
Processing file: Research-Papers/Predictive maintenance enabled by m

In [28]:
from pathlib import Path

def clean_file(file_path: Path) -> None:
    """
    Remove specific sections from the content of a file.

    Args:
        file_path (Path): The path of the text file to be cleaned.
    """
    # Read the content of the file
    with open(file_path, 'r') as file:
        content = file.read()

    # Define the sections to be removed
    section_start1 = "\nNumber of source documents:"
    section_start2 = "\nSource document metadata:"
    
    # Find the start positions of sections to remove
    pos_start1 = content.find(section_start1)
    pos_start2 = content.find(section_start2)

    if pos_start1 != -1:
        # Remove the section starting from "Number of source documents:"
        end_pos1 = content.find("\n\n", pos_start1) if content.find("\n\n", pos_start1) != -1 else len(content)
        content = content[:pos_start1] + content[end_pos1:]
    
    if pos_start2 != -1:
        # Remove the section starting from "Source document metadata:"
        end_pos2 = content.find("\n\n", pos_start2) if content.find("\n\n", pos_start2) != -1 else len(content)
        content = content[:pos_start2] + content[end_pos2:]
    
    # Write the cleaned content back to the file
    with open(file_path, 'w') as file:
        file.write(content)
        print(content)

    print(f"File '{file_path}' cleaned successfully.")

# Usage
if __name__ == "__main__":
    # Define the path to the file
    file_path = Path("Research-Papers/Consolidated_Literature_Review.txt")
    
    # Clean the file
    clean_file(file_path)


Literature Review for file 'Research-Papers/The quality management ecosystem for predictive maintenance in the Industry 4.0 era_.pdf':
1. The researchers are monitoring various physical systems and components such as engines, turbines, machinery, and equipment in industries such as manufacturing and service.

2. They are monitoring these systems using advanced digital technologies such as big data analytics, AI, smart sensors, IoT, and platform construction. These technologies collect real-time data from sensors embedded in the equipment to monitor conditions and performance.

3. The researchers are trying to identify potential failures, defects, or abnormalities in the monitored systems. They are focusing on predictive maintenance to detect issues before they escalate and cause downtime or safety concerns. They are also monitoring operating conditions to optimize performance and prevent failures.

4. The pre-processing techniques used include data analysis, signal processing, and mach