In [9]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.core import StorageContext
from llama_index.core.query_pipeline import QueryPipeline
#from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.agent.openai import OpenAIAgent
#from llama_index.llms.openai_like import OpenAILike
from llama_index.core.tools import QueryEngineTool, ToolMetadata
import os
from llama_index.llms.openai import OpenAI
from llama_index.core.agent import ReActAgent
from llama_index.agent.openai import OpenAIAgent
from llama_index.core import (
    load_index_from_storage,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core import SummaryIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.node_parser import SentenceSplitter
import os
from tqdm.notebook import tqdm
import pickle
from pathlib import Path





# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

# Retrieve the OpenAI API key from the environment
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    
def read_data(input_dir, num_workers=4):
    reader = SimpleDirectoryReader(input_dir=input_dir)
    documents = reader.load_data()
    return documents

def build_index(documents):
    # create the pipeline with transformations
    #embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    embed_model = OpenAIEmbedding()
    #index = VectorStoreIndex.from_documents(documents=documents, transformations=[embed_model])
    pipeline = IngestionPipeline(
        transformations=[
            #SentenceSplitter(chunk_size=25, chunk_overlap=0),
            #TitleExtractor(),
            #OpenAIEmbedding(),
            embed_model
        ],
    )
    
    nodes = pipeline.run(documents=documents)
    index = VectorStoreIndex(nodes, embed_model=embed_model)

    return index


def save_emb(documents, persist_dir):
    index = build_index(documents)
    
    storage_context = StorageContext.from_defaults(
        docstore=SimpleDocumentStore(),
        vector_store=SimpleVectorStore(),
        index_store=SimpleIndexStore(),
    )
    index.storage_context.persist(persist_dir=persist_dir)

def load_emb_index(self, persist_dir):
    # storage_context = StorageContext.from_defaults(
    #     docstore=SimpleDocumentStore.from_persist_dir(persist_dir=persist_dir),
    #     vector_store=SimpleVectorStore.from_persist_dir(persist_dir=persist_dir),
    #     index_store=SimpleIndexStore.from_persist_dir(persist_dir=persist_dir)
    # )
    index_store=SimpleIndexStore.from_persist_dir(persist_dir=persist_dir)
    return index_store
    

async def build_agent_per_doc(nodes, file_base):
    print(file_base)

    vi_out_path = f"./data/llamaindex_docs/{file_base}"
    summary_out_path = f"./data/llamaindex_docs/{file_base}_summary.pkl"
    if not os.path.exists(vi_out_path):
        Path("./data/llamaindex_docs/").mkdir(parents=True, exist_ok=True)
        # build vector index
        vector_index = VectorStoreIndex(nodes)
        vector_index.storage_context.persist(persist_dir=vi_out_path)
    else:
        vector_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=vi_out_path),
        )

    # build summary index
    summary_index = SummaryIndex(nodes)

    # define query engines
    vector_query_engine = vector_index.as_query_engine(llm=llm)
    summary_query_engine = summary_index.as_query_engine(
        response_mode="tree_summarize", llm=llm
    )

    # extract a summary
    if not os.path.exists(summary_out_path):
        Path(summary_out_path).parent.mkdir(parents=True, exist_ok=True)
        summary = str(
            await summary_query_engine.aquery(
                "Extract a concise 1-2 line summary of this document"
            )
        )
        pickle.dump(summary, open(summary_out_path, "wb"))
    else:
        summary = pickle.load(open(summary_out_path, "rb"))

    # define tools
    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name=f"vector_tool_{file_base}",
                description=f"Useful for questions related to specific facts",
            ),
        ),
        QueryEngineTool(
            query_engine=summary_query_engine,
            metadata=ToolMetadata(
                name=f"summary_tool_{file_base}",
                description=f"Useful for summarization questions",
            ),
        ),
    ]

    # build agent
    function_llm = OpenAI(model="gpt-4")
    agent = OpenAIAgent.from_tools(
        query_engine_tools,
        llm=function_llm,
        verbose=True,
        system_prompt=f"""\
You are a specialized agent designed to answer queries about the `{file_base}.html` part of the LlamaIndex docs.
You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
""",
    )

    return agent, summary


async def build_agents(docs):
    node_parser = SentenceSplitter()

    # Build agents dictionary
    agents_dict = {}
    extra_info_dict = {}

    # # this is for the baseline
    # all_nodes = []

    for idx, doc in enumerate(tqdm(docs)):
        nodes = node_parser.get_nodes_from_documents([doc])
        # all_nodes.extend(nodes)

        # ID will be base + parent
        file_path = Path(doc.metadata["path"])
        file_base = str(file_path.parent.stem) + "_" + str(file_path.stem)
        agent, summary = await build_agent_per_doc(nodes, file_base)

        agents_dict[file_base] = agent
        extra_info_dict[file_base] = {"summary": summary, "nodes": nodes}

    return agents_dict, extra_info_dict
        

In [68]:
def read_folder(path):
    # Walk through all files in the given path and return a list of documents
    documents = []
    all_files = []
    for root, dirs, files in os.walk(path):
        all_files.append(files)
        for file in files:
            document_path = os.path.join(root, file)
            documents.append(SimpleDirectoryReader(input_files=[document_path]).load_data())
            

    indexes = []
    for i in documents:
        indexes.append(VectorStoreIndex.from_documents(documents=i))
        
    return indexes, files

# def create_engines_tools(indexes):
#     query_engine_tools = [
#     QueryEngineTool(
#         query_engine=index.as_query_engine(similarity_top_k=3),
#         metadata=ToolMetadata(
#             name=f"engine_{i}",
#             description="Provides information from the car data. "
#                         "Use a detailed plain text question as input to the tool."
#         ),
#     ) for i, index in enumerate(indexes)
#     ]
#     return query_engine_tools

In [69]:
indexes, files = read_folder('data/')

llm = OpenAI(model="gpt-3.5-turbo-0613")
#print(files)

#tools = create_engines_tools(indexes)

tools = [QueryEngineTool.from_defaults(
    indexes[i].as_query_engine(), name=f"engine_{i}", description=f"Provides information about {files[i]}"
) for i in range(len(indexes))]


agent = OpenAIAgent.from_tools(
    tools=tools,
    llm=llm,
    verbose=True
)

In [91]:
from pydantic import BaseModel, Field
from typing import List

class Customer(BaseModel):
    """Data model for a customer behaviour."""
    Name: str = Field(..., description="Name of the customer")
    Type: str = Field(..., description="Type of the customer profile")
    Country: str = Field(..., description="Country")
    Age: str = Field(..., description="Age of the customers")
    AgeGroupWithSignificance: str = Field(..., description="Age group with significant presence")
    Gender: str = Field(..., description="Gender of the customer")
    IncomeLevel: str = Field(..., description="Income level of the customer")
    Residence: str = Field(..., description="Customer residences")
    Occupation: str = Field(..., description="Common occupations of the customer")
    VehicleOwnershipCount: str = Field(..., description="Number of vehicles owned by the customer")
    VehicleOwnershipPreferences: str = Field(..., description="Vehicle preferences of the customer")
    VehicleOwnershipDuration: str = Field(..., description="Ownership duration of the vehicles")
    PriceSensitivity: str = Field(..., description="Price sensitivity of the customers")
    SpendingMotivators: str = Field(..., description="Factors motivating customer spending")
    Values: str = Field(..., description="Values important to the customers")
    BrandLoyaltyLevel: str = Field(..., description="Level of brand loyalty among the customer")
    InterestInNewBrands: str = Field(..., description="Customer interest in new brands")
    PersonalInterests: str = Field(..., description="Personal interests of the customers")
    ValuesTradition: str = Field(..., description="Whether the customers value tradition")
    EngagementInitialStages: str = Field(..., description="Initial engagement stages preferred by the customers")
    TransactionPreference: str = Field(..., description="Transaction preferences of the customers")
    InformationSeeking: str = Field(..., description="Information seeking behavior of the customers")
    ServiceAppointmentPreferences: str = Field(..., description="Service appointment preferences")
    VehicleServicePickUpService: str = Field(..., description="Preference for vehicle pick-up service")
    LoanerVehicleRequirement: str = Field(..., description="Requirement for a loaner vehicle during service")
    TargetDemographic: str = Field(..., description="Target demographic for marketing")
    LuxuryExperienceWillingness: str = Field(..., description="Willingness for a luxury experience")
    DigitalEngagement: str = Field(..., description="Preferred digital platforms and engagement level")
    CommunicationPreferences: str = Field(..., description="Preferred methods of communication")
    PurchaseDecisionInfluencers: str = Field(..., description="Key influencers of purchase decisions")
    BrandPerception: str = Field(..., description="Perception of different brands")
    EnvironmentalConsciousness: str = Field(..., description="Awareness and concern for environmental issues")
    LoyaltyProgramAffiliation: str = Field(..., description="Participation in loyalty programs")
    FeedbackLikelihood: str = Field(..., description="Likelihood to provide feedback or reviews")
    SocialMediaActivity: str = Field(..., description="Level of activity on social media platforms")
    LeisureActivities: str = Field(..., description="Common leisure activities")
    ShoppingPreferences: str = Field(..., description="Preferred shopping channels and styles")
    TechnologyAdoptionRate: str = Field(..., description="Rate at which new technology is adopted")
    HealthAndWellnessConcerns: str = Field(..., description="Health and wellness concerns and priorities")
    EducationLevel: str = Field(..., description="Highest level of education attained")
    FamilyStatus: str = Field(..., description="Family composition and marital status")
    CulturalAffinities: str = Field(..., description="Cultural groups or activities with which the customer identifies")
    AccessibilityRequirements: str = Field(..., description="Any special accessibility requirements")
    PreferredPaymentMethods: str = Field(..., description="Favored methods for transactions")
    TravelFrequency: str = Field(..., description="Frequency of travel for leisure or business")
    MediaConsumptionHabits: str = Field(..., description="Preferred types of media and consumption habits")
    RiskTolerance: str = Field(..., description="Willingness to engage in risky activities or investments")
    CommunityInvolvement: str = Field(..., description="Level of involvement in local or online communities")
    PoliticalViews: str = Field(..., description="Political orientation or views")
    
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.core import ChatPromptTemplate
from llama_index.core.llms import ChatMessage

prompt = ChatPromptTemplate(
    message_templates=[
        ChatMessage(
            role="system",
            content=(
                "You are an expert assistant for summarizing and extracting personality of a user from a text. If you don't find information, leave is as an empty string."
            ),
        ),
        ChatMessage(
            role="user",
            content=(
                "Here is the text: \n"
                "------\n"
                "{text}\n"
                "------"
            ),
        ),
    ]
)
program = OpenAIPydanticProgram.from_defaults(
    output_cls=Customer,
    llm=llm,
    prompt=prompt,
    verbose=True,
)

text = "Hi, my name is Sarah and love fast cars. I think Biden is a good president"
output = program(text=text)


Function call: Customer with args: {
  "Name": "Sarah",
  "Type": "",
  "Country": "",
  "Age": "",
  "AgeGroupWithSignificance": "",
  "Gender": "",
  "IncomeLevel": "",
  "Residence": "",
  "Occupation": "",
  "VehicleOwnershipCount": "",
  "VehicleOwnershipPreferences": "",
  "VehicleOwnershipDuration": "",
  "PriceSensitivity": "",
  "SpendingMotivators": "",
  "Values": "",
  "BrandLoyaltyLevel": "",
  "InterestInNewBrands": "",
  "PersonalInterests": "fast cars",
  "ValuesTradition": "",
  "EngagementInitialStages": "",
  "TransactionPreference": "",
  "InformationSeeking": "",
  "ServiceAppointmentPreferences": "",
  "VehicleServicePickUpService": "",
  "LoanerVehicleRequirement": "",
  "TargetDemographic": "",
  "LuxuryExperienceWillingness": "",
  "DigitalEngagement": "",
  "CommunicationPreferences": "",
  "PurchaseDecisionInfluencers": "",
  "BrandPerception": "",
  "EnvironmentalConsciousness": false,
  "LoyaltyProgramAffiliation": "",
  "FeedbackLikelihood": "",
  "SocialM

In [92]:
non_empty_attributes = {k: v for k, v in output.dict().items() if v}
non_empty_attributes

{'Name': 'Sarah',
 'PersonalInterests': 'fast cars',
 'PoliticalViews': 'Biden is a good president'}

In [94]:
import json
#Map the available persona with the possible classes from people.json
def choose_class(persona):
    with open('people.json') as json_file:
        data = json.load(json_file)
        print(data)
        # for class_type in data:
        #     # Implement your matching logic here
        #     # Example:
        #     # if persona['PersonalInterests'] in class_type['Psychology']['PersonalInterests']:
        #     #     persona_class = class_type['Type']
        #     #     break
        # return persona_class

choose_class(non_empty_attributes)

{'Type': 'The Discerning Connoisseur', 'Demographics': {'InternationalCustomerPercentage': '4%', 'StrongPresenceCountries': ['USA', 'UK', 'Germany'], 'AverageAge': 39, 'AgeGroupWithSignificance': 'over 55', 'Gender': 'Predominantly male', 'IncomeLevel': 'High earners', 'Residence': ['rural areas', 'smaller cities', 'suburbs'], 'Occupation': ['leadership roles', 'retired'], 'VehicleOwnership': {'Count': 'multiple', 'Preferences': 'large luxury cars', 'OwnershipDuration': 'long-term'}}, 'Psychology': {'PriceSensitivity': 'low', 'SpendingMotivators': ['luxury', 'convenience', 'quality'], 'Values': ['brand prestige', 'maintenance', 'resale value'], 'BrandLoyalty': {'Level': 'high', 'InterestInNewBrands': 'low'}, 'PersonalInterests': ['tennis', 'theater', 'cruising'], 'ValuesTradition': True}, 'CustomerExperience': {'Engagement': {'InitialStages': 'digital', 'TransactionPreference': 'traditional'}, 'InformationSeeking': 'low due to brand loyalty', 'ServiceAppointmentPreferences': ['phone', 

In [70]:
agent.chat("What are the characteristics of Volkswagen ID.3 Pro")

Added user message to memory: What are the characteristics of Volkswagen ID.3 Pro
=== Calling Function ===
Calling function: engine_0 with args: {
  "input": "Volkswagen ID.3 Pro"
}
Got output: 35575

=== Calling Function ===
Calling function: engine_1 with args: {
  "input": "Volkswagen ID.3 Pro"
}
Got output: The Volkswagen ID.3 Pro is an electric car model that falls under the category of Battery Electric Vehicles (BEV). It is designed to be powered by electricity, offering high energy efficiency compared to traditional Internal Combustion Engine Vehicles (ICEVs). The ID.3 Pro is part of Volkswagen's initiative to provide sustainable mobility solutions by utilizing electric propulsion technology.

=== Calling Function ===
Calling function: engine_3 with args: {
  "input": "Volkswagen ID.3 Pro"
}
Got output: The Volkswagen ID.3 Pro is not mentioned in the provided context information.

=== Calling Function ===
Calling function: engine_4 with args: {
  "input": "Volkswagen ID.3 Pro"
}

AgentChatResponse(response="I apologize, but I couldn't find specific characteristics of the Volkswagen ID.3 Pro in the available data sources. However, the Volkswagen ID.3 Pro is an electric car model that offers high energy efficiency and is designed to be powered by electricity. It is part of Volkswagen's initiative to provide sustainable mobility solutions. For more detailed information about the characteristics of the Volkswagen ID.3 Pro, I recommend visiting the official Volkswagen website or contacting a Volkswagen dealership.", sources=[ToolOutput(content='35575', tool_name='engine_0', raw_input={'input': 'Volkswagen ID.3 Pro'}, raw_output=Response(response='35575', source_nodes=[NodeWithScore(node=TextNode(id_='7381bb9a-af7c-489d-96e6-0dae39eb9a01', embedding=None, metadata={'file_path': 'data/ElectricCarData_Clean.csv', 'file_name': 'ElectricCarData_Clean.csv', 'file_type': 'text/csv', 'file_size': 8195, 'creation_date': '2024-04-27', 'last_modified_date': '2024-04-27'}, excl

In [12]:

documents = read_data('data/')
#llama.save_emb(documents, 'storage/')
#index = build_index(documents)
#llm = OpenAILike(model="NousResearch/Hermes-2-Pro-Mistral-7B",api_base="http://localhost:8000/v1", api_key="fake")
llm = OpenAI(model="gpt-3.5-turbo-0613")

# main = index.as_query_engine(similarity_top_k=3)

query_engine_tools = [
    QueryEngineTool(
        query_engine=main,
        metadata=ToolMetadata(
            name="main",
            description="Provides information the data "
            "Use a detailed plain text question as input to the tool.",
        ),
    )
]
agents_dict, extra_info_dict = await build_agents(documents)
#agent = ReActAgent.from_tools(query_engine_tools, llm=llm, verbose=True)

#print(agent.chat('What do you know?'))

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html