In [None]:
import os
import pandas as pd
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
import re
import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings


def preprocess_pdf_files_for_LLM(path: str):
    pdf_docs = []
    for file in os.listdir(path):
        if file.endswith(".pdf"):
            pdf_docs.append(os.path.join(path, file))
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)

    embeddings = OpenAIEmbeddings()
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
    return vectorstore

In [6]:
# Get all pdf files from a directory
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader


loader_pdfs = DirectoryLoader('C:\\Users\\marce\\OneDrive\\Documentos\\GitHub\\Capstone-Project\\DataFiltered2022_2023\\data filtered', show_progress=True, use_multithreading=True, loader_cls=TextLoader, glob='**/*.pdf')

pdfs = loader_pdfs.load()

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len)

chunks = text_splitter.split_text(pdfs)

embeddings = OpenAIEmbeddings()
# embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)

100%|██████████| 12/12 [00:00<00:00, 4001.24it/s]


TypeError: expected string or bytes-like object, got 'list'

In [2]:
def preprocess_csv_files_for_LLM(path: str):
    csv_docs = []
    for file in os.listdir(path):
        if file.endswith(".csv"):
            csv_docs.append(os.path.join(path, file))
    text = ""
    for csv in csv_docs:
        df = pd.read_csv(csv)
        for col in df.columns:
            text += col + "\n"
            for row in df[col]:
                text += str(row) + "\n"
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)

    embeddings = OpenAIEmbeddings()
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
    return vectorstore


In [5]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
loader = DirectoryLoader('C:\\Users\\marce\\OneDrive\\Documentos\\GitHub\\Capstone-Project\\DataFiltered2022_2023\\data filtered', glob="**/*.csv", loader_cls=CSVLoader)
csv_data = loader.load()

#from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

from langchain.vectorstores import Chroma
recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=750,
    chunk_overlap=200,
)
text_chunks = recursive_text_splitter.split_documents(csv_data)


embeddings = OpenAIEmbeddings()
#embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


#embeddings = OpenAIEmbeddings()
# embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
db = FAISS.from_documents(text_chunks, embeddings)



KeyboardInterrupt



In [13]:
query = "Who won the 2023 Bahrain GP?"
docs = db.similarity_search(query, k=3)
print("Results:", docs)

Results: [Document(page_content=': 1079\nraceId: 1098\nyear: 2023\nround: 1\ncircuitId: 3\nname: Bahrain Grand Prix\ndate: 2023-03-05\ntime: 15:00:00\nurl: https://en.wikipedia.org/wiki/2023_Bahrain_Grand_Prix\nfp1_date: 2023-03-03\nfp1_time: 11:30:00\nfp2_date: 2023-03-03\nfp2_time: 15:00:00\nfp3_date: 2023-03-04\nfp3_time: 11:30:00\nquali_date: 2023-03-04\nquali_time: 15:00:00\nsprint_date: \\N\nsprint_time: \\N', metadata={'source': 'C:\\Users\\marce\\OneDrive\\Documentos\\GitHub\\Capstone-Project\\DataFiltered2022_2023\\data filtered\\races_filtered.csv', 'row': 22}), Document(page_content=': 1036\nraceId: 1074\nyear: 2022\nround: 1\ncircuitId: 3\nname: Bahrain Grand Prix\ndate: 2022-03-20\ntime: 15:00:00\nurl: http://en.wikipedia.org/wiki/2022_Bahrain_Grand_Prix\nfp1_date: 2022-03-18\nfp1_time: 12:00:00\nfp2_date: 2022-03-18\nfp2_time: 15:00:00\nfp3_date: 2022-03-19\nfp3_time: 12:00:00\nquali_date: 2022-03-19\nquali_time: 15:00:00\nsprint_date: \\N\nsprint_time: \\N', metadata={'s

In [5]:
from langchain.llms import CTransformers
from langchain.chains import ConversationalRetrievalChain
#llm = CTransformers(model_type="gpt-3.5-turbo", temperature=0.1)
from langchain_community.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)

qa = ConversationalRetrievalChain.from_llm(llm, retriever = pdf_data.as_retriever())

In [6]:
while True:
    # Give system some context
    chat_history = []
    query = input("Enter query: ")
    if query == "exit":
        break
    if query == "clear":
        chat_history = []
        continue
    if query == "":
        continue
    
    
    results = qa({"chat_history": chat_history, "question": query})
    print('User:', query)
    print("Results:", results['answer'])

Results: Based on the provided information, I don't have access to the specific data or files that contain the results of the Australian GP in 2023. Therefore, I cannot determine which driver won the race.
Results: Is there anything else you would like to know?


In [11]:

from langchain_experimental.agents import create_csv_agent

agent = create_csv_agent(OpenAI(temperature=0), ['titanic.csv', 'titanic_age_fillna.csv'], verbose=True)

<langchain_community.vectorstores.chroma.Chroma at 0x16306aeb4d0>

In [10]:
query = "How many rows does drivers_filtered have?"

docs = db.similarity_search(query)

print(docs[0].page_content)

: 33700
driverStandingsId: 71578
raceId: 1087
driverId: 846
points: 76.0
position: 7
positionText: 7
wins: 0


In [None]:
# ROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
y#kubernetes 28.1.0 requires urllib3<2.0,>=1.24.2, but you have urllib3 2.1.0 which is incompatible.

In [2]:
from util import local_settings

sk-LCAqduOz4rTSH8NhaGGhT3Bl


In [3]:

OPENAI_API_KEY=local_settings.OPENAI_API_KEY

In [4]:
pdf_data = preprocess_pdf_files_for_LLM(path="C:\\Users\\marce\\OneDrive\\Documentos\\GitHub\\Capstone-Project\\DataFiltered2022_2023\\data filtered")

In [12]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

chain = load_qa_chain(OpenAI(), chain_type="stuff")
prompt = f"You have multiple pdfs file with data formatted as a csv inside. Knowing that, and never forgetting it, you shall look at the appropriate fields when needed and get your result as if you are reading a csv. How many wins did Max Verstappen finish first in 2023?"

docs =  pdf_data.similarity_search(prompt)
result = chain.run(input_documents=docs, question=prompt)
print(result)



I cannot provide a helpful answer as the data provided does not include any information about wins in 2023. The data only goes up to 2022 and does not specifically mention Max Verstappen or his wins. It also does not specify which races were won by Max Verstappen, so I cannot accurately determine the number of wins in 2023.


In [None]:
model="gpt-3.5-turbo"

def get_completion(prompt, temperature= 0, messages = [], model=model):

    message = {"role": "user", "content": prompt}

    messages.append(message)
    docs =  pdf_data.similarity_search(prompt)
    result = chain.run(input_documents=docs, question=prompt)
    completion = chain.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )

    return completion.choices[0].message.content


In [28]:
# import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOpenAI
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Retrieve and generate using the relevant snippets of the blog.
retriever = pdf_data.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
)

rag_chain.invoke("How many times was Verstappen champion?")

'Verstappen was champion 0 times.'

In [42]:

from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

import pandas as pd
from langchain.llms import OpenAI

df = pd.read_csv("C:\\Users\\marce\\Downloads\\data2022_2023 (1).csv")
df.head()


Unnamed: 0,number_x,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,year,round,nameDriver,date,timeRace,url_x,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time,numberQualifying,positionQualifying,q1,q2,q3,numberSprint,gridSprint,positionSprint,positionTextSprint,positionOrderSprint,pointsSprint,lapsSprint,timeSprint,millisecondsSprint,fastestLapSprint,fastestLapTimeSprint,driverRef,number_y,code,forename,surname,dob,nationalityDriver,url_y,constructorRef,nameConstructor,nationalityConstructor,url_x.1,status,circuitRef,name,location,country,lat,lng,alt,url_y.1,pointsStandings,positionStandings,positionTextStandings,wins,pointsStandings.1,positionStandings.1,positionTextStandings.1,winsStandings,traction,track_evolution,braking,asphalt_grip,lateral,asphalt_abrasion,tyre_stress,downforce,c1_compound,c2_compound,c3_compound,c4_compound,c5_compound,lapsTotal,race_dist,circuit_length,turns,elevation_change,top_speed,tyre_camber_lim_front,tyre_camber_lim_rear,min_starting_press_front,min_starting_press_rear
0,16,1,1,1,1,26.0,57,1:37:33.584,5853584,51,1,1:34.570,206.018,2022,1,Bahrain Grand Prix,2022-03-20,15:00:00,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,2022-03-18,12:00:00,2022-03-18,15:00:00,2022-03-19,12:00:00,2022-03-19,15:00:00,\N,\N,16.0,1.0,1:31.471,1:30.932,1:30.558,,,,,,,,,,,,leclerc,16,LEC,Charles,Leclerc,1997-10-16,Monegasque,http://en.wikipedia.org/wiki/Charles_Leclerc,ferrari,Ferrari,Italian,http://en.wikipedia.org/wiki/Scuderia_Ferrari,Finished,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...,26.0,1.0,1,1.0,44.0,1.0,1,1.0,4.0,4.0,4.0,3.0,3.0,5.0,3.0,3.0,1.0,1.0,1.0,0.0,0.0,57.0,308.238,5.412,15.0,16.8,327.9,-3.5,-2.0,21.0,18.5
1,55,3,2,2,2,18.0,57,+5.598,5859182,52,3,1:35.740,203.501,2022,1,Bahrain Grand Prix,2022-03-20,15:00:00,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,2022-03-18,12:00:00,2022-03-18,15:00:00,2022-03-19,12:00:00,2022-03-19,15:00:00,\N,\N,55.0,3.0,1:31.567,1:30.787,1:30.687,,,,,,,,,,,,sainz,55,SAI,Carlos,Sainz,1994-09-01,Spanish,http://en.wikipedia.org/wiki/Carlos_Sainz_Jr.,ferrari,Ferrari,Italian,http://en.wikipedia.org/wiki/Scuderia_Ferrari,Finished,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...,18.0,2.0,2,0.0,44.0,1.0,1,1.0,4.0,4.0,4.0,3.0,3.0,5.0,3.0,3.0,1.0,1.0,1.0,0.0,0.0,57.0,308.238,5.412,15.0,16.8,327.9,-3.5,-2.0,21.0,18.5
2,44,5,3,3,3,15.0,57,+9.675,5863259,53,5,1:36.228,202.469,2022,1,Bahrain Grand Prix,2022-03-20,15:00:00,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,2022-03-18,12:00:00,2022-03-18,15:00:00,2022-03-19,12:00:00,2022-03-19,15:00:00,\N,\N,44.0,5.0,1:32.285,1:31.048,1:31.238,,,,,,,,,,,,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,mercedes,Mercedes,German,http://en.wikipedia.org/wiki/Mercedes-Benz_in_...,Finished,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...,15.0,3.0,3,0.0,27.0,2.0,2,0.0,4.0,4.0,4.0,3.0,3.0,5.0,3.0,3.0,1.0,1.0,1.0,0.0,0.0,57.0,308.238,5.412,15.0,16.8,327.9,-3.5,-2.0,21.0,18.5
3,63,9,4,4,4,12.0,57,+11.211,5864795,56,6,1:36.302,202.313,2022,1,Bahrain Grand Prix,2022-03-20,15:00:00,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,2022-03-18,12:00:00,2022-03-18,15:00:00,2022-03-19,12:00:00,2022-03-19,15:00:00,\N,\N,63.0,9.0,1:32.269,1:31.252,1:32.216,,,,,,,,,,,,russell,63,RUS,George,Russell,1998-02-15,British,http://en.wikipedia.org/wiki/George_Russell_(r...,mercedes,Mercedes,German,http://en.wikipedia.org/wiki/Mercedes-Benz_in_...,Finished,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...,12.0,4.0,4,0.0,27.0,2.0,2,0.0,4.0,4.0,4.0,3.0,3.0,5.0,3.0,3.0,1.0,1.0,1.0,0.0,0.0,57.0,308.238,5.412,15.0,16.8,327.9,-3.5,-2.0,21.0,18.5
4,20,7,5,5,5,10.0,57,+14.754,5868338,53,8,1:36.623,201.641,2022,1,Bahrain Grand Prix,2022-03-20,15:00:00,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,2022-03-18,12:00:00,2022-03-18,15:00:00,2022-03-19,12:00:00,2022-03-19,15:00:00,\N,\N,20.0,7.0,1:31.955,1:31.461,1:31.808,,,,,,,,,,,,kevin_magnussen,20,MAG,Kevin,Magnussen,1992-10-05,Danish,http://en.wikipedia.org/wiki/Kevin_Magnussen,haas,Haas F1 Team,American,http://en.wikipedia.org/wiki/Haas_F1_Team,Finished,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...,10.0,5.0,5,0.0,10.0,3.0,3,0.0,4.0,4.0,4.0,3.0,3.0,5.0,3.0,3.0,1.0,1.0,1.0,0.0,0.0,57.0,308.238,5.412,15.0,16.8,327.9,-3.5,-2.0,21.0,18.5


In [43]:
agent = create_pandas_dataframe_agent(OpenAI(temperature=0), df, verbose=True)


In [46]:
agent.run("How many wins did Verstappen (surname) get in 2023 only?")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: We need to filter the dataframe to only include rows where the surname is "Verstappen" and the year is 2023. Then, we can count the number of rows in the filtered dataframe.
Action: python_repl_ast
Action Input: df[(df["surname"] == "Verstappen") & (df["year"] == 2023)][0m
Observation: [36;1m[1;3m     number_x  grid position positionText  positionOrder  points  laps  \
440         1     1        1            1              1    25.0    57   
461         1    15        2            2              2    19.0    50   
480         1     1        1            1              1    25.0    58   
501         1     2        2            2              2    18.0    51   
520         1     9        1            1              1    26.0    57   
540         1     1        1            1              1    25.0    78   
560         1     1        1            1              1    26.0    66   
580         1     1        1         

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens, however you requested 10414 tokens (10158 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}