In [34]:
from langchain.document_loaders import PyPDFLoader
import re
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain


In [2]:
book = 'Soccernomics__World_Cup_Edition_-_Simon_Kuper.pdf'

In [3]:
def load_content():
    loader = PyPDFLoader(book)
    documents = loader.load()
    return documents

In [4]:
docs = load_content()

In [5]:
print(f"Loaded {len(docs)} pages from the book.")

Loaded 434 pages from the book.


In [6]:
def clean_docs(documents):
    cleaned = []
    for doc in documents:
        text = doc.page_content
        # Remove page numbers and extra whitespace
        text = re.sub(r'\n\d+\n', '\n', text)  
        text = re.sub(r'\s+', ' ', text).strip()
        
        cleaned.append(Document(
            page_content=text,
            metadata=doc.metadata
        ))
    return cleaned

docs_cleaned = clean_docs(docs)

In [7]:
text_split = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function = len,
    add_start_index = True
)
chunks = text_split.split_documents(docs)
print(len(chunks))

1326


In [12]:
embedding = FastEmbedEmbeddings()
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    persist_directory="./soccer_chroma_db"
)

In [13]:
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [None]:


prompt = PromptTemplate.from_template(
    """
    <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context.
    If you don't know the answer, then reply, No Context available for this question {input}. [/Instructions] </s>
    [Instructions] Question: {input}
    Context: {context}
    Answer: [/Instructions]
    """
)


model = ChatOllama(model="llama3")


embedding = FastEmbedEmbeddings()
vector_store = Chroma(persist_directory="./soccer_chroma_db", embedding_function=embedding)


retriever = vector_store.as_retriever(search_kwargs={"k": 3})


document_chain = create_stuff_documents_chain(model, prompt)

rag_chain = create_retrieval_chain(retriever, document_chain)


def ask(query: str):
    result = rag_chain.invoke({"input": query})
    print("Answer:\n", result["answer"])
    print("\nSources:")
    for doc in result["context"]:
        print("-", doc.metadata.get("source", "Unknown source"))
        print(doc.page_content[:300], "...\n")



Answer:
 The main thesis of Soccernomics is that sports teams are not completely rational and that data analysis can be used to gain a competitive edge in the sport. It applies the lessons learned from Moneyball (a book about baseball) to the world of soccer, using statistics, economics, psychology, and intuition to transform the sport.

Sources:
- Soccernomics__World_Cup_Edition_-_Simon_Kuper.pdf
SOCCERNOMICS ...

- Soccernomics__World_Cup_Edition_-_Simon_Kuper.pdf
threat	rather	than	as	a	tool.”	Baseball	has	had	its	“Moneyball”	revolution,	but
in	soccer,	the	transformation	has	only	just	begun.	This	new,	updated,
expanded	edition	of	
Soccernomics—Soccernomics	3.0,
	as	we	think	of	it—uses
data	to	clarify	thinking	on	topics	ranging	from	tackles	through	transfers	 ...

- Soccernomics__World_Cup_Edition_-_Simon_Kuper.pdf
is
	one.”
—
Blogcritics
“A	 must	 read	 for	 any	 fan	 of	 the	 business	 of	 soccer.	 .	 .
.”
—
Footiebusiness.com
“
Soccernomics
	.	.	.	tackles	the	soccer	world’s	most	p

In [35]:
ask("What is the main thesis of Soccernomics?")



Answer:
 The main thesis of Soccernomics is that the lessons of Moneyball (sports teams are not completely rational) apply to the world's favorite sport, soccer. The book uses data, statistics, economics, psychology, and intuition to transform a dogmatic sport by putting it under an analytical microscope.

Sources:
- Soccernomics__World_Cup_Edition_-_Simon_Kuper.pdf
SOCCERNOMICS ...

- Soccernomics__World_Cup_Edition_-_Simon_Kuper.pdf
threat	rather	than	as	a	tool.”	Baseball	has	had	its	“Moneyball”	revolution,	but
in	soccer,	the	transformation	has	only	just	begun.	This	new,	updated,
expanded	edition	of	
Soccernomics—Soccernomics	3.0,
	as	we	think	of	it—uses
data	to	clarify	thinking	on	topics	ranging	from	tackles	through	transfers	 ...

- Soccernomics__World_Cup_Edition_-_Simon_Kuper.pdf
is
	one.”
—
Blogcritics
“A	 must	 read	 for	 any	 fan	 of	 the	 business	 of	 soccer.	 .	 .
.”
—
Footiebusiness.com
“
Soccernomics
	.	.	.	tackles	the	soccer	world’s	most	probing	questions	with	a
dispassi