In [1]:
# Cell 1: Imports and Setup
import os
import pandas as pd

from langchain_ollama import ChatOllama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.memory import ConversationBufferWindowMemory
from langchain.tools import tool
from langchain.agents import AgentExecutor, create_tool_calling_agent

# LLM - Ollama (runs locally)
llm = ChatOllama(model="llama3.2", temperature=0.3)

# Embeddings - HuggingFace (tiny, fast, no crashes)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

print("✅ Setup complete")

  from .autonotebook import tqdm as notebook_tqdm
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


✅ Setup complete


In [None]:
# Cell 2: Mock Vectorstore (Member 1 will replace this with the real FAISS index)
# Using real World Cup facts so your chains give realistic answers
#vectorstore = FAISS.load_local("faiss_worldcup_index", embeddings, allow_dangerous_deserialization=True)
mock_docs = [
    Document(page_content="The 2014 FIFA World Cup was held in Brazil. The winner was Germany. The runner-up was Argentina. Third place went to Netherlands and fourth place to Brazil. A total of 171 goals were scored across 64 matches. 32 teams qualified. Total attendance was 3429873.", metadata={"year": 2014, "type": "tournament"}),
    Document(page_content="The 2010 FIFA World Cup was held in South Africa. The winner was Spain. The runner-up was Netherlands. Third place went to Germany and fourth place to Uruguay. A total of 145 goals were scored across 64 matches. 32 teams qualified.", metadata={"year": 2010, "type": "tournament"}),
    Document(page_content="The 1998 FIFA World Cup was held in France. The winner was France. The runner-up was Brazil. Third place went to Croatia. A total of 171 goals were scored across 64 matches. 32 teams qualified.", metadata={"year": 1998, "type": "tournament"}),
    Document(page_content="The 2002 FIFA World Cup was held in Korea/Japan. The winner was Brazil. The runner-up was Germany. Third place went to Turkey. 161 goals were scored across 64 matches.", metadata={"year": 2002, "type": "tournament"}),
    Document(page_content="World Cup statistics for Brazil: Played 109 matches. Record: 73 wins, 18 draws, 18 losses. Goals scored: 229. Goals conceded: 105. Win rate: 67.0%. Won the World Cup in: 1958, 1962, 1970, 1994, 2002. Participated in every World Cup tournament.", metadata={"type": "team_stats", "team": "Brazil"}),
    Document(page_content="World Cup statistics for Germany: Played 106 matches. Record: 66 wins, 20 draws, 20 losses. Goals scored: 224. Goals conceded: 121. Win rate: 62.3%. Won the World Cup in: 1954, 1974, 1990, 2014.", metadata={"type": "team_stats", "team": "Germany"}),
    Document(page_content="World Cup statistics for Argentina: Played 77 matches. Record: 42 wins, 14 draws, 21 losses. Goals scored: 131. Goals conceded: 84. Win rate: 54.5%. Won the World Cup in: 1978, 1986.", metadata={"type": "team_stats", "team": "Argentina"}),
    Document(page_content="World Cup statistics for France: Played 59 matches. Record: 34 wins, 11 draws, 14 losses. Goals scored: 120. Goals conceded: 71. Win rate: 57.6%. Won the World Cup in: 1998.", metadata={"type": "team_stats", "team": "France"}),
    Document(page_content="Head-to-head World Cup record between Brazil and Germany: Met 5 times. Brazil wins: 3, Germany wins: 1, Draws: 1. Key matches: 2014 Semi-final: Brazil 1-7 Germany; 2002 Final: Brazil 2-0 Germany.", metadata={"type": "h2h", "team1": "Brazil", "team2": "Germany"}),
    Document(page_content="Head-to-head World Cup record between Argentina and Germany: Met 7 times. Germany wins: 4, Argentina wins: 2, Draws: 1. Key matches: 2014 Final: Germany 1-0 Argentina; 2010 Quarter-final: Germany 4-0 Argentina.", metadata={"type": "h2h", "team1": "Argentina", "team2": "Germany"}),
    Document(page_content="Head-to-head World Cup record between Brazil and France: Met 3 times. France wins: 2, Brazil wins: 1. Key matches: 1998 Final: Brazil 0-3 France; 2006 Quarter-final: Brazil 0-1 France.", metadata={"type": "h2h", "team1": "Brazil", "team2": "France"}),
    Document(page_content="In the 2014 World Cup Final, Germany played against Argentina. Score: Germany 1-0 Argentina (after extra time). Played at Maracana in Rio De Janeiro. Attendance: 74738.", metadata={"year": 2014, "type": "match", "stage": "Final"}),
    Document(page_content="In the 2014 World Cup Semi-final, Brazil played against Germany. Score: Brazil 1-7 Germany. Played at Estadio Mineirao in Belo Horizonte. Attendance: 58141.", metadata={"year": 2014, "type": "match", "stage": "Semi-finals"}),
    Document(page_content="In the 1998 World Cup Final, Brazil played against France. Score: Brazil 0-3 France. Played at Stade de France in Saint-Denis. Attendance: 80000.", metadata={"year": 1998, "type": "match", "stage": "Final"}),
]

vectorstore = FAISS.from_documents(mock_docs, embeddings)
print(f"✅ Mock vectorstore ready with {len(mock_docs)} documents")

✅ Mock vectorstore ready with 14 documents


In [3]:
# Cell 3: Create a Retriever and test it

retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Test: does it find the right documents for a question?
results = retriever.invoke("Who won the 2014 World Cup?")

print(f"Search returned {len(results)} documents:\n")
for i, doc in enumerate(results):
    print(f"--- Result {i+1} ---")
    print(doc.page_content[:150])
    print()

Search returned 4 documents:

--- Result 1 ---
The 2014 FIFA World Cup was held in Brazil. The winner was Germany. The runner-up was Argentina. Third place went to Netherlands and fourth place to B

--- Result 2 ---
In the 2014 World Cup Final, Germany played against Argentina. Score: Germany 1-0 Argentina (after extra time). Played at Maracana in Rio De Janeiro. 

--- Result 3 ---
In the 2014 World Cup Semi-final, Brazil played against Germany. Score: Brazil 1-7 Germany. Played at Estadio Mineirao in Belo Horizonte. Attendance: 

--- Result 4 ---
The 2010 FIFA World Cup was held in South Africa. The winner was Spain. The runner-up was Netherlands. Third place went to Germany and fourth place to



In [4]:
# Cell 4: RAG Chain - System Prompt + Chain

QA_SYSTEM_PROMPT = """You are a World Cup expert analyst chatbot. Answer the user's question
using ONLY the provided context from historical FIFA World Cup data (1930-2014).

Rules:
- Base your answers strictly on the retrieved context below.
- If the context doesn't contain enough information, say so clearly.
- Provide specific stats, years, and match results when available.
- Be conversational and enthusiastic about football history.
- Note: This data covers World Cups from 1930 to 2014 only.

Context:
{context}
"""

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", QA_SYSTEM_PROMPT),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

print("✅ RAG chain built")

✅ RAG chain built


In [5]:
# Cell 5: Add Memory + Ask Function

memory = ConversationBufferWindowMemory(
    k=5,
    memory_key="chat_history",
    return_messages=True
)

def ask_worldcup(question: str) -> str:
    """Ask a question about World Cup history with memory."""
    chat_history = memory.load_memory_variables({})["chat_history"]
    
    response = rag_chain.invoke({
        "input": question,
        "chat_history": chat_history
    })
    
    memory.save_context(
        {"input": question},
        {"output": response["answer"]}
    )
    
    return response["answer"]

print("✅ Memory + ask function ready")

✅ Memory + ask function ready


  memory = ConversationBufferWindowMemory(


In [6]:
# Cell 6: Test the RAG Chain

print("Q1: Who won the 2014 World Cup?")
print("A:", ask_worldcup("Who won the 2014 World Cup?"))
print()
print("=" * 50)
print()
print("Q2: How many goals were scored in that tournament?")
print("A:", ask_worldcup("How many goals were scored in that tournament?"))

Q1: Who won the 2014 World Cup?
A: Easy one! Germany won the 2014 FIFA World Cup, defeating Argentina 1-0 in the final after extra time, played at the iconic Maracana stadium in Rio De Janeiro!


Q2: How many goals were scored in that tournament?
A: In the 2014 FIFA World Cup, a total of 171 goals were scored across 64 matches. That's some exciting football!


In [7]:
# Cell 7: Prediction Chain - Prompt

PREDICTION_PROMPT = ChatPromptTemplate.from_messages([
    ("system", """You are a World Cup match analyst. Given historical data about two teams,
produce a short match preview and predicted outcome.

Use the following context which includes team stats, head-to-head records, and match history:

{context}

Your response MUST include:
1. **Team Profiles** - Key stats for each team (matches played, win rate, goals, titles)
2. **Head-to-Head Record** - Historical matchups between these two teams
3. **Key Factors** - What advantages each team has
4. **Prediction** - Predicted score and winner with reasoning
5. **Confidence Level** - Low / Medium / High based on data availability

IMPORTANT: This prediction is based on historical data (1930-2014) and is for educational 
purposes only, not professional sports analytics advice."""),
    ("human", "Predict the match outcome: {team1} vs {team2}")
])

prediction_chain = create_stuff_documents_chain(llm, PREDICTION_PROMPT)

print("✅ Prediction chain built")

✅ Prediction chain built


In [8]:
# Cell 8: Predict Match Function (multi-query retrieval)

def predict_match(team1: str, team2: str) -> str:
    """Generate a match prediction between two teams."""
    
    # Search 1: Head-to-head history
    h2h_docs = vectorstore.similarity_search(
        f"{team1} vs {team2} head to head World Cup", k=3
    )
    
    # Search 2: Team 1 stats
    team1_docs = vectorstore.similarity_search(
        f"{team1} World Cup statistics record", k=2
    )
    
    # Search 3: Team 2 stats
    team2_docs = vectorstore.similarity_search(
        f"{team2} World Cup statistics record", k=2
    )
    
    # Combine and remove duplicates
    all_docs = h2h_docs + team1_docs + team2_docs
    seen = set()
    unique_docs = []
    for doc in all_docs:
        if doc.page_content not in seen:
            seen.add(doc.page_content)
            unique_docs.append(doc)
    
    print(f"   Retrieved {len(unique_docs)} unique documents for prediction")
    
    # Run prediction chain
    response = prediction_chain.invoke({
        "context": unique_docs,
        "team1": team1,
        "team2": team2
    })
    
    return response

print("✅ Predict function ready")

✅ Predict function ready


In [9]:
# Cell 9: Test Prediction

print("MATCH PREDICTION: Brazil vs Germany")
print(predict_match("Brazil", "Germany"))

MATCH PREDICTION: Brazil vs Germany
   Retrieved 5 unique documents for prediction
**Match Preview: Brazil vs Germany**

**Team Profiles:**

* **Brazil:** Played 109 matches, Record: 73 wins, 18 draws, 18 losses, Win rate: 67.0%, Goals scored: 229, Goals conceded: 105.
* **Germany:** Played 106 matches, Record: 66 wins, 20 draws, 20 losses, Win rate: 62.3%, Goals scored: 224, Goals conceded: 121.

**Head-to-Head Record:**

* Brazil vs Germany World Cup record: Met 5 times. Brazil wins: 3, Germany wins: 1, Draws: 1.
* Key matches:
	+ 2014 Semi-final: Brazil 1-7 Germany
	+ 2002 Final: Brazil 2-0 Germany

**Key Factors:**

* **Brazil's attacking prowess:** Brazil has a strong record in the World Cup, with a high win rate and goals scored. They have a talented squad with players like Neymar, Gabriel Jesus, and Richarlison.
* **Germany's defensive solidity:** Germany is known for their solid defense, which has been a key factor in their success in the World Cup. They have a strong team shap

In [10]:
# Cell 10: Define Agent Tools

@tool
def dataset_discovery_tool(query: str) -> str:
    """Discover what data is available in the World Cup dataset.
    Use this when the user asks what data you have, what years are covered,
    or what teams/tournaments are in the dataset."""
    return (
        "The dataset covers 20 FIFA World Cup tournaments from 1930 to 2014.\n"
        "Total matches: 852\n"
        "Total teams: 83\n"
        "Data includes: tournament summaries, match-level results, team statistics, "
        "head-to-head records, and player participation records.\n"
        "Teams include: Brazil, Germany, Argentina, France, Italy, Spain, "
        "England, Netherlands, Uruguay, and 74 more."
    )

@tool
def data_ingestion_tool(team_name: str) -> str:
    """Look up raw statistical data for a specific team from the dataset.
    Returns match counts, goals, and tournament participation.
    Use this when the user asks for a specific team's stats or record."""
    docs = vectorstore.similarity_search(f"{team_name} World Cup statistics record", k=2)
    if docs:
        return docs[0].page_content
    return f"No data found for '{team_name}'. Check the team name spelling."

@tool
def retrieval_or_filter_tool(question: str) -> str:
    """Search the World Cup knowledge base to answer factual questions about
    World Cup history, tournament results, match outcomes, and team records.
    Use this for any question about World Cup facts and history."""
    docs = vectorstore.similarity_search(question, k=6)
    return "\n\n".join([d.page_content for d in docs])

@tool
def reasoning_or_aggregation_tool(matchup: str) -> str:
    """Generate a match prediction between two teams. Input should be in
    the format 'Team1 vs Team2', e.g., 'Brazil vs Germany'.
    Analyzes head-to-head records and team stats to predict outcomes."""
    parts = matchup.split(" vs ")
    if len(parts) != 2:
        return "Please provide the matchup in format: 'Team1 vs Team2'"
    return predict_match(parts[0].strip(), parts[1].strip())

@tool
def report_generation_tool(topic: str) -> str:
    """Generate a structured report or summary about a World Cup topic.
    Good for questions like 'summarize the 1998 World Cup' or
    'give me a report on Brazil's World Cup history'."""
    docs = vectorstore.similarity_search(topic, k=8)
    context = "\n\n".join([d.page_content for d in docs])
    
    report_prompt = ChatPromptTemplate.from_messages([
        ("system", "Generate a well-structured report on the given topic using the context. "
         "Include relevant statistics, match results, and historical context. "
         "Note this covers World Cups 1930-2014.\n\nContext:\n{context}"),
        ("human", "{topic}")
    ])
    chain = report_prompt | llm
    return chain.invoke({"context": context, "topic": topic}).content

tools = [
    dataset_discovery_tool,
    data_ingestion_tool,
    retrieval_or_filter_tool,
    reasoning_or_aggregation_tool,
    report_generation_tool
]

print(f"✅ {len(tools)} tools defined")

✅ 5 tools defined


In [11]:
# Cell 11: Build the Agent

AGENT_SYSTEM_PROMPT = """You are the World Cup AI Analyst, an expert chatbot on FIFA World Cup 
history from 1930 to 2014. You have access to a comprehensive database of tournament results,
match scores, team statistics, and head-to-head records.

You have the following tools:
- dataset_discovery_tool: Find what data is available
- data_ingestion_tool: Look up raw stats for a specific team
- retrieval_or_filter_tool: Search the knowledge base for factual answers
- reasoning_or_aggregation_tool: Predict match outcomes (use 'Team1 vs Team2' format)
- report_generation_tool: Generate structured reports on topics

Guidelines:
- Always use your tools to ground responses in real data.
- Be enthusiastic and conversational about football.
- Cite specific stats, scores, and years when possible.
- For predictions, always note they are educational and based on historical data only.
- If data is insufficient, say so honestly.
"""

agent_prompt = ChatPromptTemplate.from_messages([
    ("system", AGENT_SYSTEM_PROMPT),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad")
])

agent = create_tool_calling_agent(llm, tools, agent_prompt)

agent_memory = ConversationBufferWindowMemory(
    k=5,
    memory_key="chat_history",
    return_messages=True
)

agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    memory=agent_memory,
    verbose=True,
    handle_parsing_errors=True,
    max_iterations=5
)

print("✅ Agent ready!")

✅ Agent ready!


In [13]:
# Cell 12: Test the Agent

print("TEST 1: Factual question")
response = agent_executor.invoke({"input": "Who won the 2014 World Cup?"})
print("\nFinal Answer:", response["output"])

TEST 1: Factual question


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `retrieval_or_filter_tool` with `{'question': 'Germany vs Argentina 2014 World Cup final score'}`


[0m[38;5;200m[1;3mIn the 2014 World Cup Final, Germany played against Argentina. Score: Germany 1-0 Argentina (after extra time). Played at Maracana in Rio De Janeiro. Attendance: 74738.

Head-to-head World Cup record between Argentina and Germany: Met 7 times. Germany wins: 4, Argentina wins: 2, Draws: 1. Key matches: 2014 Final: Germany 1-0 Argentina; 2010 Quarter-final: Germany 4-0 Argentina.

In the 2014 World Cup Semi-final, Brazil played against Germany. Score: Brazil 1-7 Germany. Played at Estadio Mineirao in Belo Horizonte. Attendance: 58141.

The 2014 FIFA World Cup was held in Brazil. The winner was Germany. The runner-up was Argentina. Third place went to Netherlands and fourth place to Brazil. A total of 171 goals were scored across 64 matches. 32 teams qualified. Total attenda

In [17]:
# Cell 13: Test prediction and memory

print("TEST 2: Match Prediction")
response = agent_executor.invoke({"input": "Predict Brazil vs Germany"})
print("\nFinal Answer:", response["output"])

TEST 2: Match Prediction


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `reasoning_or_aggregation_tool` with `{'matchup': 'Brazil vs Germany'}`


[0m   Retrieved 5 unique documents for prediction
[36;1m[1;3m**Match Preview: Brazil vs Germany**

**Team Profiles:**

* **Brazil:** Played 109 matches, Record: 73 wins (67.0%), 18 draws, 18 losses. Goals scored: 229, Goals conceded: 105.
	+ Recent form: Won 3 of last 5 matches in the World Cup.
	+ Key players: Neymar Jr., Gabriel Jesus, Alisson Becker.
* **Germany:** Played 106 matches, Record: 66 wins (62.3%), 20 draws, 20 losses. Goals scored: 224, Goals conceded: 121.
	+ Recent form: Won 2 of last 5 matches in the World Cup.
	+ Key players: Thomas Müller, Joshua Kimmich, Manuel Neuer.

**Head-to-Head Record:**

* Met 5 times in the World Cup. Brazil wins: 3, Germany wins: 1, Draws: 1.
* Key matches:
	+ 2014 Semi-final: Brazil 1-7 Germany
	+ 2002 Final: Brazil 2-0 Germany

**Key Factors:**

* **Brazil's attackin

In [18]:
# Cell 14: Test memory (follow-up question)

print("TEST 3: Memory test")
response = agent_executor.invoke({"input": "How many goals were scored in that tournament we discussed?"})
print("\nFinal Answer:", response["output"])

TEST 3: Memory test


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `data_ingestion_tool` with `{'team_name': 'Brazil'}`


[0m[33;1m[1;3mWorld Cup statistics for Brazil: Played 109 matches. Record: 73 wins, 18 draws, 18 losses. Goals scored: 229. Goals conceded: 105. Win rate: 67.0%. Won the World Cup in: 1958, 1962, 1970, 1994, 2002. Participated in every World Cup tournament.[0m[32;1m[1;3mAccording to the data ingestion tool, a total of 229 goals were scored by Brazil during their participation in the 2014 FIFA World Cup.[0m

[1m> Finished chain.[0m

Final Answer: According to the data ingestion tool, a total of 229 goals were scored by Brazil during their participation in the 2014 FIFA World Cup.


In [19]:
# Cell 15: Test report generation

print("TEST 4: Report generation")
response = agent_executor.invoke({"input": "Give me a report on the 1998 World Cup"})
print("\nFinal Answer:", response["output"])

TEST 4: Report generation


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `report_generation_tool` with `{'topic': '1998 World Cup'}`


[0m[33;1m[1;3m**1998 FIFA World Cup Report**

**Introduction**

The 1998 FIFA World Cup was the 16th edition of the FIFA World Cup, held in France from June 10 to July 12, 1998. The tournament featured 32 national teams and was won by France, who defeated Brazil 3-0 in the final.

**Historical Context**

The 1998 World Cup was the first time that the tournament was held in a European country since the 1986 edition in Mexico. It was also the first time that the tournament was held in June, rather than the traditional summer months of July and August.

**Match Results and Statistics**

* Total matches played: 64
* Total goals scored: 171
* Average attendance per match: 39,000
* Top scorer: Davor Šuker (Croatia) - 6 goals

**Final Match**

The final match between Brazil and France was played at the Stade de France in Saint-Denis