In [1]:
%pip install langchain langchain-community langchain-huggingface neo4j pandas spacy trans
!python -m spacy download en_core_web_sm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:13
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:13
     --------------------------------------- 0.1/12.8 MB 944.1 kB/s eta 0:00:14
      -------------------------------------- 0.2/12.8 MB 980.4 kB/s eta 0:00:13
      --------------------------------------- 0.2/12.8 MB 1.1 MB/s eta 0:00:12
      --------------------------------------- 0.3/12.8 MB 1.0 MB/s eta 0:00:13
      --------------------------------------- 0.3/12.8 MB 1.0 MB/s eta 0:00:13
     - -------------------------------------- 0.4/12.


[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from langchain_community.graphs import Neo4jGraph
from neo4j import GraphDatabase

In [3]:
config = {}

with open('config.txt', 'r') as file:
    for line in file:
        if "=" in line:
            key, value = line.split('=', 1)
            config[key.strip()] = value.strip()

uri = config.get('URI')
username = config.get('USERNAME')
password = config.get('PASSWORD')
driver = GraphDatabase.driver(uri, auth=(username, password))
print("Connected to Neo4j database")

Connected to Neo4j database


In [4]:
# Connect using the LangChain wrapper
graph = Neo4jGraph(
    url=uri,
    username=username,
    password=password,
    refresh_schema= False
)
# Ensure the connection is working by running a quick query (optional)
print(graph.query("MATCH (s:Season) RETURN s"))

  graph = Neo4jGraph(


[{'s': {'season_name': '2021-22'}}, {'s': {'season_name': '2022-23'}}]


In [5]:
import spacy
import re

nlp = spacy.load("en_core_web_sm")
# Load the kb from the graph database (optional enhancement)
def load_fpl_kb(graph: Neo4jGraph) -> dict:
    kb = {
        "players": [],
        "teams": [],
        "positions": ["gk","gkp", "def", "mid", "fwd", "goalkeeper", "defender", "midfielder", "forward"],
        "stats": {}
    }

    # Load players
    player_results = graph.query("MATCH (p:Player) RETURN p.player_name AS name")
    kb["players"] = [record["name"].lower() for record in player_results]

    # Load teams
    team_results = graph.query("MATCH (t:Team) RETURN t.name AS name")
    kb["teams"] = [record["name"].lower() for record in team_results]

    # Load stats mapping
    kb["stats"] = {
        "points": "total_points",
        "goals": "goals_scored",
        "assists": "assists",
        "minutes": "minutes",
        "bonus": "bonus",
        "influence": "influence",
        "creativity": "creativity",
        "threat": "threat",
        "ict": "ict_index",
        "clean sheets": "clean_sheets",
        "form": "form"
    }

    return kb

In [6]:
FPL_KB = load_fpl_kb(graph)

In [7]:
def classify_fpl_intent(query: str) -> str:
    q = query.lower()

    # 1. Comparisons
    if any(x in q for x in ["better", "compare", "more than", " vs "]) and "player" not in q:
        # "Who has more points, Salah or Son?"
        return "PLAYER_STATS_COMPARE"
    
    # 2. Recommendations (Buying/Form)
    if any(x in q for x in ["recommend", "who to buy", "transfer", "best option", "good form"]):
        return "RECOMMENDATION_FORM"

    # 3. Fixtures (Future)
    if any(x in q for x in ["next match", "play next", "upcoming", "fixture"]):
        return "TEAM_FIXTURE_FIND"
    
    # 4. Fixtures (History/Results)
    if any(x in q for x in ["score", "result", "did they play", "won", "lost"]):
        return "TEAM_FIXTURE_HISTORY"

    # 5. Top Players (Rankings)
    if any(x in q for x in ["top", "most", "highest", "best"]) and any(x in q for x in ["player", "scorer", "mid", "fwd", "def"]):
        if any(x in q for x in FPL_KB["positions"]):
            return "POSITION_RANKING"
        return "PLAYER_STATS_TOP"

    # 6. Team Analysis
    if "team" in q and any(x in q for x in ["stats", "average", "conceded"]):
        return "TEAM_ANALYSIS_AVG"

    # 7. Detailed Player Stats
    if "stats" in q or "how many" in q or "points" in q or "goals" in q:
        if "gw" in q or "gameweek" in q:
            return "PLAYER_STATS_DETAIL_GW"
        return "PLAYER_STATS_DETAIL_TOTAL"
    
    # 8. Bonus Analysis
    if "bonus" in q:
        return "BONUS_ANALYSIS"

    # 9. Complex Filter
    if ">" in q or "<" in q or "more than" in q:
        return "COMPLEX_FILTER"

    # Fallback
    return "METADATA_QUERY"

In [8]:
ENTITY_LOOKUP = {}

def add_to_lookup(terms, category):
    for item in terms:
        # If it's a dict (like stats), the item is the key, canonical is the value
        if isinstance(terms, dict):
            value = terms[item]
            key = item
        else:
            value = item.title()
            key = item
        
        ENTITY_LOOKUP[key.lower()] = (category, value)
add_to_lookup(FPL_KB["players"], "player")
add_to_lookup(FPL_KB["teams"], "team")
add_to_lookup(FPL_KB["positions"], "position")
add_to_lookup(FPL_KB["stats"], "stat")

In [38]:
def extract_fpl_entities(query: str) -> dict:
    
    doc = nlp(query)
    entities = {
        "stat_type": "total_points" # Default fallback
    }
    
    query_lower = query.lower()
    
    for token in doc:
        text = token.text.lower()
        lemma = token.lemma_.lower()
        
        match = ENTITY_LOOKUP.get(text) or ENTITY_LOOKUP.get(lemma)
        
        if match:
            category, value = match
            
            # 1. Handle Players
            if category == "player":
                if "player1" not in entities:
                    entities["player1"] = value
                    entities["player_name"] = value
                elif entities["player1"] != value: # Avoid self-match
                    entities["player2"] = value
            
            # 2. Handle Teams
            elif category == "team":
                if "team1" not in entities:
                    entities["team1"] = value
                    entities["team_name"] = value
                elif entities["team1"] != value:
                    entities["team2"] = value
            
            # 3. Handle Positions
            elif category == "position":
                # Normalize Aliases
                norm = value.upper()
                if "MID" in norm: norm = "MID"
                elif "FWD" in norm or "FORWARD" in norm: norm = "FWD"
                elif "DEF" in norm: norm = "DEF"
                elif "GK" in norm or "GKP" in norm or "GOALKEEPER" in norm: norm = "GKP"
                entities["position"] = norm
            
            # 4. Handle Stats
            elif category == "stat":
                entities["stat_type"] = value

        # Check for numeric values that are NOT part of the season or GW (simple heuristic)
        if token.like_num:
            # Avoid overwriting if regex captured it, but here we capture loose numbers
            # e.g. "more than 5"
            try:
                val = float(token.text)
                # Heuristic: Filter values are usually small integers (< 2000) unlike years
                if val < 1000 and "gw" not in query_lower: 
                     entities["filter_value"] = val
            except:
                pass

    # B. Regex Extraction (Best for strict patterns like 'GW 10' or Years)
    
    # Extract "Gameweek X" or "GW X"
    gw_match = re.search(r"(?:gw|gameweek)\s*(\d+)", query_lower)
    if gw_match:
        entities["gw_number"] = int(gw_match.group(1))

    # Extract Season (e.g., "2022", "23/24")
    season_match = re.search(r"(20\d{2}-\d{2})", query_lower)
    if season_match:
        entities["season"] = season_match.group(1)

    return entities

In [22]:
def get_fpl_cypher_query(intent: str, entities: dict) -> dict:
    
    # Setup parameters with safe defaults to prevent NoneType errors
    params = {
        "player1": entities.get("player1", ""),
        "player2": entities.get("player2", ""),
        "player_name": entities.get("player_name", ""),
        "team_name": entities.get("team_name", ""),
        "team1": entities.get("team1", ""),
        "team2": entities.get("team2", ""),
        "position": entities.get("position", "MID"),
        "season": entities.get("season", "2022-23"),
        "gw_number": entities.get("gw_number", 1),
        "min_value": entities.get("filter_value", 0),
        "current_gw": 38 # In a real app, fetch current GW dynamically
    }
    
    # Dynamic property injection (SAFE because we map from a controlled dictionary above)
    stat_prop = entities.get("stat_type", "total_points")

    # The Library of Queries
    queries = {
        "PLAYER_STATS_COMPARE": f"""
            MATCH (p1:Player {{player_name: $player1}})-[r1:PLAYED_IN]->(:Fixture)-[:HAS_GW]->(:Gameweek {{season: $season}})
            MATCH (p2:Player {{player_name: $player2}})-[r2:PLAYED_IN]->(:Fixture)-[:HAS_GW]->(:Gameweek {{season: $season}})
            RETURN p1.player_name, sum(r1.{stat_prop}) AS P1_Stat, 
                   p2.player_name, sum(r2.{stat_prop}) AS P2_Stat
        """,

        "PLAYER_STATS_TOP": f"""
            MATCH (p:Player)-[r:PLAYED_IN]->(:Fixture)<-[:HAS_FIXTURE]-(:Gameweek)<-[HAS_GW]-(:Season {{season_name: $season}})
            RETURN p.player_name, sum(r.{stat_prop}) AS TotalStat 
            ORDER BY TotalStat DESC LIMIT 10
        """,

        "PLAYER_STATS_DETAIL_TOTAL": f"""
            MATCH (p:Player {{player_name: $player_name}})-[r:PLAYED_IN]->(:Fixture)-[:HAS_GW]->(:Gameweek {{season: $season}})
            RETURN p.player_name, sum(r.total_points) as Points, sum(r.goals_scored) as Goals, sum(r.assists) as Assists
        """,

        "PLAYER_STATS_DETAIL_GW": f"""
            MATCH (p:Player {{player_name: $player_name}})-[r:PLAYED_IN]->(f:Fixture)
            MATCH (f)-[:HAS_GW]->(gw:Gameweek {{GW_number: $gw_number, season: $season}})
            RETURN p.player_name, gw.GW_number, r.total_points, r.goals_scored, r.minutes
        """,

        "TEAM_FIXTURE_FIND": """
            MATCH (t:Team {name: $team_name})<-[:HAS_HOME_TEAM|:HAS_AWAY_TEAM]-(f:Fixture)-[:HAS_GW]->(gw:Gameweek)
            WHERE f.kickoff_time >= date()
            RETURN gw.GW_number, f.kickoff_time, t.name
            ORDER BY f.kickoff_time ASC LIMIT 3
        """,
        
        "TEAM_FIXTURE_HISTORY": """
            MATCH (t1:Team {name: $team1})<-[:HAS_HOME_TEAM]-(f:Fixture)-[:HAS_AWAY_TEAM]->(t2:Team {name: $team2})
            RETURN f.fixture_number, f.home_score, f.away_score
        """,

        "POSITION_RANKING": f"""
            MATCH (p:Player)-[:PLAYS_AS]->(pos:Position {{name: $position}})
            MATCH (p)-[r:PLAYED_IN]->(:Fixture)-[:HAS_GW]->(:Gameweek {{season: $season}})
            RETURN p.player_name, sum(r.{stat_prop}) AS PositionStat 
            ORDER BY PositionStat DESC LIMIT 10
        """,

        "RECOMMENDATION_FORM": """
            MATCH (p:Player)-[r:PLAYED_IN]->(f:Fixture)-[:HAS_GW]->(gw:Gameweek)
            WHERE gw.GW_number >= ($current_gw - 5) AND p.position = $position
            RETURN p.player_name, avg(r.form) AS AvgForm
            ORDER BY AvgForm DESC LIMIT 5
        """,
        
        "BONUS_ANALYSIS": """
            MATCH (p:Player)-[r:PLAYED_IN]->(:Fixture)-[:HAS_GW]->(:Gameweek {season: $season})
            RETURN p.player_name, sum(r.bonus) as TotalBonus
            ORDER BY TotalBonus DESC LIMIT 10
        """,

        "METADATA_QUERY": "MATCH (n:Player) RETURN count(n) as PlayerCount"
    }

    # Retrieve the query template
    query_template = queries.get(intent, queries["METADATA_QUERY"])

    return {
        "query": query_template,
        "params": params
    }

In [11]:
from langchain_core.language_models import LLM
from typing import Optional, List, Any
from pydantic import Field

class GemmaLangChainWrapper(LLM):
    client: Any = Field(...)
    max_tokens: int = 500
    
    @property
    def _llm_type(self) -> str:
        return "gemma_hf_api"
    
    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=self.max_tokens,
            temperature=0.2 
        )
        return response.choices[0].message["content"]


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
%pip install -U langchain langchain-community langchain-core pydantic typing-extensions

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
%pip install langchain langchain-community langchain-core langchain-huggingface

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
%pip install langchain-classic

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# 1. New Imports
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
# Keep these standard imports
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from huggingface_hub import InferenceClient

HF_TOKEN = config.get('HF_TOKEN')

def initialize_rag_pipeline(query_result: str = ""):
    # --- Setup Vector Store (Same as before) ---
    splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)
    documents = splitter.create_documents([query_result])
    
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(documents=documents, embedding=embedding_model)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

    client = InferenceClient(
        model="google/gemma-2-2b-it",
        token=HF_TOKEN
    )
    
    gemma_llm = GemmaLangChainWrapper(client=client)
    
    # 1. Create a prompt specifically for the LLM to process chunks
    prompt = ChatPromptTemplate.from_template("""
    Answer the user's question based on the context below:
    
    <context>
    {context}
    </context>

    Question: {input}
    """)

    # 2. Create the "Stuff" chain (combines retrieved docs into the prompt)
    document_chain = create_stuff_documents_chain(gemma_llm, prompt)

    # 3. Create the final Retrieval Chain (connects retriever -> document chain)
    qa_chain = create_retrieval_chain(retriever, document_chain)
    
    return qa_chain

In [48]:
query = "Who is the top 3 players when it comes to total points in season 2021-22?"
query_result = ""
intent = classify_fpl_intent(query)
print("Intent:", intent)
entities = extract_fpl_entities(query)
print("Entities:", entities)
cypher_info = get_fpl_cypher_query(intent, entities)
print("Cypher Query:", cypher_info["query"])
cypher_query = cypher_info["query"]
print("Cypher Params:", cypher_info["params"])
query_result = graph.query(cypher_query, cypher_info["params"])
print(query_result)

Intent: PLAYER_STATS_TOP
Entities: {'stat_type': 'total_points', 'filter_value': 22.0, 'season': '2021-22'}
Cypher Query: 
            MATCH (p:Player)-[r:PLAYED_IN]->(:Fixture)<-[:HAS_FIXTURE]-(:Gameweek)<-[HAS_GW]-(:Season {season_name: $season})
            RETURN p.player_name, sum(r.total_points) AS TotalStat 
            ORDER BY TotalStat DESC LIMIT 10
        
Cypher Params: {'player1': '', 'player2': '', 'player_name': '', 'team_name': '', 'team1': '', 'team2': '', 'position': 'MID', 'season': '2021-22', 'gw_number': 1, 'min_value': 22.0, 'current_gw': 38}
[{'p.player_name': 'Mohamed Salah', 'TotalStat': 265}, {'p.player_name': 'Heung-Min Son', 'TotalStat': 258}, {'p.player_name': 'Trent Alexander-Arnold', 'TotalStat': 208}, {'p.player_name': 'Jarrod Bowen', 'TotalStat': 206}, {'p.player_name': 'JoÃ£o Pedro Cavaco Cancelo', 'TotalStat': 201}, {'p.player_name': 'Kevin De Bruyne', 'TotalStat': 196}, {'p.player_name': 'Harry Kane', 'TotalStat': 192}, {'p.player_name': 'Andrew Rob

In [49]:
query_result = str(query_result)
rag_chain = initialize_rag_pipeline(query_result)
response = rag_chain.invoke({"input": query})
print(response["answer"])

Based on the provided context, here are the top 3 players with the highest total points in the 2021-22 season:

1. **Mohamed Salah:** 265 points
2. **Heung-Min Son:** 258 points
3. **Kevin De Bruyne:** 196 points 



In [27]:
%pip install faiss-gpu

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)
ERROR: No matching distribution found for faiss-gpu

[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Test 1: Comparison
user_query_1 = "Who has more goals, Salah or Haaland?"
print(f"Query: {user_query_1}")

ents_1 = extract_fpl_entities(user_query_1)
intent_1 = classify_fpl_intent(user_query_1)
result_1 = get_fpl_cypher_query(intent_1, ents_1)

print(f"Entities: {ents_1}")
print(f"Intent: {intent_1}")
print(f"Cypher Params: {result_1['params']}")
print("-" * 30)

# Test 2: Specific GW Stats
user_query_2 = "How many points did Saka get in Gameweek 5?"
print(f"Query: {user_query_2}")

ents_2 = extract_fpl_entities(user_query_2)
intent_2 = classify_fpl_intent(user_query_2)
result_2 = get_fpl_cypher_query(intent_2, ents_2)

print(f"Entities: {ents_2}")
print(f"Intent: {intent_2}")
print(f"Cypher Params: {result_2['params']}")