In [1]:
# Cell 1: Install Dependencies
!pip install langchain langchain-openai langchain-community faiss-cpu pandas tiktoken python-dotenv gradio

Collecting langchain-openai
  Downloading langchain_openai-1.1.10-py3-none-any.whl.metadata (3.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.2-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-commu

In [5]:
# Cell 2: Imports & Setup
import os
from getpass import getpass

import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# API key: never hard-code credentials in a notebook. Reuse a key that is
# already present in the environment; otherwise prompt for it without echoing
# the value, so it ends up neither in the source nor in the saved output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# Initialize LLM and Embeddings
# temperature=0 is passed to the chat model; the embeddings client is created
# with its default settings.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
embeddings = OpenAIEmbeddings()

print("‚úÖ LLM and Embeddings initialized successfully!")

‚úÖ LLM and Embeddings initialized successfully!


In [6]:
# Cell 3: Upload Datasets
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, which would make this cell Colab-specific — confirm before running elsewhere.
from google.colab import files

print("üìÇ Please upload your 3 CSV files...")
# The return value is kept in `uploaded` but is not read again in the visible
# cells; Cell 4 reads the CSV files from disk by name instead.
uploaded = files.upload()

üìÇ Please upload your 3 CSV files...


Saving WorldCupMatches.csv to WorldCupMatches.csv
Saving WorldCups.csv to WorldCups.csv
Saving WorldCupPlayers.csv to WorldCupPlayers.csv


In [8]:
# Cell 4: Load Datasets
def load_datasets(data_dir="."):
    """Read the three World Cup CSV files and report how many rows each holds.

    Args:
        data_dir: Directory containing ``WorldCups.csv``,
            ``WorldCupMatches.csv`` and ``WorldCupPlayers.csv``. Defaults to
            the current working directory, which is where the previously
            hard-coded relative file names were resolved.

    Returns:
        A ``(world_cups, matches, players)`` tuple of DataFrames, in that order.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a file is missing.
    """
    world_cups = pd.read_csv(os.path.join(data_dir, 'WorldCups.csv'))
    matches = pd.read_csv(os.path.join(data_dir, 'WorldCupMatches.csv'))
    players = pd.read_csv(os.path.join(data_dir, 'WorldCupPlayers.csv'))

    print(f"‚úÖ WorldCups.csv loaded: {len(world_cups)} rows")
    print(f"‚úÖ WorldCupMatches.csv loaded: {len(matches)} rows")
    print(f"‚úÖ WorldCupPlayers.csv loaded: {len(players)} rows")

    return world_cups, matches, players

# Load the three CSV files into module-level DataFrames
# (`players` is not referenced again in the visible cells).
world_cups, matches, players = load_datasets()


‚úÖ WorldCups.csv loaded: 20 rows
‚úÖ WorldCupMatches.csv loaded: 4572 rows
‚úÖ WorldCupPlayers.csv loaded: 37784 rows


In [9]:
# Cell 5: Clean Matches Data
def clean_matches(matches):
    """Return a cleaned copy of the raw matches table.

    Cleaning steps, in order:
      1. Drop every row that has a missing value in any column.
      2. Drop rows that are exact duplicates of an earlier row, so a match
         listed twice in the CSV is not double-counted by the team and
         head-to-head statistics built from the result.
      3. Cast ``Year``, ``Home Team Goals`` and ``Away Team Goals`` to ``int``.

    The original index labels are kept and the input frame is not modified.

    Args:
        matches: Raw matches DataFrame; must contain the three columns above.

    Returns:
        A new DataFrame with the cleaned rows.
    """
    matches_clean = (
        matches
        .dropna()
        .drop_duplicates()  # keeps the first occurrence of each repeated row
        .astype({'Year': int, 'Home Team Goals': int, 'Away Team Goals': int})
    )

    print(f"‚úÖ Original matches: {len(matches)} rows")
    print(f"‚úÖ Cleaned matches: {len(matches_clean)} rows")
    print(f"‚úÖ Dropped: {len(matches) - len(matches_clean)} rows")

    return matches_clean

# Clean the raw matches once; the document builders in the later cells read `matches_clean`
matches_clean = clean_matches(matches)

‚úÖ Original matches: 4572 rows
‚úÖ Cleaned matches: 850 rows
‚úÖ Dropped: 3722 rows


In [10]:
# Quick verification: preview the key columns of the first ten cleaned matches
print("Sample of clean matches:")
print(
    matches_clean.loc[
        :,
        ['Year', 'Home Team Name', 'Away Team Name', 'Home Team Goals', 'Away Team Goals'],
    ].head(10)
)

Sample of clean matches:
   Year Home Team Name Away Team Name  Home Team Goals  Away Team Goals
0  1930         France         Mexico                4                1
1  1930            USA        Belgium                3                0
2  1930     Yugoslavia         Brazil                2                1
3  1930        Romania           Peru                3                1
4  1930      Argentina         France                1                0
5  1930          Chile         Mexico                3                0
6  1930     Yugoslavia        Bolivia                4                0
7  1930            USA       Paraguay                3                0
8  1930        Uruguay           Peru                1                0
9  1930          Chile         France                1                0


In [11]:
# Cell 6: Build Tournament Documents
def build_tournament_docs(world_cups):
    """Create one ``Document`` per row of the tournaments table.

    The page content is a block of ``Label: value`` lines (one per field,
    each newline-terminated). The metadata carries ``type="tournament"`` and
    the year rendered as a string.

    Args:
        world_cups: DataFrame with the columns named in the label table below.

    Returns:
        List of documents, in row order.
    """
    # (label written into the text, column read from the row), in output order
    labelled_columns = (
        ("Year", 'Year'),
        ("Country", 'Country'),
        ("Winner", 'Winner'),
        ("Runners-Up", 'Runners-Up'),
        ("Third", 'Third'),
        ("Fourth", 'Fourth'),
        ("Goals Scored", 'GoalsScored'),
        ("Qualified Teams", 'QualifiedTeams'),
        ("Matches Played", 'MatchesPlayed'),
        ("Attendance", 'Attendance'),
    )

    def _to_document(row):
        # Render every field as "Label: value\n" and concatenate
        text = "".join(f"{label}: {row[column]}\n" for label, column in labelled_columns)
        return Document(
            page_content=text,
            metadata={"type": "tournament", "year": str(row['Year'])},
        )

    docs = [_to_document(row) for _, row in world_cups.iterrows()]

    print(f"‚úÖ Built {len(docs)} tournament documents")
    return docs

# One document per tournament row; merged into `all_docs` in Cell 10
tournament_docs = build_tournament_docs(world_cups)

‚úÖ Built 20 tournament documents


In [12]:
# Cell 7: Build Match Documents
def build_match_docs(matches_clean):
    """Turn every cleaned match row into a ``Document``.

    The page content lists year, stage, both teams, both scores, attendance,
    city and stadium as newline-terminated ``Label: value`` lines. The
    metadata holds ``type="match"``, the year as a string, and the home and
    away team names.

    Args:
        matches_clean: Cleaned matches DataFrame.

    Returns:
        List of documents, in row order.
    """
    def _describe(match):
        # Build the text line by line; every line ends with a newline
        lines = [
            f"Year: {match['Year']}",
            f"Stage: {match['Stage']}",
            f"Home Team: {match['Home Team Name']}",
            f"Away Team: {match['Away Team Name']}",
            f"Home Goals: {match['Home Team Goals']}",
            f"Away Goals: {match['Away Team Goals']}",
            f"Attendance: {match['Attendance']}",
            f"City: {match['City']}",
            f"Stadium: {match['Stadium']}",
        ]
        return "\n".join(lines) + "\n"

    docs = []
    for _, match in matches_clean.iterrows():
        info = {
            "type": "match",
            "year": str(match['Year']),
            "home_team": match['Home Team Name'],
            "away_team": match['Away Team Name'],
        }
        docs.append(Document(page_content=_describe(match), metadata=info))

    print(f"‚úÖ Built {len(docs)} match documents")
    return docs

# One document per cleaned match row; merged into `all_docs` in Cell 10
match_docs = build_match_docs(matches_clean)

‚úÖ Built 850 match documents


In [13]:
# Cell 8: Build Team Stats Documents
def build_team_stats(matches_clean):
    """Build one aggregate-statistics ``Document`` per team.

    A team is any name that appears in ``Home Team Name`` or
    ``Away Team Name``. For each team the document text reports games played,
    wins, goals scored, goals conceded and the sorted list of years in which
    the team appears. A win is a match in which the team's goal count is
    strictly greater than the opponent's.

    Teams are processed in sorted order so that the resulting document list
    (and any index built from it) is identical from one kernel session to the
    next; iterating a bare ``set`` of strings does not guarantee that.

    Args:
        matches_clean: Cleaned matches DataFrame with team-name, goal and
            ``Year`` columns.

    Returns:
        List of documents with metadata ``{"type": "team_stats", "team": <name>}``.
    """
    docs = []
    home_names = matches_clean['Home Team Name']
    away_names = matches_clean['Away Team Name']
    # key=str keeps the sort well-defined even if a non-string value slips in
    teams = sorted(set(home_names.tolist() + away_names.tolist()), key=str)

    for team in teams:
        home = matches_clean[home_names == team]
        away = matches_clean[away_names == team]

        total_games = len(home) + len(away)
        total_goals_scored = home['Home Team Goals'].sum() + away['Away Team Goals'].sum()
        total_goals_conceded = home['Away Team Goals'].sum() + away['Home Team Goals'].sum()

        home_wins = int((home['Home Team Goals'] > home['Away Team Goals']).sum())
        away_wins = int((away['Away Team Goals'] > away['Home Team Goals']).sum())
        total_wins = home_wins + away_wins

        # Reuse the two per-team slices instead of filtering the full table a third time
        years = sorted(set(home['Year'].tolist()) | set(away['Year'].tolist()))

        content = (
            f"Team: {team}\n"
            f"Total Games Played: {total_games}\n"
            f"Total Wins: {total_wins}\n"
            f"Total Goals Scored: {total_goals_scored}\n"
            f"Total Goals Conceded: {total_goals_conceded}\n"
            f"Years Participated: {years}\n"
        )
        docs.append(Document(
            page_content=content,
            metadata={"type": "team_stats", "team": team}
        ))

    print(f"‚úÖ Built {len(docs)} team stat documents")
    return docs

# One document per distinct team name; merged into `all_docs` in Cell 10
team_docs = build_team_stats(matches_clean)

‚úÖ Built 83 team stat documents


In [14]:
# Cell 9: Build Head-to-Head Documents
def build_h2h_docs(matches_clean):
    """Build one head-to-head ``Document`` per pair of teams that met.

    Matches are grouped by the alphabetically sorted pair of team names, so
    A-vs-B and B-vs-A land in the same group. For each pair the text reports
    the number of meetings, each side's wins, the draws (meetings that
    neither side won on goals) and the years of the meetings in row order.

    Args:
        matches_clean: Cleaned matches DataFrame.

    Returns:
        List of documents with metadata
        ``{"type": "h2h", "team1": ..., "team2": ...}``, ordered by the first
        appearance of each pairing.
    """
    def _winner(game):
        # Name of the side with strictly more goals, or None when level
        home_goals = game['Home Team Goals']
        away_goals = game['Away Team Goals']
        if home_goals > away_goals:
            return game['Home Team Name']
        if away_goals > home_goals:
            return game['Away Team Name']
        return None

    # pairing (sorted tuple of names) -> list of match rows
    meetings = {}
    for _, game in matches_clean.iterrows():
        pairing = tuple(sorted([game['Home Team Name'], game['Away Team Name']]))
        meetings.setdefault(pairing, []).append(game)

    docs = []
    for (team1, team2), games in meetings.items():
        winners = [_winner(game) for game in games]
        team1_wins = winners.count(team1)
        team2_wins = winners.count(team2)
        draws = len(games) - team1_wins - team2_wins

        content = (
            f"Head-to-Head: {team1} vs {team2}\n"
            f"Total Meetings: {len(games)}\n"
            f"{team1} Wins: {team1_wins}\n"
            f"{team2} Wins: {team2_wins}\n"
            f"Draws: {draws}\n"
            f"Years Played: {[g['Year'] for g in games]}\n"
        )
        docs.append(Document(
            page_content=content,
            metadata={"type": "h2h", "team1": team1, "team2": team2},
        ))

    print(f"‚úÖ Built {len(docs)} head-to-head documents")
    return docs

# One document per distinct team pairing; merged into `all_docs` in Cell 10
h2h_docs = build_h2h_docs(matches_clean)

‚úÖ Built 577 head-to-head documents


In [15]:
# Cell 10: Combine All Documents
# Concatenate the four document lists, keeping the order:
# tournaments, matches, team stats, head-to-head.
all_docs = [*tournament_docs, *match_docs, *team_docs, *h2h_docs]

print(f"‚úÖ Total documents: {len(all_docs)}")

‚úÖ Total documents: 1530


In [16]:
# Cell 11: Split Documents
# Splitter configured with chunk_size=500 and chunk_overlap=50
# (presumably measured in characters, the splitter's default — TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

split_docs = text_splitter.split_documents(all_docs)

# In the recorded run the chunk count equals the document count (1530 -> 1530),
# i.e. no document was actually broken apart by this step.
print(f"‚úÖ Original documents: {len(all_docs)}")
print(f"‚úÖ After splitting: {len(split_docs)} chunks")

‚úÖ Original documents: 1530
‚úÖ After splitting: 1530 chunks


In [17]:
# Cell 12: Build FAISS Vector Store
print("‚è≥ Building FAISS vector store... this may take a minute...")

# Build the store from the split chunks using the `embeddings` client from Cell 2
vectorstore = FAISS.from_documents(documents=split_docs, embedding=embeddings)

print("‚úÖ FAISS vector store built successfully!")
print(f"‚úÖ Total vectors indexed: {vectorstore.index.ntotal}")

‚è≥ Building FAISS vector store... this may take a minute...
‚úÖ FAISS vector store built successfully!
‚úÖ Total vectors indexed: 1530


In [18]:
# Cell 13: Save FAISS Index
# Persist the store under ./faiss_worldcup_index; Cell 15 downloads
# index.faiss and index.pkl from that folder.
vectorstore.save_local("faiss_worldcup_index")

print("‚úÖ FAISS index saved to 'faiss_worldcup_index/' folder!")

‚úÖ FAISS index saved to 'faiss_worldcup_index/' folder!


In [19]:
# Cell 14: Quick Test - Similarity Search
# Smoke test: fetch the three nearest documents for a sample question and print them
query = "Who won the 2014 World Cup?"
results = vectorstore.similarity_search(query, k=3)

print(f"üîç Query: {query}\n")
for i, doc in enumerate(results, start=1):
    print(f"--- Result {i} ---")
    # page text followed by one blank line
    print(doc.page_content, end="\n\n")

üîç Query: Who won the 2014 World Cup?

--- Result 1 ---
Year: 2014
Country: Brazil
Winner: Germany
Runners-Up: Argentina
Third: Netherlands
Fourth: Brazil
Goals Scored: 171
Qualified Teams: 32
Matches Played: 64
Attendance: 3.386.810

--- Result 2 ---
Year: 2010
Country: South Africa
Winner: Spain
Runners-Up: Netherlands
Third: Germany
Fourth: Uruguay
Goals Scored: 145
Qualified Teams: 32
Matches Played: 64
Attendance: 3.178.856

--- Result 3 ---
Year: 1998
Country: France
Winner: France
Runners-Up: Brazil
Third: Croatia
Fourth: Netherlands
Goals Scored: 171
Qualified Teams: 32
Matches Played: 64
Attendance: 2.785.100



In [20]:
# Cell 15: Download FAISS index files
# NOTE(review): this import repeats the one in Cell 3; it lets the cell run on
# its own but could be dropped once imports are consolidated at the top.
from google.colab import files

# Download the two files found in the folder written by Cell 13
files.download('faiss_worldcup_index/index.faiss')
files.download('faiss_worldcup_index/index.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>