## Exploring the data

In [3]:
import pandas as pd

In [14]:
# All three CSV files sit side by side in the same upstream directory, so the
# shared prefix is written once and each file name is appended to it.
_DATASET_ROOT = "https://raw.githubusercontent.com/martj42/international_results/refs/heads/master"

results_csv_path = f"{_DATASET_ROOT}/results.csv"
goalscorers_csv_path = f"{_DATASET_ROOT}/goalscorers.csv"
shootouts_csv_path = f"{_DATASET_ROOT}/shootouts.csv"

# Listed in the same order as above: results, goalscorers, shootouts.
csv_paths = [results_csv_path, goalscorers_csv_path, shootouts_csv_path]

In [31]:
# Every file carries a `date` column; parse it to datetime while reading so the
# date-based cells below work without a separate conversion step.
_read_options = {"parse_dates": ["date"]}

results_df = pd.read_csv(results_csv_path, **_read_options)
goalscorers_df = pd.read_csv(goalscorers_csv_path, **_read_options)
shootouts_df = pd.read_csv(shootouts_csv_path, **_read_options)

In [32]:
# Show five randomly chosen rows of the results table (differs on every run).
results_df.sample(n=5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
3663,1953-03-07,Netherlands,Denmark,1,2,Friendly,Rotterdam,Netherlands,False
23053,1998-09-25,Australia,Fiji,3,1,Oceania Nations Cup,Brisbane,Australia,False
43743,2021-03-27,Wales,Mexico,1,0,Friendly,Cardiff,Wales,False
29197,2005-06-05,Mali,Liberia,4,1,FIFA World Cup qualification,Ségou,Mali,False
1370,1929-11-03,Trinidad and Tobago,Guyana,3,1,Friendly,Port of Spain,Trinidad and Tobago,False


In [33]:
# (number of rows, number of columns) of the results table.
results_df.shape

(47598, 9)

In [16]:
# Show five randomly chosen rows of the shootouts table (differs on every run).
shootouts_df.sample(n=5)

Unnamed: 0,date,home_team,away_team,winner,first_shooter
386,2007-05-06,Guernsey,Jersey,Jersey,
360,2005-05-15,Jersey,Guernsey,Guernsey,
394,2007-07-29,Botswana,Angola,Botswana,
528,2018-07-03,Colombia,England,England,Colombia
107,1987-06-21,South Korea,Australia,South Korea,


In [17]:
# Show five randomly chosen rows of the goalscorers table (differs on every run).
goalscorers_df.sample(n=5)

Unnamed: 0,date,home_team,away_team,team,scorer,minute,own_goal,penalty
41663,2021-10-11,Croatia,Slovakia,Slovakia,Ivan Schranz,20.0,False,False
5947,1970-10-14,Denmark,Portugal,Portugal,Jacinto João,40.0,False,False
27494,2007-11-21,Hungary,Greece,Greece,Vilmos Vanczák,22.0,True,False
7993,1977-10-09,Denmark,Portugal,Portugal,Nené,35.0,False,False
28140,2008-06-20,Libya,Lesotho,Libya,Hesham Shaban,81.0,False,False


In [18]:
# Count of missing values in each column of the shootouts table.
shootouts_df.isna().sum()

date               0
home_team          0
away_team          0
winner             0
first_shooter    415
dtype: int64

In [19]:
# Count of missing values in each column of the results table.
results_df.isna().sum()

date          0
home_team     0
away_team     0
home_score    0
away_score    0
tournament    0
city          0
country       0
neutral       0
dtype: int64

In [20]:
# Count of missing values in each column of the goalscorers table.
goalscorers_df.isna().sum()

date           0
home_team      0
away_team      0
team           0
scorer        49
minute       259
own_goal       0
penalty        0
dtype: int64

In [25]:
# Earliest date present in the shootouts table.
shootouts_df["date"].min()

Timestamp('1967-08-22 00:00:00')

In [26]:
# Earliest date present in the goalscorers table.
goalscorers_df["date"].min()

Timestamp('1916-07-02 00:00:00')

In [27]:
# Earliest date present in the results table.
results_df["date"].min()

Timestamp('1872-11-30 00:00:00')

In [28]:
# Column labels of the goalscorers table.
goalscorers_df.columns

Index(['date', 'home_team', 'away_team', 'team', 'scorer', 'minute',
       'own_goal', 'penalty'],
      dtype='object')

In [29]:
# Column labels of the shootouts table.
shootouts_df.columns

Index(['date', 'home_team', 'away_team', 'winner', 'first_shooter'], dtype='object')

In [30]:
# Column labels of the results table.
results_df.columns

Index(['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral'],
      dtype='object')

## Data ingestion

In [54]:
import neo4j
from dotenv import load_dotenv
import os 

# Load NEO4J_* settings from a local .env file (if one exists) into the
# process environment before reading them.
load_dotenv()

uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

# Fail early with a readable message instead of handing None to the driver.
if not uri:
    raise RuntimeError(
        "NEO4J_URI is not set; define it in the environment or in the .env file."
    )

driver = None
try:
    driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
    # Building the driver object does not contact the server.
    # verify_connectivity() performs a real round trip, so the success message
    # is only printed once the instance has actually answered.
    driver.verify_connectivity()
    print("Connected to Neo4j instance successfully!")
except Exception as e:
    print(f"Failed to connect to Neo4j: {e}")
    # Release whatever the half-initialised driver holds, then stop the
    # notebook here: every later cell needs a working `driver`.
    if driver is not None:
        driver.close()
    raise

Connected to Neo4j instance successfully!


In [55]:
from tqdm import tqdm
import logging

def create_indexes(session):
    """Create the node-property indexes for the graph if they do not exist yet.

    One ``CREATE INDEX IF NOT EXISTS`` statement is sent through
    ``session.run`` for each entry of the table below, in order, and a
    confirmation line is printed once all of them have been issued.

    Args:
        session: Object exposing ``run(query)``, e.g. a neo4j session.
    """
    # (Cypher variable, node label, indexed property)
    index_targets = (
        ("t", "Team", "name"),
        ("m", "Match", "id"),
        ("p", "Player", "name"),
        ("t", "Tournament", "name"),
        ("c", "City", "name"),
        ("c", "Country", "name"),
    )
    for var, label, prop in index_targets:
        session.run(f"CREATE INDEX IF NOT EXISTS FOR ({var}:{label}) ON ({var}.{prop})")
    print("Indexes created.")


def _score_or_none(value):
    """Return ``value`` as an ``int``, or ``None`` when the score is missing."""
    return None if pd.isna(value) else int(value)


def ingest_matches(session, df):
    """Load match results into the graph, 1000 rows per query.

    For each row the query merges a ``Match`` node keyed by ``id``, the home
    and away ``Team`` nodes, the ``Tournament``, ``City`` and ``Country``
    nodes, and the relationships between them. ``WON``/``LOST``/``DREW``
    relationships are added according to the score comparison.

    A row whose score is missing (NaN/NA, e.g. a fixture not yet played) is
    still ingested: its score parameters are sent as ``None`` and it receives
    no ``WON``/``LOST``/``DREW`` relationship, because the comparisons in the
    ``FOREACH`` guards evaluate to null.

    Args:
        session: Object exposing ``run(query, **params)`` whose return value
            has ``consume()``, e.g. a neo4j session.
        df: DataFrame with the columns ``date`` (datetime), ``home_team``,
            ``away_team``, ``home_score``, ``away_score``, ``tournament``,
            ``city``, ``country`` and ``neutral``.
    """
    # NOTE(review): City nodes are keyed by name only, so equally named cities
    # in different countries share one node and gain several LOCATED_IN edges.
    query = """
    UNWIND $batch AS row
    MERGE (m:Match {id: row.id})
    SET m.date = date(row.date), m.home_score = row.home_score, m.away_score = row.away_score, m.neutral = row.neutral
    MERGE (home:Team {name: row.home_team})
    MERGE (away:Team {name: row.away_team})
    MERGE (t:Tournament {name: row.tournament})
    MERGE (c:City {name: row.city})
    MERGE (country:Country {name: row.country})
    MERGE (home)-[:PLAYED_HOME]->(m)
    MERGE (away)-[:PLAYED_AWAY]->(m)
    MERGE (m)-[:PART_OF]->(t)
    MERGE (m)-[:PLAYED_IN]->(c)
    MERGE (c)-[:LOCATED_IN]->(country)
    WITH m, home, away, row.home_score AS hs, row.away_score AS as
    FOREACH(_ IN CASE WHEN hs > as THEN [1] ELSE [] END |
        MERGE (home)-[:WON]->(m)
        MERGE (away)-[:LOST]->(m)
    )
    FOREACH(_ IN CASE WHEN hs < as THEN [1] ELSE [] END |
        MERGE (away)-[:WON]->(m)
        MERGE (home)-[:LOST]->(m)
    )
    FOREACH(_ IN CASE WHEN hs = as THEN [1] ELSE [] END |
        MERGE (home)-[:DREW]->(m)
        MERGE (away)-[:DREW]->(m)
    )
    """
    batch_size = 1000
    for start in tqdm(range(0, len(df), batch_size), desc="Ingesting matches"):
        # Plain dict records avoid the per-row Series that iterrows() builds.
        records = df.iloc[start : start + batch_size].to_dict("records")
        data = [
            {
                # The id embeds the full timestamp text (e.g.
                # "1953-03-07 00:00:00"); the goal and shootout loaders build
                # the same key, so the format must not change here alone.
                "id": f"{rec['date']}_{rec['home_team']}_{rec['away_team']}",
                "date": rec["date"].strftime("%Y-%m-%d"),  # ISO text for Cypher date()
                "home_score": _score_or_none(rec["home_score"]),
                "away_score": _score_or_none(rec["away_score"]),
                "neutral": bool(rec["neutral"]),
                "home_team": rec["home_team"],
                "away_team": rec["away_team"],
                "tournament": rec["tournament"],
                "city": rec["city"],
                "country": rec["country"],
            }
            for rec in records
        ]
        # The result is lazy: consuming it makes sure the write has finished
        # and that a server-side error is raised for this batch, not later.
        session.run(query, batch=data).consume()



# Logger that ingest_goals uses to report rows and batches it could not process.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO level (basicConfig does nothing if the
# root logger already has handlers).
logging.basicConfig(level=logging.INFO)


def ingest_goals(session, df):
    """Attach goal scorers to already ingested matches, 1000 rows per query.

    For each row the query looks up the ``Match`` by id (rows whose match does
    not exist are skipped by the ``MATCH`` clause), merges the ``Player`` and
    ``Team`` nodes, and merges ``SCORED_FOR`` (player -> team) and
    ``SCORED_IN`` (player -> match) relationships.

    Processing is best effort: a row that cannot be converted is logged and
    left out, and a batch that fails is logged without stopping the run.

    Args:
        session: Object exposing ``run(query, **params)`` whose return value
            has ``consume()``, e.g. a neo4j session.
        df: DataFrame with the columns ``date`` (datetime), ``home_team``,
            ``away_team``, ``team``, ``scorer``, ``minute``, ``own_goal`` and
            ``penalty``.
    """
    # NOTE(review): SCORED_FOR is merged once per (player, team) pair, so the
    # own_goal / penalty / minute properties set on it are overwritten by every
    # later goal of that pair. SCORED_IN is merged once per (player, match), so
    # several goals by one player in one match collapse into one relationship.
    # Keeping per-goal data needs a schema change; confirm with graph consumers.
    query = """
    UNWIND $batch AS row
    MATCH (m:Match {id: row.id})
    MERGE (p:Player {name: row.scorer})
    MERGE (t:Team {name: row.team})
    MERGE (p)-[s:SCORED_FOR]->(t)
    SET s.own_goal = row.own_goal,
        s.penalty = row.penalty
    FOREACH(_ IN CASE WHEN row.minute IS NOT NULL THEN [1] ELSE [] END |
        SET s.minute = row.minute
    )
    MERGE (p)-[:SCORED_IN]->(m)
    """
    batch_size = 1000
    for i in tqdm(range(0, len(df), batch_size), desc="Ingesting goals"):
        batch = df.iloc[i : i + batch_size]
        data = []
        for _, row in batch.iterrows():
            try:
                goal_data = {
                    # Same key format as the one used when matches are ingested.
                    "id": f"{row['date']}_{row['home_team']}_{row['away_team']}",
                    # Goals without a recorded scorer share one placeholder player.
                    "scorer": (
                        row["scorer"] if not pd.isna(row["scorer"]) else "Unnamed Player"
                    ),
                    "team": row["team"],
                    "minute": (
                        float(row["minute"]) if pd.notnull(row["minute"]) else None
                    ),
                    "own_goal": bool(row["own_goal"]),
                    "penalty": bool(row["penalty"]),
                }
                data.append(goal_data)
            except Exception as e:
                logger.error(f"Error processing row: {row}")
                logger.error(f"Error details: {str(e)}")

        if data:
            try:
                # The result is lazy; consume it inside the try so that a
                # failure while executing this batch is caught and logged for
                # this batch rather than surfacing on a later call.
                session.run(query, batch=data).consume()
            except Exception as e:
                logger.error(f"Error executing batch: {str(e)}")
                logger.error(f"Problematic batch: {data}")


def ingest_shootouts(session, df):
    """Record penalty shootouts on already ingested matches, 1000 rows per query.

    For each row the query looks up the ``Match`` by id and the winning
    ``Team`` by name, merges a ``HAD_SHOOTOUT`` relationship from the match to
    that team, and stores ``winner`` on it (plus ``first_shooter`` when
    present). Rows whose match or winning team is not in the graph are skipped
    by the ``MATCH`` clauses.

    Args:
        session: Object exposing ``run(query, **params)`` whose return value
            has ``consume()``, e.g. a neo4j session.
        df: DataFrame with the columns ``date`` (datetime), ``home_team``,
            ``away_team``, ``winner`` and ``first_shooter``.
    """
    query = """
    UNWIND $batch AS row
    MATCH (m:Match {id: row.id})
    MATCH (w:Team {name: row.winner})
    MERGE (m)-[s:HAD_SHOOTOUT]->(w)
    SET s.winner = row.winner
    FOREACH(_ IN CASE WHEN row.first_shooter IS NOT NULL THEN [1] ELSE [] END |
        SET s.first_shooter = row.first_shooter
    )
    """
    batch_size = 1000
    for start in tqdm(range(0, len(df), batch_size), desc="Ingesting shootouts"):
        # Plain dict records avoid the per-row Series that iterrows() builds.
        records = df.iloc[start : start + batch_size].to_dict("records")
        data = [
            {
                # Same key format as the one used when matches are ingested.
                "id": f"{rec['date']}_{rec['home_team']}_{rec['away_team']}",
                "winner": rec["winner"],
                # NaN is not a meaningful Cypher value; send an explicit null.
                "first_shooter": (
                    rec["first_shooter"] if pd.notnull(rec["first_shooter"]) else None
                ),
            }
            for rec in records
        ]
        # The result is lazy: consuming it makes sure the write has finished
        # and that a server-side error is raised here rather than deferred.
        session.run(query, batch=data).consume()


def main():
    """Run the whole ingestion pipeline against the module-level ``driver``.

    Opens one session, creates the indexes, then ingests matches, goals and
    shootouts in that order (goals and shootouts look up the matches created
    by the first step). The completion message is printed only when every
    step succeeded; the driver is closed in all cases so a failed run does
    not leave its connections open.
    """
    try:
        with driver.session() as session:
            create_indexes(session)
            ingest_matches(session, results_df)
            ingest_goals(session, goalscorers_df)
            ingest_shootouts(session, shootouts_df)

        print("Data ingestion completed!")
    finally:
        # Runs on success and on failure alike.
        driver.close()


# Start the ingestion only when this code runs as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Indexes created.


Ingesting matches: 100%|██████████| 48/48 [03:26<00:00,  4.31s/it]
Ingesting goals: 100%|██████████| 45/45 [00:26<00:00,  1.68it/s]
Ingesting shootouts: 100%|██████████| 1/1 [00:00<00:00,  3.41it/s]

Data ingestion completed!



