In [1]:
from pathlib import Path
import pandas as pd

# -------------------------
# Locations
# -------------------------
# Assumes the kernel's working directory is notebooks/, so its parent is
# the project root.
project_root = Path.cwd().parent
processed_dir = project_root.joinpath("data_processed")

# Season label -> file name of that season's processed match list.
files = {
    "21-22": "pl_matchlist_21-22_v2.csv",
    "22-23": "pl_matchlist_22-23_v2.csv",
    "23-24": "pl_matchlist_23-24_v2.csv",
    "24-25": "pl_matchlist_24-25_v2.csv",
}

# -------------------------
# 1) Read every season and gather its team names
# -------------------------
dfs = {}              # season label -> DataFrame read from the CSV
teams_by_season = {}  # season label -> sorted list of distinct team names

for season, fname in files.items():
    path = processed_dir.joinpath(fname)
    df = pd.read_csv(path)
    dfs[season] = df

    # Distinct, non-null values of the "Team" column in sorted order.
    team_column = df["Team"].dropna()
    teams = sorted(team_column.unique())
    teams_by_season[season] = teams
    print(season, "teams:", len(teams))

# Peek at the first ten names of the first season (optional)
teams_by_season["21-22"][:10]


21-22 teams: 20
22-23 teams: 20
23-24 teams: 20
24-25 teams: 20


['Arsenal',
 'Aston Villa',
 'Brentford',
 'Brighton & Hove Albion',
 'Burnley',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Leeds United',
 'Leicester City']

In [2]:
# Season label -> set of team names seen in that season.
sets = {season_key: set(names) for season_key, names in teams_by_season.items()}

# Every team name that appears in at least one season.
all_teams = {team for season_set in sets.values() for team in season_set}

# Teams in the union that a given season lacks (helps tell naming issues
# apart from promotion/relegation).
missing = {key: sorted(all_teams - members) for key, members in sets.items()}

# Teams a season has that the union lacks. Every season set is a subset of
# the union, so each of these lists is always empty.
extra = {key: sorted(members - all_teams) for key, members in sets.items()}

# Compact per-season summary
for s in sets:
    absent = missing[s]
    print(f"\nSeason {s}")
    print("Unique teams:", len(sets[s]))
    print("Missing vs union:", len(absent))
    if len(absent) > 10:
        # Long lists are not printed, to keep the summary compact.
        continue
    print("Missing:", absent)



Season 21-22
Unique teams: 20
Missing vs union: 6
Missing: ['AFC Bournemouth', 'Fulham', 'Ipswich Town', 'Luton Town', 'Nottingham Forest', 'Sheffield United']

Season 22-23
Unique teams: 20
Missing vs union: 6
Missing: ['Burnley', 'Ipswich Town', 'Luton Town', 'Norwich City', 'Sheffield United', 'Watford']

Season 23-24
Unique teams: 20
Missing vs union: 6
Missing: ['Ipswich Town', 'Leeds United', 'Leicester City', 'Norwich City', 'Southampton', 'Watford']

Season 24-25
Unique teams: 20
Missing vs union: 6
Missing: ['Burnley', 'Leeds United', 'Luton Town', 'Norwich City', 'Sheffield United', 'Watford']


In [3]:
import difflib

# Compare each season against the union to find close matches
def find_close(name, choices, n=5, cutoff=0.85):
    """Return the entries of ``choices`` that closely resemble ``name``.

    Thin wrapper around ``difflib.get_close_matches``.

    Parameters
    ----------
    name : the string to look up.
    choices : the candidate strings to compare against.
    n : maximum number of matches to return.
    cutoff : similarity threshold forwarded to difflib; candidates scoring
        below it are dropped.

    Returns
    -------
    list
        The list produced by ``difflib.get_close_matches`` (best match first).
    """
    close_matches = difflib.get_close_matches(name, choices, n=n, cutoff=cutoff)
    return close_matches

# Sorted list of every team name seen in any season.
union_list = list(all_teams)
union_list.sort()

# (season, team, matches) for every name that has more than one close match.
suspects = []
for s, team_list in teams_by_season.items():
    for t in team_list:
        matches = find_close(t, union_list, n=5, cutoff=0.90)
        # The name itself is in union_list, so a single hit is just the
        # name matching itself; only extra hits hint at aliasing.
        if len(matches) <= 1:
            continue
        suspects.append((s, t, matches))

# Preview the first entries (if any)
suspects[:20]


[]