In [1]:
import requests
import pandas as pd
from datetime import date

from sklearn.ensemble import RandomForestClassifier


def _fetch_json(session, url, label):
    """Return the decoded JSON body of ``url``, or ``None`` when the request fails.

    Args:
        session: Object exposing ``get(url, timeout=...)`` (a ``requests.Session``).
        url: Absolute URL to fetch.
        label: Prefix used in the printed error message.

    Returns:
        The value of ``response.json()``, or ``None`` on a non-200 status or
        on any exception raised while requesting/decoding.
    """
    try:
        response = session.get(url, timeout=10)
        if response.status_code != 200:
            print(f"{label}: Error {response.status_code}")
            return None
        return response.json()
    except Exception as e:
        # Deliberate best-effort: one failed request is reported and skipped
        # so that the remaining seasons/matchweeks can still be loaded.
        print(f"{label}: Error - {str(e)}")
        return None


def _standings_rows(payload, season_id):
    """Flatten one standings payload into a list of per-team row dicts.

    The season label is read from ``payload["season"]["id"]`` and falls back
    to ``str(season_id)`` when the payload does not carry it. Raises
    ``KeyError``/``IndexError`` if ``payload["tables"][0]`` is missing.
    """
    season_label = (payload.get("season") or {}).get("id", str(season_id))
    rows = []
    for entry in payload["tables"][0].get("entries") or []:
        # ``or {}`` also covers keys that are present but null.
        team = entry.get("team") or {}
        overall = entry.get("overall") or {}
        home = entry.get("home") or {}
        away = entry.get("away") or {}
        rows.append({
            "season": season_label,
            "position": overall.get("position"),
            "team": team.get("name"),
            "points": overall.get("points"),
            "played": overall.get("played"),
            "home.won": home.get("won"),
            "home.lost": home.get("lost"),
            "home.drawn": home.get("drawn"),
            "home.goals_for": home.get("goalsFor"),
            "home.goals_against": home.get("goalsAgainst"),
            "away.won": away.get("won"),
            "away.lost": away.get("lost"),
            "away.drawn": away.get("drawn"),
            "away.goals_for": away.get("goalsFor"),
            "away.goals_against": away.get("goalsAgainst"),
        })
    return rows


def _match_row(match, matchweek, season_id):
    """Build the flat record for one match, or ``None`` if it has no score yet.

    A match is kept only when both the home and the away ``score`` are not
    ``None``; ``outcome`` is 'H', 'A' or 'D' and ``winner`` is the winning
    team's name or 'Draw'.
    """
    home_team = match.get('homeTeam') or {}
    away_team = match.get('awayTeam') or {}

    home_score = home_team.get('score')
    away_score = away_team.get('score')
    if home_score is None or away_score is None:
        return None

    if home_score > away_score:
        outcome, winner = 'H', home_team.get('name')  # Home win
    elif home_score < away_score:
        outcome, winner = 'A', away_team.get('name')  # Away win
    else:
        outcome, winner = 'D', 'Draw'  # Draw

    return {
        'match_id': match.get('matchId'),
        'matchweek': matchweek,
        'season': season_id,
        'kickoff': match.get('kickoff'),
        'kickoff_timezone': match.get('kickoffTimezone'),
        'period': match.get('period'),
        'competition': match.get('competition'),
        'venue': match.get('ground'),
        'attendance': match.get('attendance'),
        'clock': match.get('clock'),
        'result_type': match.get('resultType'),

        # Home team info
        'home_team': home_team.get('name'),
        'home_team_id': home_team.get('id'),
        'home_team_short': home_team.get('shortName'),
        'home_score': home_score,
        'home_half_time_score': home_team.get('halfTimeScore'),
        'home_red_cards': home_team.get('redCards'),

        # Away team info
        'away_team': away_team.get('name'),
        'away_team_id': away_team.get('id'),
        'away_team_short': away_team.get('shortName'),
        'away_score': away_score,
        'away_half_time_score': away_team.get('halfTimeScore'),
        'away_red_cards': away_team.get('redCards'),

        'outcome': outcome,
        'winner': winner,
    }


def _last_played_matchweek(as_of, season_start, total_matchweeks):
    """Estimate the last matchweek to request for an in-progress season.

    Uses one matchweek per elapsed week since ``season_start`` minus one,
    clamped to ``0..total_matchweeks`` so that dates before the season start
    or after its end never produce an invalid matchweek number.
    """
    elapsed_weeks = round((as_of - season_start).days / 7) - 1
    return max(0, min(total_matchweeks, elapsed_weeks))


def load_data(*args, **kwargs):
    """
    Load EPL match outcomes and league standings from the Premier League API.

    Fetches the 2025 season and the four seasons before it. For every season
    the standings table is requested once and the matches are requested one
    matchweek at a time. A failed request is printed and skipped; a failed
    standings request does not prevent that season's matches from loading.

    Positional arguments are accepted and ignored.

    Keyword Args:
        as_of (datetime.date, optional): Reference "today" used to estimate
            how many matchweeks of the 2025 season to request. Defaults to
            ``date.today()``. Other keyword arguments are ignored.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: ``(matches_df, standings_df)``.
        ``matches_df`` has one row per match that has both scores, including
        ``outcome``/``winner`` and, when rows exist, a parsed
        ``kickoff_datetime`` column. ``standings_df`` has one row per team
        per season.
    """

    season = 2025
    past_years = 5
    start_matchweek = 1
    matchweeks_per_season = 38
    # Opening day of the 2025 season; only used for the newest season.
    season_2025_start_date = date(2025, 8, 15)
    as_of = kwargs.get("as_of") or date.today()

    api_root = "https://sdp-prem-prod.premier-league-prod.pulselive.com/api"

    all_matches = []
    all_standings = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # A single session reuses the connection across the ~165 requests.
    with requests.Session() as session:
        session.headers.update(headers)

        for year in range(past_years):
            season_id = season - year

            standings = _fetch_json(
                session,
                f"{api_root}/v5/competitions/8/seasons/{season_id}/standings?live=false",
                f"Standings {season_id}",
            )
            if standings is not None:
                try:
                    all_standings.extend(_standings_rows(standings, season_id))
                except Exception as e:
                    print(f"Standings {season_id}: Error - {str(e)}")

            if year == 0:
                # Newest season may still be in progress: skip future matchweeks.
                end_matchweek = _last_played_matchweek(
                    as_of, season_2025_start_date, matchweeks_per_season
                )
            else:
                end_matchweek = matchweeks_per_season

            for matchweek in range(start_matchweek, end_matchweek + 1):
                data = _fetch_json(
                    session,
                    f"{api_root}/v1/competitions/8/seasons/{season_id}/matchweeks/{matchweek}/matches",
                    f"Matchweek {matchweek}",
                )
                if data is None:
                    continue

                try:
                    for match in data.get('data') or []:
                        try:
                            match_info = _match_row(match, matchweek, season_id)
                        except Exception as e:
                            print(f"Error processing match in matchweek {matchweek}: {str(e)}")
                            continue
                        if match_info is not None:
                            all_matches.append(match_info)
                except Exception as e:
                    print(f"Matchweek {matchweek}: Error - {str(e)}")
                    continue

    matches_df = pd.DataFrame(all_matches)

    # Convert kickoff to datetime
    if 'kickoff' in matches_df.columns and len(matches_df) > 0:
        matches_df['kickoff_datetime'] = pd.to_datetime(matches_df['kickoff'], errors='coerce')

    if len(matches_df) > 0:
        # Report the range that was actually loaded, not a loop variable.
        first_week = matches_df['matchweek'].min()
        last_week = matches_df['matchweek'].max()
        print(f"\n✓ Successfully loaded {len(matches_df)} matches from matchweeks {first_week}-{last_week}")
        print(f"\nSample data:")
        print(matches_df[['matchweek', 'home_team', 'away_team', 'home_score', 'away_score', 'outcome']].head(5))
    else:
        print("\nNo completed matches were loaded")

    standings_df = pd.DataFrame(all_standings)

    return matches_df, standings_df



In [10]:
  # Run the loader: `m` receives the matches DataFrame and `s` the standings
  # DataFrame, in the order returned by load_data().
  m,s = load_data()


✓ Successfully loaded 1599 matches from matchweeks 1-38

Sample data:
   matchweek                 home_team         away_team  home_score  \
0          1                 Liverpool       Bournemouth           4   
1          1               Aston Villa  Newcastle United           0   
2          1  Brighton and Hove Albion            Fulham           1   
3          1         Nottingham Forest         Brentford           3   
4          1                Sunderland   West Ham United           3   

   away_score outcome  
0           2       H  
1           0       D  
2           1       D  
3           1       H  
4           0       H  


In [3]:
# Display the standings DataFrame (second value returned by load_data()).
s

Unnamed: 0,season,position,team,points,played,home.won,home.lost,home.drawn,home.goals_for,home.goals_against,away.won,away.lost,away.drawn,away.goals_for,away.goals_against
0,2025,1,Arsenal,22,9,4,0,1,12,1,3,1,0,4,2
1,2025,2,Bournemouth,18,9,4,0,1,8,2,1,1,2,8,9
2,2025,3,Tottenham Hotspur,17,9,1,2,1,5,4,4,0,1,12,3
3,2025,4,Sunderland,17,9,3,0,1,8,2,2,2,1,3,5
4,2025,5,Manchester City,16,9,3,1,0,10,3,2,2,1,7,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2021,16,Everton,39,38,9,8,2,27,25,2,13,4,16,41
96,2021,17,Leeds United,38,38,4,9,6,19,38,5,9,5,23,41
97,2021,18,Burnley,35,38,5,8,6,18,25,2,9,8,16,28
98,2021,19,Watford,23,38,2,15,2,17,46,4,12,3,17,31


In [4]:
# Display the matches DataFrame (first value returned by load_data()).
m

Unnamed: 0,match_id,matchweek,season,kickoff,kickoff_timezone,period,competition,venue,attendance,clock,...,home_red_cards,away_team,away_team_id,away_team_short,away_score,away_half_time_score,away_red_cards,outcome,winner,kickoff_datetime
0,2561895,1,2025,2025-08-15 20:00:00,BST,FullTime,Premier League,"Anfield, Liverpool",60315.0,97,...,0,Bournemouth,91,Bournemouth,2,0,0,H,Liverpool,2025-08-15 20:00:00
1,2561896,1,2025,2025-08-16 12:30:00,BST,FullTime,Premier League,"Villa Park, Birmingham",42526.0,97,...,1,Newcastle United,4,Newcastle,0,0,0,D,Draw,2025-08-16 12:30:00
2,2561897,1,2025,2025-08-16 15:00:00,BST,FullTime,Premier League,"American Express Stadium, Falmer",31478.0,98,...,0,Fulham,54,Fulham,1,0,0,D,Draw,2025-08-16 15:00:00
3,2561898,1,2025,2025-08-17 14:00:00,BST,FullTime,Premier League,"The City Ground, Nottingham",29949.0,96,...,0,Brentford,94,Brentford,1,0,0,H,Nottingham Forest,2025-08-17 14:00:00
4,2561899,1,2025,2025-08-16 15:00:00,BST,FullTime,Premier League,"Stadium of Light, Sunderland",46233.0,98,...,0,West Ham United,21,West Ham,0,0,0,H,Sunderland,2025-08-16 15:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,2210646,38,2021,2022-05-22 16:00:00,BST,FullTime,Premier League,"Selhurst Park, London",25434.0,97,...,0,Manchester United,1,Man Utd,0,0,0,H,Crystal Palace,2022-05-22 16:00:00
1595,2210647,38,2021,2022-05-22 16:00:00,BST,FullTime,Premier League,"King Power Stadium, Leicester",32003.0,97,...,0,Southampton,20,Southampton,1,0,0,H,Leicester City,2022-05-22 16:00:00
1596,2210648,38,2021,2022-05-22 16:00:00,BST,FullTime,Premier League,"Anfield, Liverpool",53097.0,94,...,0,Wolverhampton Wanderers,39,Wolves,1,1,0,H,Liverpool,2022-05-22 16:00:00
1597,2210649,38,2021,2022-05-22 16:00:00,BST,FullTime,Premier League,"Etihad Stadium, Manchester",53395.0,95,...,0,Aston Villa,7,Aston Villa,2,1,0,H,Manchester City,2022-05-22 16:00:00
