In [None]:
from data_loader import *
from utils import *

from concurrent.futures import ThreadPoolExecutor,as_completed
from itertools import product

In [None]:
# Load the leagues table from the parquet file under home_dir, then preview
# the identifying columns.
leagues_path = home_dir + "/data/Leagues/leagues.parquet"
leagues_dat = get_leagues(leagues_path)
preview_cols = ['league_id', 'league_name', 'country_name']
leagues_dat[preview_cols].head()

In [None]:
# Read the team -> league mapping and attach league details to every team row.
# A left join keeps teams whose league id has no match in leagues_dat.
team_league_path = home_dir + "/data/Teams/team_league.parquet"
teams_data = pd.read_parquet(team_league_path).merge(
    leagues_dat,
    how="left",
    left_on="league",
    right_on="league_id",
)
teams_data.head()


In [None]:
# Rows whose league name contains "Premier League".
# na=False makes a missing league_name (possible after the left merge that
# built teams_data) count as "no match"; without it the mask contains NaN and
# boolean indexing raises.
teams_data[teams_data.league_name.str.contains("Premier League", na=False)]

In [None]:
# Pick the teams to pull: either every team in teams_data, or only the teams
# registered under league_id 39.
read_all_teams = False

if read_all_teams:
    # All teams:
    team_names = teams_data['team_name']
else:
    # Single league only:
    team_names = teams_data.loc[teams_data.league_id == 39, 'team_name']
unique_teams = team_names.unique()

print(f"Number of Teams to pull: {len(unique_teams)} ")
# Rough request budget: number of teams * 38 * 0.8, truncated to an int.
expected_requests = int(len(unique_teams) * 38 * .8)
print("Expected API requests: ", expected_requests)

In [None]:
# Teams and seasons to fetch; every (team, season) combination becomes one job.
teams = ['Wolves', 'Brighton']  # unique_teams
season = [2025]
team_season_pair = [*product(teams, season)]

# NOTE: this loop rebinds `team` and `season` to the values of the last pair,
# so `season` is no longer the list defined above once the cell has run.
for team, season in team_season_pair:
    print(team, season)

In [None]:
# Fetch fixtures for every (team, season) pair, either on a thread pool or one
# pair at a time. Pairs that finish are recorded in visited_list, pairs that
# raise are recorded in error_list so they can be retried later.
# NOTE(review): read_fixtures_for_season must already be defined in the kernel
# (from an import or from the cell that defines it) before this cell is run.
parallel = False
workers = 3
visited_list = list()
error_list = list()

if parallel:
    print(f"Running in parallel with {workers} workers")
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(read_fixtures_for_season,t,s) : (t,s) for t,s in team_season_pair}
        # Collect every result: without this the outcome of each job (and any
        # exception raised inside a worker) would be silently discarded.
        for future in as_completed(futures):
            pair = futures[future]
            try:
                dat = future.result()
                visited_list.append(pair)
            except Exception as exc:
                print(f"{pair[0]} processing encountered errors: {exc!r}")
                error_list.append(pair)

else:
    print("Running Serially")
    for team,season in team_season_pair:
        try:
            dat = read_fixtures_for_season(team,season,sleep_time=20)
            visited_list.append((team,season))
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts
            # the run, and show what went wrong instead of hiding it.
            print(f"{team} processing encountered errors: {exc!r}")
            error_list.append((team,season))

In [None]:
# Show the (team, season) pairs whose fetch completed without raising.
visited_list

In [None]:
# Ad-hoc re-run for a single pair.
# NOTE(review): `team` and `season` are whatever values are currently bound in
# the kernel (e.g. left over from an earlier loop or assignment cell), so the
# result depends on which cells were executed before this one.
dat = read_fixtures_for_season(team,season,sleep_time=20)

# Debugging cells follow.

In [None]:
# Build the retry list: only the (team, season) pairs not fetched yet.
# visited_list holds (team, season) tuples, so whole pairs are compared;
# subtracting those tuples from a set of bare team names never removes anything.
season = [2025]
visited_pairs = set(visited_list)
team_season_pair = [pair for pair in product(teams,season) if pair not in visited_pairs]
# Keep `teams` in sync with the remaining pairs (original order, no duplicates).
teams = list(dict.fromkeys(t for t, _ in team_season_pair))

team_season_pair



In [None]:
# Debugging 
def _flatten_player_stats(team_player_data):
    """
    team_player_data: list of player entries taken from one team's 'players' array
    Returns a DataFrame with one row per player and one scalar column per statistic
    """
    player_details = []
    for p in team_player_data:
        details = {'player.id': p['player']['id'], 'player.name': p['player']['name']}
        if p.get('statistics'):
            details.update(p['statistics'][0])
        player_details.append(details)

    if not player_details:
        return pd.DataFrame()

    # NOTE(review): the statistics entry appears to group values in nested
    # dicts (e.g. a 'shots' dict) - confirm against the API docs. json_normalize
    # turns each nested group into scalar '<group>_<stat>' columns such as
    # 'shots_on' or 'dribbles_success', which the feature engineering in
    # read_fixtures_for_season reads. Top-level keys ('player.id', ...) are
    # left untouched.
    return pd.json_normalize(player_details, sep='_')


def read_fixtures_for_season(team,season,sleep_time = 20):
    """
    team: Name of team
    season: int, year of the season start
    sleep_time: seconds to wait after every API request
    Returns data for the season for the specified team with some engineered features

    One row per player per fixture. Fixtures whose API response is empty or has
    no player data are skipped with a warning. The result is also written to
    home_dir/data/Fixtures/<team>_<season>.parquet (spaces in the team name
    replaced by underscores). Errors from the HTTP request, JSON decoding or
    the parquet write propagate to the caller.
    """
    print(f"processing for {team}, {season}")
    fixtures = get_team_fixtures(team,season)

    home_fixtures = list(fixtures[fixtures.teams_home_name == team]['fixture_id'])
    away_fixtures = list(fixtures[fixtures.teams_away_name == team]['fixture_id'])
    # Set membership instead of scanning the list for every fixture.
    home_fixture_ids = set(home_fixtures)

    all_fixtures_data = []

    for fixture in home_fixtures + away_fixtures:
        is_home = fixture in home_fixture_ids

        player_stat_url = "https://v3.football.api-sports.io/fixtures/players?fixture={}".format(fixture)
        # A timeout keeps one stalled connection from hanging the whole run.
        fixture_response = requests.get(player_stat_url,headers=headers_api_sport,timeout=30)
        # Throttle between requests.
        time.sleep(sleep_time)

        response_data = fixture_response.json().get('response')

        if not isinstance(response_data, list) or len(response_data) == 0:
            print(f"Warning: Invalid or empty response for fixture {fixture}. Skipping.")
            continue

        # Assumes the API lists the home side first and the away side second
        # - TODO confirm against the API documentation.
        team_index = 0 if is_home else 1

        if len(response_data) <= team_index or 'players' not in response_data[team_index] or not response_data[team_index]['players']:
            print(f"Warning: No player data for team index {team_index} in fixture {fixture}. Skipping.")
            continue

        fixture_dat_expanded = _flatten_player_stats(response_data[team_index]['players'])

        if fixture_dat_expanded.empty:
            print(f"Warning: No player details extracted for fixture {fixture}. Skipping.")
            continue

        # Filter the fixtures table once per fixture instead of once per column.
        match = fixtures[fixtures.fixture_id == fixture]
        # Column suffixes for the requested team's side and for the opponent's side.
        own_side, opp_side = ('home', 'away') if is_home else ('away', 'home')
        goals_for = match[f'goals_{own_side}'].values[0]
        goals_against = match[f'goals_{opp_side}'].values[0]

        fixture_dat_expanded['fixture_id'] = fixture
        fixture_dat_expanded['team_goals_scored'] = goals_for
        # A missing score_penalty_* value is treated as 0.
        fixture_dat_expanded['team_non_penalty_goals_scored'] = goals_for - match[f'score_penalty_{own_side}'].fillna(0).values[0]
        fixture_dat_expanded['team_goals_scored_half'] = match[f'score_halftime_{own_side}'].values[0]
        fixture_dat_expanded['team_goals_conceded'] = goals_against
        fixture_dat_expanded['team_non_penalty_goals_conceded'] = goals_against - match[f'score_penalty_{opp_side}'].fillna(0).values[0]
        fixture_dat_expanded['team_goals_conceded_half'] = match[f'score_halftime_{opp_side}'].values[0]
        fixture_dat_expanded['opponent'] = match[f'teams_{opp_side}_name'].values[0]

        # adding team winner
        fixture_dat_expanded['fixture_date'] = match['fixture_date'].values[0]
        fixture_dat_expanded['team_winner'] = str(match['winner'].values[0])
        fixture_dat_expanded['team'] = team
        all_fixtures_data.append(fixture_dat_expanded)

    if not all_fixtures_data:
        fixtures_dat = pd.DataFrame()
    else:
        fixtures_dat = pd.concat(all_fixtures_data, axis=0)

    fixtures_dat = lower_columns(fixtures_dat)

    print(fixtures_dat.columns)

    if not fixtures_dat.empty:
        # Outcome: 'win' when the winner equals this team, 'draw' when the
        # winner is the string 'Draw', otherwise 'loss'.
        fixtures_dat['outcome'] = np.where(fixtures_dat.team == fixtures_dat.team_winner,'win',np.where(fixtures_dat.team_winner == 'Draw','draw','loss'))

        # feature engineering: a zero denominator becomes NaN so the ratio is
        # NaN rather than inf or a division error.
        fixtures_dat['dribble_success_rate'] = (fixtures_dat.dribbles_success.astype("float64")/fixtures_dat.dribbles_attempts.astype("float64").replace(0, np.nan)) * 100
        fixtures_dat['target_shot_conversion_perc'] = (fixtures_dat.goals_total.astype("float64")/fixtures_dat.shots_on.astype("float64").replace(0, np.nan)) * 100
        fixtures_dat['duels_won_perc'] = (fixtures_dat.duels_won.astype("float64")/fixtures_dat.duels_total.astype("float64").replace(0, np.nan)) * 100
        fixtures_dat['pass_accuracy_perc'] = (fixtures_dat.passes_accuracy.astype("float64")/ fixtures_dat.passes_total.astype("float64").replace(0, np.nan)) * 100

    fixtures_dat.to_parquet(home_dir+f"/data/Fixtures/{team.replace(' ','_')}_{str(season)}.parquet")

    return fixtures_dat




In [None]:
# Peek at the first three (team, season) pairs.
team_season_pair[0:3]

In [None]:
# Inputs for the step-by-step debugging cells below.
team, season = 'Chelsea', 2025

In [None]:
# Fetch the season's fixtures for the debug team and split the fixture ids by
# whether the team played at home or away.
print(f"processing for {team}, {season}")
fixtures = get_team_fixtures(team, season)

played_at_home = fixtures.teams_home_name == team
played_away = fixtures.teams_away_name == team
home_fixtures = list(fixtures.loc[played_at_home, 'fixture_id'])
away_fixtures = list(fixtures.loc[played_away, 'fixture_id'])

In [None]:
# Fixture ids in which the debug team is the home side.
home_fixtures

In [None]:
# Fixture id to inspect and the pause (in seconds) applied after its request.
test_fixture, sleep_time = 1378976, 5

In [None]:

# Fetch the player statistics for the single test fixture and pick out the
# debug team's 'players' list.
player_stat_url = "https://v3.football.api-sports.io/fixtures/players?fixture={}".format(test_fixture)
fixture_response = requests.get(player_stat_url,headers=headers_api_sport)
time.sleep(sleep_time)

response_json = fixture_response.json()
response_data = response_json.get('response')

# Assumes the home side is listed first - TODO confirm against the API docs.
team_index = 0 if test_fixture in home_fixtures else 1

# Default to an empty list so a bad response produces a warning instead of an
# exception when the response is indexed. The warnings name test_fixture,
# since `fixture` is not defined at notebook level.
team_player_data = []
if not isinstance(response_data, list) or len(response_data) == 0:
    print(f"Warning: Invalid or empty response for fixture {test_fixture}. Skipping.")
elif len(response_data) <= team_index or 'players' not in response_data[team_index] or not response_data[team_index]['players']:
    print(f"Warning: No player data for team index {team_index} in fixture {test_fixture}. Skipping.")
else:
    team_player_data = response_data[team_index]['players']

In [None]:
# Inspect the raw per-player entries returned for the debug team.
team_player_data

In [None]:
# Build one record per player: id and name plus the first statistics entry
# (when one exists).
player_details = []
for p in team_player_data:
    details = {'player.id': p['player']['id'], 'player.name': p['player']['name']}
    if p.get('statistics'):
        details.update(p['statistics'][0])
    player_details.append(details)

if not player_details:
    # The warning names test_fixture; `fixture` is not defined at notebook level.
    print(f"Warning: No player details extracted for fixture {test_fixture}. Skipping.")
    

In [None]:
# Tabulate the extracted player records for inspection.
player_test = pd.DataFrame(player_details)

In [None]:
# Build the per-player frame for the test fixture and attach the match-level
# columns, from the point of view of `team`.
fixture_dat_expanded = pd.DataFrame(player_details)

# Filter the fixtures table once instead of once per column.
test_match = fixtures[fixtures.fixture_id == test_fixture]
# Column suffixes for the debug team's side and for the opponent's side.
own_side, opp_side = ('home', 'away') if test_fixture in home_fixtures else ('away', 'home')
goals_for = test_match[f'goals_{own_side}'].values[0]
goals_against = test_match[f'goals_{opp_side}'].values[0]

fixture_dat_expanded['fixture_id'] = test_fixture
fixture_dat_expanded['team_goals_scored'] = goals_for
# A missing score_penalty_* value is treated as 0.
fixture_dat_expanded['team_non_penalty_goals_scored'] = goals_for - test_match[f'score_penalty_{own_side}'].fillna(0).values[0]
fixture_dat_expanded['team_goals_scored_half'] = test_match[f'score_halftime_{own_side}'].values[0]
fixture_dat_expanded['team_goals_conceded'] = goals_against
fixture_dat_expanded['team_non_penalty_goals_conceded'] = goals_against - test_match[f'score_penalty_{opp_side}'].fillna(0).values[0]
fixture_dat_expanded['team_goals_conceded_half'] = test_match[f'score_halftime_{opp_side}'].values[0]
fixture_dat_expanded['opponent'] = test_match[f'teams_{opp_side}_name'].values[0]

# adding team winner
fixture_dat_expanded['fixture_date'] = test_match['fixture_date'].values[0]
fixture_dat_expanded['team_winner'] = str(test_match['winner'].values[0])
fixture_dat_expanded['team'] = team

# Start a fresh list: all_fixtures_data is not defined at notebook level, and
# appending would add a duplicate frame each time this cell is re-run.
all_fixtures_data = [fixture_dat_expanded]

In [None]:
# Inspect the per-player frame built for the test fixture.
fixture_dat_expanded

In [None]:
# Stack the per-fixture frames (or fall back to an empty frame when there are
# none), normalise the column names, and list the resulting columns.
fixtures_dat = pd.concat(all_fixtures_data, axis=0) if all_fixtures_data else pd.DataFrame()
fixtures_dat = lower_columns(fixtures_dat)

print(fixtures_dat.columns)

In [None]:
# Preview the combined fixture data.
fixtures_dat.head()

In [None]:

# Column names of the combined fixture data.
fixtures_dat.columns

In [None]:
# Columns present in complete_data but absent from fixtures_dat.
# NOTE(review): complete_data is only loaded by a cell further down the
# notebook, so this cell fails on a fresh top-to-bottom run.
set(complete_data.columns) - set(fixtures_dat.columns)

In [None]:
# Expand each value of the `shots` column into its own columns.
# NOTE(review): presumably each cell holds a dict of shot statistics - verify.
fixtures_dat.shots.apply(pd.Series)

In [None]:
# Derive the match outcome and the percentage features, but only when there is
# data to work on.
if not fixtures_dat.empty:
    # Outcome: 'win' when the winner equals the team, 'draw' when the winner is
    # the string 'Draw', otherwise 'loss'.
    team_won = fixtures_dat.team == fixtures_dat.team_winner
    was_draw = fixtures_dat.team_winner == 'Draw'
    fixtures_dat['outcome'] = np.where(team_won, 'win', np.where(was_draw, 'draw', 'loss'))

    # feature engineering: each feature is numerator / denominator * 100, with
    # a zero denominator replaced by NaN so the ratio comes out as NaN.
    dribbles_made = fixtures_dat.dribbles_success.astype("float64")
    dribbles_tried = fixtures_dat.dribbles_attempts.astype("float64").replace(0, np.nan)
    fixtures_dat['dribble_success_rate'] = (dribbles_made / dribbles_tried) * 100

    goals_scored = fixtures_dat.goals_total.astype("float64")
    shots_on_target = fixtures_dat.shots_on.astype("float64").replace(0, np.nan)
    fixtures_dat['target_shot_conversion_perc'] = (goals_scored / shots_on_target) * 100

    duels_won = fixtures_dat.duels_won.astype("float64")
    duels_played = fixtures_dat.duels_total.astype("float64").replace(0, np.nan)
    fixtures_dat['duels_won_perc'] = (duels_won / duels_played) * 100

    passes_accurate = fixtures_dat.passes_accuracy.astype("float64")
    passes_played = fixtures_dat.passes_total.astype("float64").replace(0, np.nan)
    fixtures_dat['pass_accuracy_perc'] = (passes_accurate / passes_played) * 100


In [None]:
# Load every row of overperformxg.complete_data, connecting with the string
# stored under config['MYSQL_STRING'].
complete_data = pd.read_sql("select * from overperformxg.complete_data",config['MYSQL_STRING'])

In [None]:
# Column names of the table loaded from the database.
complete_data.columns

In [None]:
# Preview the fixture data again, for comparison with complete_data.
fixtures_dat.head()