In [27]:
import pandas as pd
from soccerdata import Understat       
import csv
from soccerdata import ClubElo

In [None]:
import pandas as pd
from soccerdata import Understat



# 0) EPL 2015~2025 경기 로드  
LEAGUE  = ["ENG-Premier League"]
SEASONS = range(2024, 2026)

us = Understat(leagues=LEAGUE, seasons=SEASONS)
m = us.read_team_match_stats()[[
    'game_id', 'date',
    'home_team', 'away_team',
    'home_goals', 'away_goals',
    'home_xg',    'away_xg',
    'home_points','away_points'
]]
m['date'] = pd.to_datetime(m['date'])

# 1) 홈·원정 → long 포맷으로 변환
home = m.assign(
    team   = m['home_team'],
    goals  = m['home_goals'],
    GA     = m['away_goals'],        # ← 상대 득점 = 내 실점
    xg     = m['home_xg'],
    points = m['home_points'],
    side   = 'home'
)[['game_id','date','team','side','goals','GA', 'xg','points']]

away = m.assign(
    team   = m['away_team'],
    goals  = m['away_goals'],
    GA     = m['home_goals'],        # ← 상대(홈) 득점
    xg     = m['away_xg'],
    points = m['away_points'],
    side   = 'away'
)[['game_id', 'date', 'team', 'side', 'goals', 'GA', 'xg','points']]

long_df = pd.concat([home, away], ignore_index=True)



# 2) 팀별 롤링 (최근 3·5 경기) + 최신 xG 추출
def add_rolling(g):
    g = g.sort_values('date', ascending=False)
    for w in (3, 5):
        g[f'GF{w}']    = g['goals'].rolling(w, min_periods=1).sum().shift(1)    # 득점
        g[f'GA{w}']   = g['GA'].rolling(w, min_periods=1).sum().shift(1)        # 실점
        g[f'Form{w}']  = g['points'].rolling(w, min_periods=1).sum().shift(1)   # 점수
    g['rolling_xg_5'] = g['xg'].rolling(5, min_periods=1).mean()

    # 가장 최근 경기의 xg
    g['current_xg'] = g['xg']
    return g

long_roll = (
    long_df
    .groupby('team', group_keys=False)
    .apply(add_rolling)
)

# 3) 다시 홈·원정 wide 포맷
cols_keep = ['game_id','date']
home_cols = ['rolling_xg_5','Form3','Form5','GF3','GF5','GA3','GA5', 'current_xg']
away_cols = home_cols            # 이름은 동일, 접두사 달라짐

home_w = (
    long_roll[long_roll['side'] == 'home']
    .loc[:, cols_keep + ['team'] + home_cols]
    .rename(columns={
        'team': 'HomeTeam',                          # 팀 이름 복사
        **{c: f'home_{c}' for c in home_cols}
    })
)

away_w = (
    long_roll[long_roll['side'] == 'away']
    .loc[:, cols_keep + ['team'] + away_cols]
    .rename(columns={
        'team': 'AwayTeam',
        **{c: f'away_{c}' for c in away_cols}
    })
)

feat_df = home_w.merge(away_w, on=['game_id', 'date'], how='inner')



# 4) 컬럼 이름 일괄 매핑
rename_map = {}

for base in ['Form3','Form5','GF3','GF5','GA3','GA5']:   
    rename_map[f'home_{base}'] = f'{base}Home'
    rename_map[f'away_{base}'] = f'{base}Away'

rename_map.update({
    'home_rolling_xg_5': 'rolling_xg_home_5',
    'away_rolling_xg_5': 'rolling_xg_away_5',
    'home_current_xg'  : 'h_xg',
    'away_current_xg'  : 'a_xg'
})

feat_df = feat_df.rename(columns=rename_map)

# 원하는 순서로 컬럼 재배치
cols = [
    'game_id','date','HomeTeam','AwayTeam',         
    'rolling_xg_home_5','rolling_xg_away_5',
    'Form3Home','Form5Home','Form3Away','Form5Away',
    'GF3Home','GF5Home','GF3Away','GF5Away',
    'GA3Home','GA5Home','GA3Away','GA5Away',
    'h_xg','a_xg'
]

# 컬럼 재배치
feat_df = feat_df[cols]

# 결측치 처리
feat_df.fillna(0)


In [36]:
feat_df.head()

Unnamed: 0,game_id,date,HomeTeam,AwayTeam,rolling_xg_home_5,rolling_xg_away_5,Form3Home,Form5Home,Form3Away,Form5Away,GF3Home,GF5Home,GF3Away,GF5Away,GA3Home,GA5Home,GA3Away,GA5Away,h_xg,a_xg
0,26962,2025-05-18 15:30:00,Arsenal,Newcastle United,1.783345,1.70272,3.0,3.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.15958,2.24634
1,26945,2025-05-03 16:30:00,Arsenal,Bournemouth,1.95237,1.531763,7.0,7.0,3.0,3.0,5.0,5.0,3.0,3.0,3.0,3.0,4.0,4.0,2.05484,1.41783
2,26933,2025-04-23 19:00:00,Arsenal,Crystal Palace,1.946194,2.385978,4.0,7.0,7.0,8.0,4.0,6.0,7.0,8.0,4.0,5.0,3.0,4.0,1.92149,2.50515
3,26913,2025-04-12 16:30:00,Arsenal,Brentford,1.949054,1.994485,4.0,8.0,9.0,12.0,7.0,10.0,10.0,13.0,4.0,6.0,5.0,8.0,0.767369,0.588583
4,26892,2025-04-01 19:45:00,Arsenal,Fulham,1.841494,1.30869,5.0,6.0,3.0,6.0,6.0,9.0,4.0,6.0,2.0,6.0,5.0,7.0,2.30189,1.44206


In [None]:


def get_presume_data(home_team: str, 
                    away_team: str, 
                    df: pd.DataFrame = feat_df
                    ) -> pd.DataFrame:
    """
    ▶ home_team, away_team 을 입력받아
      feat_df 와 동일한 구조·컬럼의 1행 DataFrame 을 반환한다.
    """
    # ── 1) 각 팀의 ‘가장 최근 경기’ 행 찾기 ─────────────────
    home_row = (
        df.query("HomeTeam == @home_team")
          .sort_values('date', ascending=False)
          .head(1)
    )
    away_row = (
        df.query("AwayTeam == @away_team")
          .sort_values('date', ascending=False)
          .head(1)
    )
    
    if home_row.empty or away_row.empty:
        raise ValueError("입력 팀의 최근 경기 행을 찾을 수 없습니다.")

    # ── 2) 홈/원정용 컬럼 추출 ───────────────────────────────
    # home-prefixed & away-prefixed 컬럼 자동 식별
    home_cols = [c for c in df.columns if c.endswith('Home')] + ['h_xg', 'rolling_xg_home_5']
    away_cols = [c for c in df.columns if c.endswith('Away')] + ['a_xg', 'rolling_xg_away_5']

    # 새 행에 복사
    new_row = pd.DataFrame(columns=df.columns, index=[0])

    # 식별 컬럼
    new_row.at[0, 'game_id']  = -1          # dummy
    new_row.at[0, 'date']     = pd.Timestamp.now().normalize()
    new_row.at[0, 'HomeTeam'] = home_team
    new_row.at[0, 'AwayTeam'] = away_team

    # 홈·원정 메트릭
    new_row.loc[0, home_cols] = home_row.iloc[0][home_cols]
    new_row.loc[0, away_cols] = away_row.iloc[0][away_cols]

    # ── 3) Elo 삽입 & elo_diff 계산 ─────────────────────────
    new_row.at[0, 'HomeElo']  = home_row.iloc[0]['HomeElo']
    new_row.at[0, 'AwayElo']  = away_row.iloc[0]['AwayElo']
    new_row.at[0, 'elo_diff'] = new_row.at[0, 'HomeElo'] - new_row.at[0, 'AwayElo']

    # 결측치가 남아 있으면 0 으로
    new_row.fillna(0, inplace=True)
    return new_row