In [1]:
import pandas as pd

# 1) Load the match table and tag every fixture with a match_id (its row
#    position) so per-team features can be mapped back onto it later.
df = pd.read_csv('dataset/Club Football Match Data (2000-2025)/Matches.csv', parse_dates=['MatchDate'])
df = df.reset_index().rename(columns={'index': 'match_id'})

# 2) Reshape to long format: one row per (match, team), with goals expressed
#    from that team's point of view.
home = pd.DataFrame({
    'match_id': df['match_id'],
    'MatchDate': df['MatchDate'],
    'team': df['HomeTeam'],
    'goals_for': df['FTHome'],
    'goals_against': df['FTAway'],
    'venue': 'Home',
})
away = pd.DataFrame({
    'match_id': df['match_id'],
    'MatchDate': df['MatchDate'],
    'team': df['AwayTeam'],
    'goals_for': df['FTAway'],
    'goals_against': df['FTHome'],
    'venue': 'Away',
})
matches_long = pd.concat([home, away], ignore_index=True)

# 3) Order each team's matches chronologically and rebuild a clean 0..n-1
#    index; the shifted series below are aligned back on this index.
matches_long = matches_long.sort_values(['team', 'MatchDate']).reset_index(drop=True)

# 4) Rolling goals scored / conceded over the previous 3 and 5 matches.
#    shift() moves every team's series down by one match so the current
#    fixture never contributes to its own feature. The shift does not depend
#    on the window size, so it is computed once instead of once per window.
team_key = matches_long['team']
shifted_gf = matches_long.groupby('team')['goals_for'].shift()
shifted_ga = matches_long.groupby('team')['goals_against'].shift()

for N in (3, 5):
    # rolling(N) uses its default min_periods, so the sum stays NaN until a
    # team has N earlier matches available.
    matches_long[f'GF{N}'] = shifted_gf.groupby(team_key).transform(
        lambda x: x.rolling(N).sum()
    )
    matches_long[f'GA{N}'] = shifted_ga.groupby(team_key).transform(
        lambda x: x.rolling(N).sum()
    )

# 5) Split the features back by venue, indexed by match_id, and suffix the
#    column names with the side they describe (GF3 -> GF3Home / GF3Away).
stat_columns = ['GF3', 'GA3', 'GF5', 'GA5']
home_stats = (
    matches_long.loc[matches_long['venue'] == 'Home']
    .set_index('match_id')[stat_columns]
    .add_suffix('Home')
)
away_stats = (
    matches_long.loc[matches_long['venue'] == 'Away']
    .set_index('match_id')[stat_columns]
    .add_suffix('Away')
)

# 6) Attach the features to the wide table by looking up each match_id.
#    Home columns are added first, then away columns.
for stats in (home_stats, away_stats):
    for col in stats.columns:
        df[col] = df['match_id'].map(stats[col])

# 7) match_id was only a join key; drop it again (optional).
df = df.drop(columns=['match_id'])

# 8) Inspect the most recent fixtures.
print(df[[
    'MatchDate', 'HomeTeam', 'AwayTeam',
    'GF3Home', 'GA3Home', 'GF3Away', 'GA3Away',
    'GF5Home', 'GA5Home', 'GF5Away', 'GA5Away'
]].tail())


  df = pd.read_csv('dataset/Club Football Match Data (2000-2025)/Matches.csv', parse_dates=['MatchDate'])


        MatchDate   HomeTeam  AwayTeam  GF3Home  GA3Home  GF3Away  GA3Away  \
230552 2025-06-01  Cartagena  Mirandes      5.0      6.0      5.0      3.0   
230553 2025-06-01    Almeria  Tenerife      3.0      2.0      0.0      2.0   
230554 2025-06-01  La Coruna     Elche      3.0      6.0      4.0      5.0   
230555 2025-06-01     Oviedo     Cadiz      3.0      1.0      6.0      2.0   
230556 2025-06-01    Cordoba  Albacete      4.0      9.0      5.0      3.0   

        GF5Home  GA5Home  GF5Away  GA5Away  
230552      7.0      9.0      8.0      4.0  
230553     10.0      3.0      2.0      4.0  
230554      9.0      9.0      6.0      6.0  
230555      6.0      2.0      9.0      6.0  
230556      9.0     12.0      9.0      9.0  


In [2]:
# Keep only the rows whose Division code is 'E0' (the Premier League
# fixtures, judging by the teams shown in the output below).
# Filter first and copy the result: this avoids duplicating the whole match
# table and makes epl_team an independent frame, so later column assignments
# on it do not trigger SettingWithCopyWarning.
epl_team = df.loc[df['Division'] == 'E0'].copy()

In [3]:
# Show the last five rows of the filtered frame.
epl_team.tail(n=5)

Unnamed: 0,Division,MatchDate,MatchTime,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,...,C_HTB,C_PHB,GF3Home,GA3Home,GF5Home,GA5Home,GF3Away,GA3Away,GF5Away,GA5Away
230506,E0,2025-05-25,16:00:00,Newcastle,Everton,1882.33,1776.2,4.0,7.0,7.0,...,0.5967,0.015,3.0,2.0,7.0,6.0,7.0,3.0,7.0,6.0
230507,E0,2025-05-25,16:00:00,Nott'm Forest,Chelsea,1799.76,1880.42,5.0,8.0,6.0,...,0.0291,0.0305,5.0,4.0,7.0,7.0,4.0,3.0,7.0,4.0
230508,E0,2025-05-25,16:00:00,Southampton,Arsenal,1557.66,1985.45,1.0,2.0,4.0,...,0.0276,0.0098,0.0,4.0,2.0,7.0,4.0,4.0,10.0,6.0
230509,E0,2025-05-25,16:00:00,Tottenham,Brighton,1784.55,1797.83,1.0,1.0,7.0,...,0.19,0.0425,1.0,5.0,3.0,12.0,6.0,3.0,11.0,9.0
230510,E0,2025-05-25,16:00:00,Wolves,Brentford,1739.45,1820.23,0.0,6.0,6.0,...,0.6641,0.0131,2.0,7.0,6.0,7.0,7.0,6.0,13.0,8.0


### 실제로 맞는 값인지 확인하기 위해 soccerdata 활용해서 확인
- 여기선 GF3Home : 지난 3경기에서 홈팀이 넣은 골을 확인해 보겠다.
- Wolves의 지난 경기결과를 가져와서 실제 값과 비교 -> 맞다.

In [4]:
import pandas as pd
import soccerdata as sd

# ────────────────────────────────────────────────────────────────────────────
# 1) Create the FBref reader for the 2024/25 Premier League season.
#    no_cache=True is passed so the reader is asked not to reuse cached data.
fbref = sd.FBref(leagues="ENG-Premier League", seasons="2425", no_cache=True)

# ────────────────────────────────────────────────────────────────────────────
# 2) 함수 정의: 마지막 n경기 득실 정보 반환
def get_last_n_matches_goal_diff(fbref, team_name: str, n: int) -> pd.DataFrame:
    """Return goals for/against in a team's last ``n`` played matches.

    Args:
        fbref: Object whose ``read_schedule()`` returns a DataFrame with
            ``date``, ``home_team``, ``away_team`` and ``score`` columns
            (for example the ``sd.FBref`` reader created in this notebook).
        team_name: Team to look up. It is matched case-insensitively as a
            literal substring of the home or away team name.
        n: Number of most recent matches with a parsable score to return.

    Returns:
        A DataFrame ordered oldest to newest with the columns ``date``,
        ``opponent``, ``venue`` ("Home"/"Away"), ``goals_for``,
        ``goals_against`` and ``goal_diff``. When no match qualifies a
        warning is printed and an empty DataFrame is returned.
    """
    # Full schedule with the index flattened into ordinary columns.
    sched = fbref.read_schedule().reset_index()

    team_lower = team_name.lower()

    # regex=False: the team name is plain text, not a pattern. This keeps the
    # filter consistent with the literal home/away test further down and
    # prevents names containing '.', '(' or '+' from over-matching or raising.
    plays_home = sched["home_team"].str.lower().str.contains(
        team_lower, regex=False, na=False
    )
    plays_away = sched["away_team"].str.lower().str.contains(
        team_lower, regex=False, na=False
    )
    tm = sched.loc[plays_home | plays_away].copy()

    # Parse dates and put the matches in chronological order.
    tm["date"] = pd.to_datetime(tm["date"])
    tm = tm.sort_values("date")

    # Normalise the en dash used in the score column to a plain hyphen.
    tm["score"] = tm["score"].str.replace("–", "-", regex=False)

    # Drop fixtures without a "<digits>-<digits>" score (e.g. not played yet).
    tm = tm[tm["score"].str.contains(r"\d+\s*-\s*\d+", na=False)].copy()

    # Split the score into home and away goals.
    goals = tm["score"].str.extract(r"(\d+)\s*-\s*(\d+)")
    tm["home_goals"] = pd.to_numeric(goals[0], errors="coerce")
    tm["away_goals"] = pd.to_numeric(goals[1], errors="coerce")

    last_n = tm.tail(n)
    if last_n.empty:
        print("⚠️ 마지막 n경기에 해당하는 데이터가 없습니다.")
        return pd.DataFrame()

    # A row is a home match when the name matches the home side; if it
    # matches both sides the row is treated as a home match.
    at_home = last_n["home_team"].str.lower().str.contains(
        team_lower, regex=False, na=False
    )
    goals_for = last_n["home_goals"].where(at_home, last_n["away_goals"])
    goals_against = last_n["away_goals"].where(at_home, last_n["home_goals"])

    result = pd.DataFrame({
        "date": last_n["date"].dt.date,
        "opponent": last_n["away_team"].where(at_home, last_n["home_team"]),
        "venue": at_home.map({True: "Home", False: "Away"}),
        "goals_for": goals_for.astype(int),
        "goals_against": goals_against.astype(int),
        "goal_diff": (goals_for - goals_against).astype(int),
    })
    return result.reset_index(drop=True)

# ────────────────────────────────────────────────────────────────────────────
# 3) Usage example: fetch Wolves' most recent results.
#    n = 4 returns the final fixture (vs Brentford) plus the three matches
#    before it; those three are the ones GF3Home / GA3Home summarise for the
#    Wolves row shown earlier.
team_name, n = "Wolves", 4
df_last3 = get_last_n_matches_goal_diff(fbref, team_name, n)
print(df_last3)


         date         opponent venue  goals_for  goals_against  goal_diff
0  2025-05-02  Manchester City  Away          0              1         -1
1  2025-05-10         Brighton  Home          0              2         -2
2  2025-05-20   Crystal Palace  Away          2              4         -2
3  2025-05-25        Brentford  Home          1              1          0
