### 승부예측을 위해 데이터를 수집하는 코드
- 3개의 파라미터가 필요 (EPL) 
    1. 경기일자
    2. 팀1
    3. 팀2

In [4]:
import pandas as pd
import soccerdata as sd

class DataCollector:
    """Collect pre-match features (recent goals, form, Elo) for one fixture.

    Match results are read from the FBref schedule and ratings from ClubElo,
    both through ``soccerdata``. The schedule is assumed to expose ``date``,
    ``home_team``, ``away_team`` and ``score`` columns after ``reset_index()``.
    """

    # Column layout of the frames returned by get_last_n_matches_goal_diff.
    # It is kept even when there are no rows so callers can index safely.
    _RESULT_COLUMNS = ["date", "opponent", "goals_for", "goals_against", "goal_diff"]

    def __init__(self):
        self.fbref = sd.FBref(
            leagues="ENG-Premier League",
            seasons="2425",  # 2024-2025 season
            no_cache=True
        )
        self.clubelo = sd.ClubElo()
        # Alias -> name used for the Elo lookup. Every entry currently maps to
        # itself; unknown aliases fall through unchanged in get_team_elo.
        self.team_name_map = {
            "Arsenal": "Arsenal",
            "Aston Villa": "Aston Villa",
            "Bournemouth": "Bournemouth",
            "Brentford": "Brentford",
            "Brighton": "Brighton",
            "Chelsea": "Chelsea",
            "Crystal Palace": "Crystal Palace",
            "Everton": "Everton",
            "Fulham": "Fulham",
            "Leeds United": "Leeds United",
            "Liverpool": "Liverpool",
            "Man City": "Man City",
            "Man Utd": "Man Utd",
            "Newcastle": "Newcastle",
            "Nottingham Forest": "Nottingham Forest",
            "Southampton": "Southampton",
            "Spurs": "Spurs",
            "West Ham": "West Ham",
            "Wolves": "Wolves"
        }

    def get_team_elo(self, team_alias, match_date):
        """Return the team's Elo rating on the day before ``match_date``.

        Args:
            team_alias: Team name; translated through ``team_name_map`` when
                an entry exists, otherwise used as given.
            match_date: Anything ``pd.to_datetime`` accepts.

        Returns:
            The ``elo`` value of the first row whose ``team`` matches
            case-insensitively, or ``None`` when loading fails or no row
            matches (a message is printed in both cases).
        """
        official_name = self.team_name_map.get(team_alias, team_alias)
        # Use the previous day's table so the rating cannot already include
        # the match being predicted.
        day_before = pd.to_datetime(match_date) - pd.Timedelta(days=1)

        try:
            df = self.clubelo.read_by_date(date=day_before).reset_index()
        except Exception as e:
            # Deliberate best effort: a missing rating must not abort the run.
            print(f"Elo 데이터 로딩 실패: {e}")
            return None

        filtered = df[df["team"].str.lower() == official_name.lower()]
        if filtered.empty:
            print(f"Elo 데이터 없음: {official_name} on {day_before.date()}")
            return None
        return filtered.iloc[0]["elo"]

    def get_last_n_matches_goal_diff(self, team_name, n, before=None):
        """Return the team's last ``n`` played matches, oldest first.

        Args:
            team_name: Case-insensitive substring of the schedule team name.
            n: Maximum number of matches to return.
            before: Optional cutoff; when given, only matches played strictly
                before that calendar day are considered. The cutoff is applied
                *before* taking the last ``n`` rows, so the window is correct
                for fixtures in the middle of the season too.

        Returns:
            DataFrame with columns ``date``, ``opponent``, ``goals_for``,
            ``goals_against`` and ``goal_diff``. It is empty (but keeps those
            columns) when no played match qualifies.
        """
        sched = self.fbref.read_schedule().reset_index()
        team_lower = team_name.lower()

        # regex=False: the team name is literal text, not a pattern.
        involved = (
            sched["home_team"].str.lower().str.contains(team_lower, na=False, regex=False)
            | sched["away_team"].str.lower().str.contains(team_lower, na=False, regex=False)
        )
        tm = sched.loc[involved].copy()

        tm["date"] = pd.to_datetime(tm["date"])
        tm = tm.sort_values("date")

        # astype(str) keeps the .str accessor usable when no match has a score
        # yet (all-NaN column); such rows are dropped by the pattern below.
        tm["score"] = tm["score"].astype(str).str.replace("–", "-", regex=False)
        tm = tm[tm["score"].str.contains(r"\d+\s*-\s*\d+", na=False)].copy()

        goals = tm["score"].str.extract(r"(\d+)\s*-\s*(\d+)")
        tm["home_goals"] = pd.to_numeric(goals[0], errors="coerce")
        tm["away_goals"] = pd.to_numeric(goals[1], errors="coerce")

        if before is not None:
            cutoff = pd.to_datetime(before).date()
            # A row without a date cannot be shown to precede the cutoff.
            tm = tm[tm["date"].notna()]
            tm = tm[tm["date"].dt.date < cutoff]

        tm = tm.tail(n)

        at_home = tm["home_team"].str.lower().str.contains(team_lower, na=False, regex=False)
        goals_for = tm["home_goals"].where(at_home, tm["away_goals"]).astype(int)
        goals_against = tm["away_goals"].where(at_home, tm["home_goals"]).astype(int)

        result = pd.DataFrame({
            "date": tm["date"].dt.date,
            "opponent": tm["away_team"].where(at_home, tm["home_team"]),
            "goals_for": goals_for,
            "goals_against": goals_against,
            "goal_diff": goals_for - goals_against,
        })
        return result[self._RESULT_COLUMNS].reset_index(drop=True)

    @staticmethod
    def _window_stats(results, n):
        """Return (goals for, goals against, form points) over the last ``n`` rows."""
        window = results.tail(n)
        goal_diff = window["goal_diff"]
        # Form points: 3 per win, 1 per draw, 0 per loss.
        form = 3 * (goal_diff > 0).sum() + (goal_diff == 0).sum()
        return int(window["goals_for"].sum()), int(window["goals_against"].sum()), int(form)

    def collect_features(self, match_date, home_team, away_team):
        """Build the single-row feature frame for one fixture.

        Args:
            match_date: Fixture date (anything ``pd.to_datetime`` accepts).
                Only matches played before this day feed the statistics.
            home_team: Home team name.
            away_team: Away team name.

        Returns:
            One-row DataFrame with ``HomeTeam``, ``AwayTeam``, goals for and
            against and form over the last 3 and 5 matches for each side, and
            ``HomeElo`` / ``AwayElo`` (``None`` when unavailable). Statistics
            are 0 for a team without earlier played matches.
        """
        match_date = pd.to_datetime(match_date)

        # One schedule read per team: the 3-match window is the tail of the
        # 5-match window, so both are derived from the same frame.
        home_recent = self.get_last_n_matches_goal_diff(home_team, 5, before=match_date)
        away_recent = self.get_last_n_matches_goal_diff(away_team, 5, before=match_date)

        gf3h, ga3h, form3h = self._window_stats(home_recent, 3)
        gf5h, ga5h, form5h = self._window_stats(home_recent, 5)
        gf3a, ga3a, form3a = self._window_stats(away_recent, 3)
        gf5a, ga5a, form5a = self._window_stats(away_recent, 5)

        elo_home = self.get_team_elo(home_team, match_date)
        elo_away = self.get_team_elo(away_team, match_date)

        data = {
            "HomeTeam": home_team,
            "AwayTeam": away_team,
            "GF3Home": gf3h,
            "GA3Home": ga3h,
            "GF5Home": gf5h,
            "GA5Home": ga5h,
            "GF3Away": gf3a,
            "GA3Away": ga3a,
            "GF5Away": gf5a,
            "GA5Away": ga5a,
            "Form3Home": form3h,
            "Form5Home": form5h,
            "Form3Away": form3a,
            "Form5Away": form5a,
            "HomeElo": elo_home,
            "AwayElo": elo_away
        }

        return pd.DataFrame([data])


In [5]:
# Demo: build the one-row feature frame for Wolves (home) vs Brentford (away)
# on 2025-05-25 and display it as the cell result.
co = DataCollector()

df = co.collect_features("2025-05-25", "Wolves", "Brentford")
df

Unnamed: 0,HomeTeam,AwayTeam,GF3Home,GA3Home,GF5Home,GA5Home,GF3Away,GA3Away,GF5Away,GA5Away,Form3Home,Form5Home,Form3Away,Form5Away,HomeElo,AwayElo
0,Wolves,Brentford,2,7,6,7,7,6,13,8,0,6,6,12,1732.723511,1809.577148


> - 3개의 파라미터는 사용자한테 자연어로 입력 받고 이를 처리하는 함수가 필요
>   - return 값이 경기일자, 팀1, 팀2

In [1]:
import requests
from datetime import datetime
from dateutil.parser import parse
from dotenv import load_dotenv
import os

# Load variables from a .env file into the environment, then read the RapidAPI
# key. os.getenv returns None when X_RAPIDAPI_KEY is not set.
load_dotenv()
api_key = os.getenv("X_RAPIDAPI_KEY")

def get_fixture_info(api_key, match_date, team1, team2, league=39, season=2024, timeout=10):
    """Look up which of two teams is home and which is away on a given date.

    Queries the API-Football ``/v3/fixtures`` endpoint on RapidAPI for the
    given date, league and season, and returns the first fixture whose two
    team names equal ``team1`` and ``team2`` (case-insensitive, either order).

    Args:
        api_key: RapidAPI key sent in the ``X-RapidAPI-Key`` header.
        match_date: Date string sent as the ``date`` query parameter.
        team1: Name of one team, compared against the names the API returns.
        team2: Name of the other team.
        league: League id sent to the API (39 by default — presumably the
            Premier League id in API-Football; confirm against the API docs).
        season: Season sent to the API (2024 by default).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``{"home_team": ..., "away_team": ...}`` using the names returned by
        the API, or ``None`` when no fixture matches.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
    """
    url = "https://api-football-v1.p.rapidapi.com/v3/fixtures"

    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com"
    }

    params = {
        "date": match_date,
        "league": league,
        "season": season
    }

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Surface HTTP errors (bad key, quota, ...) instead of failing later on a
    # payload that has no "response" list.
    response.raise_for_status()
    fixtures = response.json().get("response") or []

    wanted = {team1.lower(), team2.lower()}
    for fixture in fixtures:
        home_name = fixture["teams"]["home"]["name"]
        away_name = fixture["teams"]["away"]["name"]

        # Both requested names must appear among this fixture's two teams.
        if wanted <= {home_name.lower(), away_name.lower()}:
            return {
                "home_team": home_name,
                "away_team": away_name
            }

    return None


def extract_match_parameters(user_input: str) -> dict:
    """
    Turn a free-text Korean request into match_date, home_team and away_team.

    The date comes from a fuzzy dateutil parse of the whole input. Teams are
    detected by looking for known Korean club names inside the input. Which
    side is home or away is decided by get_fixture_info (API-Football).

    Returns a dict with keys "match_date", "home_team" and "away_team".
    "match_date" is None when no date could be parsed; the team values are
    None when fewer than two clubs were recognised or no fixture was found.
    """

    korean_aliases = {
        "아스널": "Arsenal", "아스톤 빌라": "Aston Villa", "본머스": "Bournemouth", "브렌트포드": "Brentford",
        "브라이턴": "Brighton", "첼시": "Chelsea", "크리스탈 팰리스": "Crystal Palace", "에버턴": "Everton",
        "풀럼": "Fulham", "리즈": "Leeds United", "리버풀": "Liverpool", "맨시티": "Man City",
        "맨체스터 시티": "Man City", "맨유": "Man Utd", "맨체스터 유나이티드": "Man Utd",
        "뉴캐슬": "Newcastle", "노팅엄 포레스트": "Nottingham Forest", "사우샘프턴": "Southampton",
        "토트넘": "Spurs", "스퍼스": "Spurs", "웨스트햄": "West Ham", "울브스": "Wolves", "울버햄튼": "Wolves"
    }

    def _result(date_text, home=None, away=None):
        # Single place that fixes the shape and key order of the return value.
        return {"match_date": date_text, "home_team": home, "away_team": away}

    # Date: give up entirely when nothing date-like can be parsed.
    try:
        parsed_day = parse(user_input, fuzzy=True).date()
    except Exception:
        return _result(None)
    date_text = str(parsed_day)

    # Teams: collect each club once, in alias-table order.
    mentioned = []
    for alias, english in korean_aliases.items():
        if english in mentioned or alias not in user_input:
            continue
        mentioned.append(english)

    if len(mentioned) < 2:
        return _result(date_text)

    # Home/away assignment is confirmed through the fixtures API.
    first, second = mentioned[0], mentioned[1]
    match = get_fixture_info(api_key, date_text, first, second)
    if not match:
        return _result(date_text)
    return _result(date_text, match["home_team"], match["away_team"])


In [2]:
# Demo: parse a Korean request that names a date and two clubs, then print
# the extracted parameters.
user_input = "5월 25일 울버햄튼과 브렌트포드 경기 알려줘"

params = extract_match_parameters(user_input)
print(params)

{'match_date': '2025-05-25', 'home_team': 'Wolves', 'away_team': 'Brentford'}


### 완성한 코드 사용 예제
- 사용자 입력을 받으면 승부예측을 위한 데이터 수집 및 데이터 프레임 생성

In [None]:
from dataCollector import DataCollector
from match_parser import extract_match_parameters

# 1. User input
user_input = "5월 25일 울버햄튼과 브렌트포드 경기"

# 2. Korean input -> extract date, home team, away team
params = extract_match_parameters(user_input)

# 3. Continue only when every value was extracted (no None entries)
if all(params.values()):
    collector = DataCollector()
    
    # 4. Run the feature collection
    df = collector.collect_features(
        match_date=params["match_date"],
        home_team=params["home_team"],
        away_team=params["away_team"]
    )
    
    print("수집된 데이터:")
    print(df)

else:
    print("날짜 또는 팀 정보를 추출하지 못했습니다.")


✅ 수집된 데이터:
  HomeTeam   AwayTeam  GF3Home  GA3Home  GF5Home  GA5Home  GF3Away  GA3Away  \
0   Wolves  Brentford        2        7        6        7        7        6   

   GF5Away  GA5Away  Form3Home  Form5Home  Form3Away  Form5Away      HomeElo  \
0       13        8          0          6          6         12  1732.723511   

       AwayElo  
0  1809.577148  


In [10]:
# List the feature column names of the collected frame.
df.columns

Index(['HomeTeam', 'AwayTeam', 'GF3Home', 'GA3Home', 'GF5Home', 'GA5Home',
       'GF3Away', 'GA3Away', 'GF5Away', 'GA5Away', 'Form3Home', 'Form5Home',
       'Form3Away', 'Form5Away', 'HomeElo', 'AwayElo'],
      dtype='object')

In [11]:
# Display the collected feature frame.
df

Unnamed: 0,HomeTeam,AwayTeam,GF3Home,GA3Home,GF5Home,GA5Home,GF3Away,GA3Away,GF5Away,GA5Away,Form3Home,Form5Home,Form3Away,Form5Away,HomeElo,AwayElo
0,Wolves,Brentford,2,7,6,7,7,6,13,8,0,6,6,12,1732.723511,1809.577148
