# In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

def load_match_data(path):
    """Load a match CSV and derive the xG difference and the result label.

    Reads ``path`` with ``pandas.read_csv`` and adds two columns:

    * ``xG_diff`` -- ``h_xg`` minus ``a_xg``.
    * ``result`` -- 2 when ``h_goals`` > ``a_goals`` (home win), 0 when
      ``h_goals`` < ``a_goals`` (away win), and 1 otherwise (draw).

    Args:
        path: Anything ``pandas.read_csv`` accepts. The data must provide
            the columns ``h_xg``, ``a_xg``, ``h_goals`` and ``a_goals``.

    Returns:
        The loaded DataFrame with ``xG_diff`` and ``result`` added.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = pd.read_csv(path)
    df["xG_diff"] = df["h_xg"] - df["a_xg"]

    # Column-wise comparisons instead of a row-wise ``apply``: one pass per
    # column, and a header-only file simply yields an empty ``result``
    # column instead of tripping over the assignment.
    home_win = df["h_goals"] > df["a_goals"]
    away_win = df["h_goals"] < df["a_goals"]

    # Start every row at 1 (draw); a home win lifts it to 2 and an away win
    # drops it to 0.
    # NOTE(review): a row with a missing score compares False both ways and
    # is therefore labelled 1 (draw) -- confirm that is the intended handling.
    df["result"] = 1 + home_win.astype("int64") - away_win.astype("int64")
    return df

def train_model(df):
    """Fit a random forest on the xG columns and score it on a hold-out split.

    Args:
        df: DataFrame providing the feature columns ``h_xg``, ``a_xg`` and
            ``xG_diff`` plus the target column ``result``.

    Returns:
        A ``(classifier, accuracy, report)`` tuple: the fitted
        ``RandomForestClassifier``, the accuracy on the held-out 20% of the
        rows, and the text of ``classification_report`` with 3 digits.
    """
    feature_columns = ["h_xg", "a_xg", "xG_diff"]
    features = df[feature_columns]
    labels = df["result"]

    # 80/20 split; the fixed seed keeps the partition reproducible.
    split = train_test_split(features, labels, test_size=0.2, random_state=42)
    train_features, test_features, train_labels, test_labels = split

    model = RandomForestClassifier(random_state=42)
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)

    return (
        model,
        accuracy_score(test_labels, predictions),
        classification_report(test_labels, predictions, digits=3),
    )

def main(input_path="2021-2022_xG.csv", output_path="epl_2021.csv"):
    """Run the pipeline: load the matches, save them, train and report.

    Args:
        input_path: CSV passed to ``load_match_data``. Defaults to the
            file name that used to be hard-coded here.
        output_path: Destination for the labelled data, written without the
            index. Defaults to the file name that used to be hard-coded here.
    """
    df = load_match_data(input_path)

    df.to_csv(output_path, index=False)
    print(f"[✅ 파일 저장됨] {output_path}")

    # The fitted classifier is not used here; only the metrics are printed.
    _, acc, report = train_model(df)
    print(f"\n✅ 모델 정확도: {acc:.3f}\n")
    print(report)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()