In [2]:
import pandas as pd
import numpy as np
import json
import warnings
from tqdm import tqdm
import os
from datetime import datetime, timedelta
warnings.simplefilter(action='ignore', category=FutureWarning)

#### 선수가 트레이드 직전/직후 속했던 팀의 스탯만 정제

In [2]:
# Load the 2020s trade records (JSON).
# NOTE(review): no explicit encoding is passed here, unlike the team-history
# files opened later with 'utf-8-sig' — confirm the platform default is right.
with open("player_moves/trade_list_2020s.json") as f:
    trades = json.load(f)

# Load the outcomes of traded draft picks for the 2020s.
# keep_default_na=False stops pandas from turning empty cells into NaN, so a
# missing statizId can be detected later by comparing against ''.
draft_tickets = pd.read_csv(
    "player_moves/draft_tickets_2020s.csv",
    keep_default_na=False,
)

# Load the basic profile of the traded players; statizId is cast to str so it
# can be compared with the string ids used elsewhere in the notebook.
player_info = pd.read_csv(
    "player_stats/player_basic_info.csv",
    index_col=0,
).astype({"statizId": str})

In [3]:
# Preview the first three rows to check the loaded columns.
player_info.head(3)

Unnamed: 0,statizId,이름,팀,주포지션,투타,생년월일,출신학교,신인지명,활약년도,활약팀
0,10404,장영석,KIA,3B,우투우타,1990년 05월 14일,신도초-성남중-부천고-방송통신대,09 히어로즈 2차 1라운드 3순위,2009년 ~ 2021년,넥센-KIA
1,11260,박준태,키움,CF,우투좌타,1991년 07월 26일,대연초-부산중-개성고-인하대,14 KIA 2차 6라운드 61순위,2014년 ~ 2023년,KIA-키움
2,13094,추재현,롯데,RF,좌투좌타,1999년 02월 22일,성동구-건대부중-신일고,18 넥센 2차 3라운드 28순위,2018년 ~ 2024년,키움-롯데


In [4]:
def compare_dates(date1, date2):
    """Return True when ``date1`` is strictly earlier than ``date2``.

    Both arguments are strings in ``YYYY-MM-DD`` form; ``datetime.strptime``
    raises ``ValueError`` for anything that does not match that pattern.
    """
    # Parse both strings (in argument order) and compare the datetime objects.
    first, second = (datetime.strptime(text, "%Y-%m-%d") for text in (date1, date2))
    return first < second

In [5]:
def before_after_none(ser, before_start, before_end, after_start, after_end):
    """Label each value of ``ser`` as "before", "after" or None.

    A value inside the inclusive range [before_start, before_end] becomes
    "before"; otherwise a value inside [after_start, after_end] becomes
    "after"; everything else becomes None. The "before" range wins when the
    two ranges overlap.
    """
    def label(day):
        # `&` (not `and`) so both bounds are always evaluated.
        if (before_start <= day) & (day <= before_end):
            return "before"
        if (after_start <= day) & (day <= after_end):
            return "after"
        return None

    return ser.apply(label)

In [6]:
def stats_filtering(trade, name, player_id, game_type, player_type, before_start, before_end, after_start, after_end):
    """Tag a player's daily stats with the pre-/post-trade team window and save them.

    Reads ``player_stats/daily_stats/{game_type}/{player_type}_stats/
    {name}_{player_id}_dailyStats.csv``, parses the "날짜" column as dates, adds
    a "trade_team" column ("before" / "after" / None, see ``before_after_none``)
    and writes the result under ``player_stats_processed/...`` with the trade id
    added to the file name.

    Parameters:
        trade: trade record; only ``trade['id']`` is used (for the output name).
        name, player_id: identify the player's stats file.
        game_type: directory name of the game type (callers pass "regular" or
            "postseason").
        player_type: "batting" or "pitching"; selects the ``*_stats`` directory.
        before_start, before_end: inclusive date range labelled "before".
        after_start, after_end: inclusive date range labelled "after".

    Returns:
        None. The function stays best-effort: failures are reported on stdout
        instead of being raised, so one bad file does not stop the caller's loop.
    """
    file_name = f"{name}_{player_id}_dailyStats.csv"
    source_path = f"player_stats/daily_stats/{game_type}/{player_type}_stats/{file_name}"
    output_dir = f"player_stats_processed/daily_stats/{game_type}/{player_type}_stats"
    try:
        daily_df = pd.read_csv(source_path, index_col=0)
        daily_df["날짜"] = pd.to_datetime(daily_df["날짜"])
        daily_df["trade_team"] = \
            before_after_none(daily_df["날짜"], before_start, before_end, after_start, after_end)
        # Create the output directory on demand; previously a missing directory
        # was reported as a missing *input* file.
        os.makedirs(output_dir, exist_ok=True)
        daily_df.to_csv(f"{output_dir}/{name}_{player_id}_{trade['id']}_dailyStats.csv")
    except FileNotFoundError:
        # Label the message with the actual player type instead of a fixed "[batting]".
        print(f"[{player_type}] {file_name} 파일이 없습니다.")
        return
    except Exception as err:
        # Any other failure (bad columns, unparsable dates, write errors) is
        # reported as what it is rather than as a missing file.
        print(f"[{player_type}] {file_name} 처리 중 오류가 발생했습니다: {err!r}")
        return

#### 날짜별 성적 필터링 - 정규시즌

In [7]:
# Primary-position codes that send a player to the batting stats; any other
# position is handled as a pitcher by the filtering loops below.
# NOTE(review): "DH" is not listed, so a player whose primary position is "DH"
# would take the pitching branch — confirm against the position values in the data.
batter = [
    "1B", "2B", "3B", "SS",
    "LF", "CF", "RF",
    "C",
]

# strptime pattern for the trade and team-history dates.
date_format = "%Y-%m-%d"

In [8]:
# For every traded asset that maps to a player, label the player's regular-season
# daily stats with the team stint right before / right after the trade.
for trade in tqdm(trades):
    trade_date = trade["date"]
    # Parse the trade date once per trade instead of once per history entry.
    trade_day = datetime.strptime(trade_date, date_format)
    for x in trade["playerA"] + trade["playerB"]:
        # (1) Resolve the player's name and statiz id
        if x["type"] == "money":
            # Cash has no stats to filter.
            continue
        elif x["type"] == "draft":
            # Look up the player recorded in draft_tickets for this trade id and
            # draft round; .item() raises ValueError unless exactly one row matches.
            pick = draft_tickets.loc[
                (draft_tickets["id"] == trade["id"])
                & (draft_tickets["지명라운드"] == x["round"])
            ]
            player_id = pick["statizId"].item()
            name = pick["선수"].item()
            if player_id == '':
                # No statiz id recorded for this pick.
                continue
        else:
            name = x["name"]
            player_id = x["statizId"]
        position = player_info.loc[
            (player_info["이름"] == name) & (player_info["statizId"] == player_id),
            "주포지션",
        ].item()

        # (2) Load the player's team history
        try:
            team_history = f"player_moves/{name}_{player_id}_team_history.json"
            with open(team_history, encoding='utf-8-sig') as f:
                moves = json.load(f)
        except (OSError, ValueError):
            # OSError: missing or unreadable file; ValueError: invalid JSON or encoding.
            print(f"{name}_{player_id} 선수의 팀 이적 정보가 확인되지 않습니다.")
            continue

        # The first stint that starts on or after the trade date is treated as
        # the post-trade team.
        history = moves["team_history"]
        for idx, move in enumerate(history):
            after_start = datetime.strptime(move["start_date"], date_format)
            if trade_day <= after_start:
                break
        else:
            # No stint starts on/after the trade date. Skip the player instead of
            # reusing the windows left over from the previous player.
            print(f"{name}_{player_id} 선수의 {trade_date} 트레이드 이후 소속 팀 정보가 확인되지 않습니다.")
            continue

        if idx > 0:
            before_start = datetime.strptime(history[idx - 1]["start_date"], date_format)
        else:
            # No earlier stint: make the "before" window empty explicitly rather
            # than letting index -1 wrap around to the last entry.
            before_start = after_start
        before_end = after_start - timedelta(days=1)
        if idx + 1 < len(history):
            # The post-trade stint ends the day before the next one starts.
            after_end = datetime.strptime(history[idx + 1]["start_date"], date_format) - timedelta(days=1)
        else:
            # Latest stint: treat it as still running.
            after_end = datetime.today()

        # (3) Keep only the stints right before / right after the trade
        player_type = "batting" if position in batter else "pitching"
        stats_filtering(trade, name, player_id, "regular", player_type, before_start, before_end, after_start, after_end)

100%|██████████| 40/40 [00:01<00:00, 25.43it/s]


#### 날짜별 성적 필터링 - 포스트시즌

In [9]:
# For every traded asset that maps to a player, label the player's postseason
# daily stats with the team stint right before / right after the trade.
for trade in tqdm(trades):
    trade_date = trade["date"]
    # Parse the trade date once per trade instead of once per history entry.
    trade_day = datetime.strptime(trade_date, date_format)
    for x in trade["playerA"] + trade["playerB"]:
        # (1) Resolve the player's name and statiz id
        if x["type"] == "money":
            # Cash has no stats to filter.
            continue
        elif x["type"] == "draft":
            # Look up the player recorded in draft_tickets for this trade id and
            # draft round; .item() raises ValueError unless exactly one row matches.
            pick = draft_tickets.loc[
                (draft_tickets["id"] == trade["id"])
                & (draft_tickets["지명라운드"] == x["round"])
            ]
            player_id = pick["statizId"].item()
            name = pick["선수"].item()
            if player_id == '':
                # No statiz id recorded for this pick.
                continue
        else:
            name = x["name"]
            player_id = x["statizId"]
        position = player_info.loc[
            (player_info["이름"] == name) & (player_info["statizId"] == player_id),
            "주포지션",
        ].item()

        # (2) Load the player's team history
        try:
            team_history = f"player_moves/{name}_{player_id}_team_history.json"
            with open(team_history, encoding='utf-8-sig') as f:
                moves = json.load(f)
        except (OSError, ValueError):
            # OSError: missing or unreadable file; ValueError: invalid JSON or encoding.
            print(f"{name}_{player_id} 선수의 팀 이적 정보가 확인되지 않습니다.")
            continue

        # The first stint that starts on or after the trade date is treated as
        # the post-trade team.
        history = moves["team_history"]
        for idx, move in enumerate(history):
            after_start = datetime.strptime(move["start_date"], date_format)
            if trade_day <= after_start:
                break
        else:
            # No stint starts on/after the trade date. Skip the player instead of
            # reusing the windows left over from the previous player.
            print(f"{name}_{player_id} 선수의 {trade_date} 트레이드 이후 소속 팀 정보가 확인되지 않습니다.")
            continue

        if idx > 0:
            before_start = datetime.strptime(history[idx - 1]["start_date"], date_format)
        else:
            # No earlier stint: make the "before" window empty explicitly rather
            # than letting index -1 wrap around to the last entry.
            before_start = after_start
        before_end = after_start - timedelta(days=1)
        if idx + 1 < len(history):
            # The post-trade stint ends the day before the next one starts.
            after_end = datetime.strptime(history[idx + 1]["start_date"], date_format) - timedelta(days=1)
        else:
            # Latest stint: treat it as still running.
            after_end = datetime.today()

        # (3) Keep only the stints right before / right after the trade
        player_type = "batting" if position in batter else "pitching"
        stats_filtering(trade, name, player_id, "postseason", player_type, before_start, before_end, after_start, after_end)

100%|██████████| 40/40 [00:00<00:00, 51.25it/s]


In [49]:
# Old team labels and the label each one is rewritten to in the "Team" column.
team_renames = {"넥센": "키움", "우리": "키움", "SK": "SSG"}


def _subdirs(parent):
    """Return the paths of the directories directly inside ``parent``."""
    return [
        os.path.join(parent, entry)
        for entry in os.listdir(parent)
        if os.path.isdir(os.path.join(parent, entry))
    ]


# Rewrite, in place, every CSV that sits exactly three directory levels below
# player_stats/. Files and directories are told apart explicitly, and problems
# are handled per file, so one unreadable entry no longer silently aborts the
# rest of its directory.
for level1 in _subdirs("player_stats"):
    for level2 in _subdirs(level1):
        for level3 in _subdirs(level2):
            for file_name in os.listdir(level3):
                path = os.path.join(level3, file_name)
                if not os.path.isfile(path):
                    continue
                try:
                    df = pd.read_csv(path, index_col=0)
                except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError) as err:
                    print(f"[skip] {path}: CSV를 읽을 수 없습니다 ({err!r})")
                    continue
                # Drop leftover "Unnamed: N" columns.
                df = df.loc[:, [col for col in df.columns if "Unnamed" not in str(col)]]
                if "Team" not in df.columns:
                    print(f"[skip] {path}: 'Team' 컬럼이 없습니다.")
                    continue
                # One pass over the column; labels not in team_renames are kept as they are.
                df = df.assign(Team=df["Team"].map(lambda team: team_renames.get(team, team)))
                df.to_csv(path)