In [3]:
import numpy as np
import pandas as pd
import plotly.express as px

## Добавляем признаки

In [4]:
def generate_features(df_games, df_moves):
    """Aggregate per-move data into one row of features per game.

    Parameters
    ----------
    df_games : pandas.DataFrame
        One row per game. Columns read: ``GameID``, ``WhiteElo``,
        ``BlackElo``, ``Opening``, ``ECO``.
    df_moves : pandas.DataFrame
        One row per move. Columns read: ``GameID``, ``MoveNumber``,
        ``Move`` (move text; a trailing ``?!`` / ``?`` / ``??`` marks an
        inaccuracy / mistake / blunder) and ``Eval`` (a numeric string in
        pawns, or ``#N`` / ``#-N`` for a forced mate). Rows of one game must
        appear in playing order. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        ``GameID``, ``Elo`` (floor of the mean of both ratings), ``Opening``,
        ``ECO`` plus the aggregated move features. Only games present in both
        inputs are returned (inner merge). Features that are undefined for a
        game (e.g. no knight moves) are reported as 0.
    """
    # Shallow copy: helper columns are added to a private frame object, so the
    # caller's DataFrame keeps its original columns and no data is duplicated.
    moves = df_moves.copy(deep=False)
    move_text = moves["Move"]
    move_number = moves["MoveNumber"]

    # --- Annotated errors -------------------------------------------------
    is_inaccuracy = move_text.str.endswith("?!")
    is_blunder = move_text.str.endswith("??")
    # A single "?" only: "??" also ends with "?", so blunders are excluded.
    is_mistake = move_text.str.endswith("?") & ~is_blunder
    is_bad = is_blunder | is_mistake

    moves["IsBlunder"] = is_blunder
    moves["IsMistake"] = is_mistake
    moves["IsBadMove"] = is_bad
    moves["IsOkayMove"] = ~(is_inaccuracy | is_bad)

    # --- Move number at which each kind of error happened -------------------
    # NaN everywhere else, so the per-game mean only averages the error moves.
    moves["MoveNumberBlunder"] = move_number.where(is_blunder)
    moves["MoveNumberMistake"] = move_number.where(is_mistake)
    moves["MoveNumberBadMove"] = move_number.where(is_bad)

    # --- Forced mates -------------------------------------------------------
    eval_raw = moves["Eval"]
    # na=False keeps the mask strictly boolean when an evaluation is missing.
    is_mate = eval_raw.str.startswith("#", na=False)
    # "#0" is a placeholder meaning "no mate announced" for non-mate rows.
    mate_in = eval_raw.where(is_mate, "#0").str[1:].astype(float)
    # Only mates within 5 moves are counted.
    mate_distance = mate_in.abs()
    moves["HasMate"] = (mate_distance >= 1) & (mate_distance <= 5)

    # --- Evaluation in centipawns ------------------------------------------
    eval_cp = pd.to_numeric(eval_raw, errors="coerce").multiply(100).round()
    # Clip the evaluation to +-12 pawns.
    eval_cp = eval_cp.clip(-1200, 1200)
    # A forced mate counts as the maximum advantage for the mating side.
    eval_cp = eval_cp.mask(mate_in > 0, 1200).mask(mate_in < 0, -1200)
    abs_eval = eval_cp.abs()

    moves["EvalCentipawn"] = eval_cp
    moves["AbsEval"] = abs_eval
    moves["IsEqualGame300"] = abs_eval <= 300
    moves["IsLostGame600"] = abs_eval >= 600

    # --- Evaluation swing per move -----------------------------------------
    # Diff inside each game so the first row of a game never inherits a swing
    # from the last evaluation of the previous game.
    loss = eval_cp.groupby(moves["GameID"]).diff().abs().fillna(0)
    # Moves numbered 1 are always treated as having no loss.
    loss = loss.mask(move_number == 1, 0)

    moves["CentipawnLoss"] = loss
    moves["StartCentipawnLoss15"] = loss.where(move_number <= 15)

    # --- Checks -------------------------------------------------------------
    moves["IsCheck"] = move_text.str.contains("+", regex=False)

    # --- Loss split by moving piece ----------------------------------------
    # First character of the move text: "N" for a knight; a lowercase first
    # character (a file letter) is treated as a pawn move.
    piece = move_text.str[0]
    moves["KnightCentipawnLoss"] = loss.where(piece == "N")
    moves["PawnCentipawnLoss"] = loss.where(piece.str.islower())

    # A move that did not change the evaluation is counted as a best move.
    moves["IsBestMove"] = loss == 0

    # --- Aggregate per game -------------------------------------------------
    # Output names (including the "Blunbers" spelling) are kept unchanged so
    # existing consumers of the feature files keep working.
    agg = moves.groupby("GameID").agg(
        NBlunbers=("IsBlunder", "sum"),
        NOkayMoves=("IsOkayMove", "sum"),

        MeanBlunbers=("IsBlunder", "mean"),
        MeanMistakes=("IsMistake", "mean"),
        MeanBadMoves=("IsBadMove", "mean"),
        MeanOkayMoves=("IsOkayMove", "mean"),

        MoveNumberBlunder=("MoveNumberBlunder", "mean"),
        MoveNumberMistake=("MoveNumberMistake", "mean"),
        MoveNumberBadMove=("MoveNumberBadMove", "mean"),

        MeanAbsEval=("AbsEval", "mean"),
        EvalStd=("EvalCentipawn", "std"),
        NEqualGame300=("IsEqualGame300", "sum"),
        MeanLostGame600=("IsLostGame600", "mean"),

        MeanHasMate=("HasMate", "mean"),

        MeanCentipawnLoss=("CentipawnLoss", "mean"),
        StartCentipawnLoss15=("StartCentipawnLoss15", "mean"),

        MeanChecks=("IsCheck", "mean"),

        KnightCentipawnLoss=("KnightCentipawnLoss", "mean"),
        PawnCentipawnLoss=("PawnCentipawnLoss", "mean"),

        NMoves=("MoveNumber", "max"),

        NBestMoves=("IsBestMove", "sum"),
        MeanBestMoves=("IsBestMove", "mean"),
    )

    # NaN appears when a game has no moves of the relevant kind (for example
    # no knight moves for KnightCentipawnLoss); such gaps are replaced by 0.
    agg = agg.fillna(0)

    df_features = (
        df_games
        .assign(Elo=lambda games: (games["WhiteElo"] + games["BlackElo"]) // 2)
        .loc[:, ["GameID", "Elo", "Opening", "ECO"]]
    )

    # `agg` is indexed by GameID; merging on the name matches that index level.
    df_features = df_features.merge(agg, on="GameID")

    return df_features

In [5]:
# Turn every parsed batch into a per-game feature table saved under aggregated/.
n_batches = 25
for batch in range(1, n_batches + 1):
    games_path = f"parsed/batch_{batch}_games.parquet"
    moves_path = f"parsed/batch_{batch}_moves.parquet"
    features_path = f"aggregated/batch_{batch}.parquet"

    df_games = pd.read_parquet(games_path)
    df_moves = pd.read_parquet(moves_path)

    df_features = generate_features(df_games, df_moves)
    df_features.to_parquet(features_path)

    # Progress marker: one line per finished batch.
    print(f"#{batch}")

#1
#2
#3
#4
#5
#6
#7
#8
#9
#10
#11
#12
#13
#14
#15
#16
#17
#18
#19
#20
#21
#22
#23
#24
#25


## Корзина

In [None]:
# Отфильтрованные признаки

In [None]:
# def get_quantile(a):
#     return lambda x: np.quantile(x, a)

In [None]:
# df_moves["IsInaccuracy"] = df_moves["Move"].str.endswith("?!")
# df_moves["IsBlunder"] = df_moves["Move"].str.endswith("??")
# df_moves["IsMistake"] = df_moves["Move"].str.endswith("?") & (~df_moves["Move"].str.endswith("??"))
# df_moves["IsWrongMove"] = df_moves["IsInaccuracy"] | df_moves["IsBlunder"] | df_moves["IsMistake"]
# df_moves["IsBadMove"] = df_moves["IsBlunder"] | df_moves["IsMistake"]
# df_moves["IsOkayMove"] = ~(df_moves["IsInaccuracy"] | df_moves["IsBlunder"] | df_moves["IsMistake"])

# NInaccuracies=("IsInaccuracy", "sum"),
# NBlunbers=("IsBlunder", "sum"),
# NMistakes=("IsMistake", "sum"),
# NWrongMoves=("IsWrongMove", "sum"),
# NBadMoves=("IsBadMove", "sum"),
# NOkayMoves=("IsOkayMove", "sum"),

# MeanInaccuracies=("IsInaccuracy", "mean"),
# MeanBlunbers=("IsBlunder", "mean"),
# MeanMistakes=("IsMistake", "mean"),
# MeanWrongMoves=("IsWrongMove", "mean"),
# MeanBadMoves=("IsBadMove", "mean"),
# MeanOkayMoves=("IsOkayMove", "mean")

In [None]:
# df_moves["MoveNumberInaccuracy"] = where(df_moves["IsInaccuracy"], df_moves["MoveNumber"])
# df_moves["MoveNumberBlunder"] = where(df_moves["IsBlunder"], df_moves["MoveNumber"])
# df_moves["MoveNumberMistake"] = where(df_moves["IsMistake"], df_moves["MoveNumber"])
# df_moves["MoveNumberWrongMove"] = where(df_moves["IsWrongMove"], df_moves["MoveNumber"])
# df_moves["MoveNumberBadMove"] = where(df_moves["IsBadMove"], df_moves["MoveNumber"])
# df_moves["MoveNumberOkayMove"] = where(df_moves["IsOkayMove"], df_moves["MoveNumber"])

# MoveNumberInaccuracy=("MoveNumberInaccuracy", "mean"),
# MoveNumberBlunder=("MoveNumberBlunder", "mean"),
# MoveNumberMistake=("MoveNumberMistake", "mean"),
# MoveNumberWrongMove=("MoveNumberWrongMove", "mean"),
# MoveNumberBadMove=("MoveNumberBadMove", "mean"),
# MoveNumberOkayMove=("MoveNumberOkayMove", "mean"),

In [None]:
# df_moves["AbsEval"] = df_moves["EvalCentipawn"].abs()
# df_moves["IsEqualGame100"] = df_moves["EvalCentipawn"].abs() <= 100
# df_moves["IsEqualGame200"] = df_moves["EvalCentipawn"].abs() <= 200
# df_moves["IsEqualGame300"] = df_moves["EvalCentipawn"].abs() <= 300

# MeanAbsEval=("AbsEval", "mean"),
# NEqualGame100=("IsEqualGame100", "sum"),
# NEqualGame200=("IsEqualGame200", "sum"),
# NEqualGame300=("IsEqualGame300", "sum"),
# MeanEqualGame100=("IsEqualGame100", "mean"),
# MeanEqualGame200=("IsEqualGame200", "mean"),
# MeanEqualGame300=("IsEqualGame300", "mean"),

In [None]:
# df_moves["IsLostGame800"] = df_moves["EvalCentipawn"].abs() >= 800
# df_moves["IsLostGame700"] = df_moves["EvalCentipawn"].abs() >= 700
# df_moves["IsLostGame600"] = df_moves["EvalCentipawn"].abs() >= 600

# NLostGame800=("IsLostGame800", "sum"),
# NLostGame700=("IsLostGame700", "sum"),
# NLostGame600=("IsLostGame600", "sum"),
# MeanLostGame800=("IsLostGame800", "mean"),
# MeanLostGame700=("IsLostGame700", "mean"),
# MeanLostGame600=("IsLostGame600", "mean"),

In [None]:
# df_moves["IsCheck"] = df_moves["Move"].str.contains("+", regex=False)
# df_moves["IsCapture"] = df_moves["Move"].str.contains("x", regex=False)
# NChecks=("IsCheck", "sum")
# NCaptures=("IsCapture", "sum"),
# MeanCaptures=("IsCapture", "mean")

In [None]:
# df_moves["Piece"] = df_moves["Move"].str[0]
# df_moves["QueenCentipawnLoss"] = where(df_moves["Piece"] == "Q", df_moves["CentipawnLoss"])
# df_moves["RookCentipawnLoss"] = where(df_moves["Piece"] == "R", df_moves["CentipawnLoss"])
# df_moves["KnightCentipawnLoss"] = where(df_moves["Piece"] == "N", df_moves["CentipawnLoss"])
# df_moves["BishopCentipawnLoss"] = where(df_moves["Piece"] == "B", df_moves["CentipawnLoss"])
# df_moves["CheckCentipawnLoss"] = where(df_moves["IsCheck"], df_moves["CentipawnLoss"])
# df_moves["CaptureCentipawnLoss"] = where(df_moves["IsCapture"], df_moves["CentipawnLoss"])

# QueenCentipawnLoss=("QueenCentipawnLoss", "mean"),
# RookCentipawnLoss=("RookCentipawnLoss", "mean"),
# KnightCentipawnLoss=("KnightCentipawnLoss", "mean"),
# BishopCentipawnLoss=("BishopCentipawnLoss", "mean"),
# CheckCentipawnLoss=("CheckCentipawnLoss", "mean"),
# CaptureCentipawnLoss=("CaptureCentipawnLoss", "mean"),

In [None]:
# df_moves["EarlyCentipawnLoss10"] = where(df_moves["MoveNumber"] <= 10, df_moves["CentipawnLoss"])
# df_moves["EarlyCentipawnLoss20"] = where(df_moves["MoveNumber"] <= 20, df_moves["CentipawnLoss"])
# df_moves["EarlyCentipawnLoss30"] = where(df_moves["MoveNumber"] <= 30, df_moves["CentipawnLoss"])
# df_moves["EarlyCentipawnLoss40"] = where(df_moves["MoveNumber"] <= 40, df_moves["CentipawnLoss"])
# df_moves["EarlyCentipawnLoss50"] = where(df_moves["MoveNumber"] <= 50, df_moves["CentipawnLoss"])
# df_moves["EarlyCentipawnLoss60"] = where(df_moves["MoveNumber"] <= 60, df_moves["CentipawnLoss"])

# EarlyCentipawnLoss30=("EarlyCentipawnLoss30", "mean"),
# EarlyCentipawnLoss40=("EarlyCentipawnLoss40", "mean"),
# EarlyCentipawnLoss50=("EarlyCentipawnLoss50", "mean"),
# EarlyCentipawnLoss60=("EarlyCentipawnLoss60", "mean"),

In [None]:
# df_moves["EarlyCentipawnLoss5"] = where(df_moves["MoveNumber"] <= 5, df_moves["CentipawnLoss"])
# df_moves["EarlyCentipawnLoss10"] = where(df_moves["MoveNumber"] <= 10, df_moves["CentipawnLoss"])
# df_moves["EarlyCentipawnLoss15"] = where(df_moves["MoveNumber"] <= 15, df_moves["CentipawnLoss"])
# df_moves["MiddleCentipawnLoss1020"] = where((df_moves["MoveNumber"] > 10) & (df_moves["MoveNumber"] <= 20), df_moves["CentipawnLoss"])

# EarlyCentipawnLoss10=("EarlyCentipawnLoss10", "mean"),
# MiddleCentipawnLoss1020=("MiddleCentipawnLoss1020", "mean"),
# EarlyCentipawnLoss5=("EarlyCentipawnLoss5", "mean"),
# EarlyCentipawnLoss15=("EarlyCentipawnLoss15", "mean"),   