In [23]:
import numpy as np
import pandas as pd
import plotly.express as px

# Let wide frames render every column instead of eliding the middle ones.
pd.set_option("display.max_columns", 1000)

## Объединим файлы

In [24]:
# Batches 1..22 (inclusive) are stacked into the training frame.
df_train = pd.concat(
    [
        pd.read_parquet(f"aggregated/batch_{batch_no}.parquet")
        for batch_no in range(1, 22 + 1)
    ]
)

# Batches 23..25 (inclusive) are stacked into the test frame.
df_test = pd.concat(
    [
        pd.read_parquet(f"aggregated/batch_{batch_no}.parquet")
        for batch_no in range(23, 25 + 1)
    ]
)

## Посмотрим на признак

In [25]:
# Display the column labels of the training frame.
df_train.columns

Index(['GameID', 'Elo', 'Opening', 'ECO', 'NBlunbers', 'NOkayMoves',
       'MeanBlunbers', 'MeanMistakes', 'MeanBadMoves', 'MeanOkayMoves',
       'MoveNumberBlunder', 'MoveNumberMistake', 'MoveNumberBadMove',
       'MeanAbsEval', 'EvalStd', 'NEqualGame300', 'MeanLostGame600',
       'MeanHasMate', 'MeanCentipawnLoss', 'StartCentipawnLoss15',
       'MeanChecks', 'KnightCentipawnLoss', 'PawnCentipawnLoss', 'NMoves',
       'NBestMoves', 'MeanBestMoves'],
      dtype='object')

In [26]:
# Bucket MeanBestMoves into 0.02-wide bins and clamp the bin ids to [0, 22].
feature = df_train["MeanBestMoves"].floordiv(0.02).clip(lower=0, upper=22)

In [27]:
# Number of rows per bin, ordered by bin id.
fig = px.line(feature.value_counts().sort_index())

# Draw a marker on every bin in addition to the connecting line.
fig.data[0].update(mode="lines+markers")

fig.update_layout(template="plotly_white", showlegend=False).show()

In [28]:
# Mean Elo of the rows falling into each bin of the inspected feature.
fig = px.line(df_train.groupby(feature).agg({"Elo": "mean"}))

# Draw a marker on every bin in addition to the connecting line.
fig.data[0].update(mode="lines+markers")

fig.update_layout(template="plotly_white", showlegend=False).show()

## Сохраняем выводы

In [29]:
# Binning rule per column: (bin width, lowest bin id, highest bin id).
# A bin width of None means the raw value is only clipped, not floor-divided.
_BIN_SPECS = {
    # Errors
    "NBlunbers": (None, 0, 20),
    "NOkayMoves": (5, 0, 30),
    "MeanBlunbers": (0.01, 0, 40),
    "MeanMistakes": (0.01, 0, 35),
    "MeanBadMoves": (0.01, 0, 50),
    "MeanOkayMoves": (0.01, 40, 95),
    # Average move number of the errors
    "MoveNumberBlunder": (2, 0, 25),
    "MoveNumberMistake": (3, 0, 15),
    "MoveNumberBadMove": (3, 0, 15),
    # Eval
    "MeanAbsEval": (15, 0, 65),
    "EvalStd": (50, 0, 18),
    "NEqualGame300": (3, 0, 30),
    "MeanLostGame600": (0.05, 0, 18),
    # Centipawn losses
    "MeanCentipawnLoss": (10, 0, 22),
    "StartCentipawnLoss15": (8, 0, 30),
    "KnightCentipawnLoss": (20, 0, 16),
    "PawnCentipawnLoss": (10, 1, 20),
    # Other
    "MeanHasMate": (0.05, 0, 8),
    "MeanChecks": (0.02, 0, 12),
    "NMoves": (5, 0, 20),
    "MeanBestMoves": (0.02, 0, 22),
}


def bin_features(df):
    """Split the features into bins (to reduce overfitting).

    Each column listed in ``_BIN_SPECS`` is floor-divided by its bin width
    (unless the width is None) and the resulting bin id is clipped to the
    column's ``[lowest, highest]`` range. Columns not listed are passed
    through unchanged.

    The input frame is NOT modified: the work is done on a copy, so calling
    this function repeatedly on the same frame (e.g. when a cell is re-run)
    always bins the raw values rather than already-binned ones.

    Note: floor division by a float width is subject to binary floating-point
    rounding (e.g. ``0.3 // 0.1 == 2.0``), so a value lying exactly on a bin
    edge may land in the lower bin.

    Args:
        df: DataFrame containing every column named in ``_BIN_SPECS``.

    Returns:
        A new DataFrame with the same columns and index, where the listed
        columns hold bin ids instead of raw values.

    Raises:
        KeyError: if any column named in ``_BIN_SPECS`` is missing from ``df``.
    """
    # Copy first so the caller's frame keeps its raw values.
    binned = df.copy()

    for column, (width, lowest, highest) in _BIN_SPECS.items():
        values = binned[column]
        if width is not None:
            values = values // width
        binned[column] = values.clip(lowest, highest)

    return binned

In [30]:
# Bin each split with the same rules and write the result to its parquet file.
for frame, target in (
    (df_train, "datasets/binned_train.parquet"),
    (df_test, "datasets/binned_test.parquet"),
):
    bin_features(frame).to_parquet(target)