<a href="https://colab.research.google.com/github/CasiCode/Market-Value-Regression/blob/main/LGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

EDA и комментарии о предобработке данных доступны в версии блокнота с CatBoost. Этот блокнот в свою очередь ориентирован чисто на построение альтернативной модели с использованием LGBM.

In [1]:
%pip install category-encoders

Collecting category-encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category-encoders
Successfully installed category-encoders-2.8.1


In [2]:
import kagglehub

In [3]:
import numpy as np

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb

from category_encoders import TargetEncoder

In [5]:
# Fetch the dataset through kagglehub and read the three tables used below.
path = kagglehub.dataset_download("davidcariboo/player-scores")

games, appearances, players = map(
    pd.read_csv,
    (f"{path}/games.csv", f"{path}/appearances.csv", f"{path}/players.csv"),
)

In [6]:
# Rows without a market value have no regression target, so they are dropped.
players = players[players['market_value_in_eur'].notna()]

In [7]:
# Attach the columns of games.csv to every appearance row. A left join keeps
# appearances even when their game_id has no match in the games table.
games_and_aps = pd.merge(appearances, games, how="left", on="game_id")

# Per-appearance columns and the reduction applied to each when rolling rows up
# to one row per player. Key order defines the output column order.
_PLAYER_AGGREGATIONS = {
    'goals': 'sum',
    'game_id': 'nunique',
    'assists': 'sum',
    'minutes_played': 'sum',
    'goals_for': 'sum',
    'goals_against': 'sum',
    'clean_sheet': 'sum',
}


def _aggregate_by_player(appearances, games_label, suffix):
    """Collapse appearance rows to one row per player and label the result columns."""
    totals = appearances.groupby(['player_id'], as_index=False).agg(_PLAYER_AGGREGATIONS)
    return totals.rename(columns={
        col: games_label if col == 'game_id' else f'{col}_{suffix}'
        for col in _PLAYER_AGGREGATIONS
    })


def player_stats(season, df):
    """Build per-player career totals plus totals for one season.

    Parameters
    ----------
    season : scalar
        Value compared against ``df['season']`` to select the single-season rows;
        it is also used as the suffix of the season columns.
    df : pandas.DataFrame
        One row per player appearance. Must contain ``player_id``, ``game_id``,
        ``season``, ``goals``, ``assists``, ``minutes_played``, ``player_club_id``,
        ``home_club_id``, ``away_club_id``, ``home_club_goals`` and
        ``away_club_goals``. The frame is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``player_id``. Career columns come first (``goals_total``,
        ``games``, ``assists_total``, ``minutes_played_total``, ``goals_for_total``,
        ``goals_against_total``, ``clean_sheet_total``), followed by the same
        measures suffixed with ``_{season}``. Players with no rows in ``season``
        get 0 in every season column.
    """
    df = df.copy()

    is_home = df['home_club_id'] == df['player_club_id']
    is_away = df['away_club_id'] == df['player_club_id']

    # Goals from the point of view of the player's club; NaN when the player's
    # club matches neither side of the fixture (sum() skips those rows).
    df["goals_for"] = np.where(is_home, df['home_club_goals'],
                               np.where(is_away, df['away_club_goals'], np.nan))
    df["goals_against"] = np.where(is_home, df['away_club_goals'],
                                   np.where(is_away, df['home_club_goals'], np.nan))

    # 1.0 when the player's club conceded nothing; an unknown side (NaN) counts as 0.0.
    df["clean_sheet"] = (df['goals_against'] == 0).astype(float)

    df_all = _aggregate_by_player(df, games_label='games', suffix='total')
    df_season = _aggregate_by_player(
        df[df['season'] == season], games_label=f'games_{season}', suffix=season
    )

    # Left join keeps every player; missing season rows are filled with 0.
    return df_all.merge(df_season, on='player_id', how='left').fillna(0)

In [8]:
# Season whose per-player totals are added next to the career totals.
season = 2023
stats = player_stats(df=games_and_aps, season=season)

# Left join: players that never occur in the appearance data keep NaN in the
# stat columns.
players = pd.merge(players, stats, how='left', on='player_id')

In [None]:
players['date_of_birth'] = pd.to_datetime(players['date_of_birth'])

# Age cannot be derived without a birth date. The explicit copy makes the
# filtered frame independent, so the column assignments below never target a
# slice of the previous frame.
players = players[players['date_of_birth'].notna()].copy()

# Single reference instant for both derived features.
# NOTE: both features depend on the date the cell is executed, so re-running the
# notebook later shifts `age` and `days_to_expire`.
now = pd.Timestamp.now()

# Age in whole years, rounded to the nearest integer.
players['age'] = ((now - players['date_of_birth']).dt.days / 365.25).round().astype(int)

# Unparseable expiration dates become NaT instead of raising.
players['contract_expiration_date'] = pd.to_datetime(players['contract_expiration_date'], errors='coerce')
days_left = (players['contract_expiration_date'] - now).dt.days

# Already expired (negative) and missing contracts are both encoded as 0 days.
players['days_to_expire'] = days_left.clip(lower=0).fillna(0).astype(int)

In [10]:
# (rows, columns) after merging the appearance stats and deriving age / days_to_expire.
players.shape

(31046, 39)

In [11]:
# Column inventory, used to decide what to drop in the next step.
players.columns.tolist()

['player_id',
 'first_name',
 'last_name',
 'name',
 'last_season',
 'current_club_id',
 'player_code',
 'country_of_birth',
 'city_of_birth',
 'country_of_citizenship',
 'date_of_birth',
 'sub_position',
 'position',
 'foot',
 'height_in_cm',
 'contract_expiration_date',
 'agent_name',
 'image_url',
 'url',
 'current_club_domestic_competition_id',
 'current_club_name',
 'market_value_in_eur',
 'highest_market_value_in_eur',
 'goals_total',
 'games',
 'assists_total',
 'minutes_played_total',
 'goals_for_total',
 'goals_against_total',
 'clean_sheet_total',
 'goals_2023',
 'games_2023',
 'assists_2023',
 'minutes_played_2023',
 'goals_for_2023',
 'goals_against_2023',
 'clean_sheet_2023',
 'age',
 'days_to_expire']

In [12]:
# Columns removed before modelling: identifiers, names, URLs and other free text,
# plus the two raw date columns that were already turned into `age` and
# `days_to_expire`.
# NOTE(review): 'highest_market_value_in_eur' is presumably dropped because it
# would leak the target - confirm.
columns_to_drop = [
    'player_id', 'first_name', 'last_name', 'name', 'last_season',
    'player_code', 'city_of_birth', 'contract_expiration_date',
    'date_of_birth', 'agent_name', 'sub_position', 'image_url', 'url',
    'current_club_name', 'highest_market_value_in_eur',
]
players = players.drop(columns=columns_to_drop)

In [13]:
# Shorter column name; this is the name the target-encoding step refers to.
column_renames = {'current_club_domestic_competition_id': 'domestic_competition_id'}
players = players.rename(columns=column_renames)

In [14]:
# Raise the column display limit so the preview shows every feature.
pd.options.display.max_columns = 500
players.head(5)

Unnamed: 0,current_club_id,country_of_birth,country_of_citizenship,position,foot,height_in_cm,domestic_competition_id,market_value_in_eur,goals_total,games,assists_total,minutes_played_total,goals_for_total,goals_against_total,clean_sheet_total,goals_2023,games_2023,assists_2023,minutes_played_2023,goals_for_2023,goals_against_2023,clean_sheet_2023,age,days_to_expire
0,398,Poland,Germany,Attack,right,184.0,IT1,1000000.0,48.0,136.0,25.0,8808.0,212.0,153.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47,0
1,16,Germany,Germany,Goalkeeper,left,190.0,L1,750000.0,0.0,152.0,0.0,13508.0,324.0,189.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45,0
2,1091,Bulgaria,Bulgaria,Attack,,,GR1,1000000.0,38.0,122.0,13.0,8788.0,140.0,145.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44,0
3,506,Brazil,Brazil,Defender,,,IT1,200000.0,0.0,4.0,0.0,307.0,11.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47,0
4,27,East Germany (GDR),Germany,Goalkeeper,right,194.0,L1,100000.0,0.0,12.0,0.0,1080.0,31.0,12.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44,0


In [15]:
# (rows, columns) after dropping the unused columns and renaming the competition id.
players.shape

(31046, 24)

Здесь дропаем две фичи из-за околонулевой важности (feature importance); код для вывода важности фич оставлен в комментариях внутри цикла по фолдам.

In [16]:
# One-hot encode both low-cardinality categoricals in a single pass. No NaN
# indicator is requested, so a row with a missing value gets all-zero indicators.
players = pd.get_dummies(players, columns=['position', 'foot'])

# Indicators dropped for near-zero feature importance. errors='ignore' keeps the
# cell runnable when the downloaded dataset snapshot does not contain one of
# these categories (the dataset version is not pinned).
players = players.drop(columns=['position_Midfield', 'position_Missing'], errors='ignore')

In [17]:
# Keep the most frequent countries and bucket the long tail as 'Other'.
# Missing countries are not among the top categories, so they become 'Other' too.
top_n = 30
for col in ['country_of_citizenship', 'country_of_birth']:
    top_categories = players[col].value_counts().nlargest(top_n).index
    players[col] = players[col].where(players[col].isin(top_categories), 'Other')

train, test = train_test_split(players, test_size=0.2, random_state=50)

# Trim target outliers on the training split only: keep rows between the 5th and
# 99th percentile (bounds inclusive). `test` keeps the raw, untrimmed target.
q_hi = train['market_value_in_eur'].quantile(0.99)
q_lo = train['market_value_in_eur'].quantile(0.05)

# Explicit copy: the log-transform below is written into an independent frame
# rather than into the result of a boolean filter.
train = train[train['market_value_in_eur'].between(q_lo, q_hi)].copy()

# The model is trained on log1p(target); metrics are later computed after expm1.
train['market_value_in_eur'] = np.log1p(train['market_value_in_eur'])

targets = train['market_value_in_eur'].to_numpy()

LGBM, в отличие от CatBoost, не так хорошо работает с категориальными фичами "из коробки". По этой причине все, что мы еще не заэнкодили OHE, мы будем таргет-энкодить - это приносит хорошие результаты и оправдано, учитывая сильное влияние фичей на таргет и большое количество уникальных значений.

За лосс возьмем Tweedie. Он рассчитан на неотрицательный таргет с тяжелым правым хвостом, а экстремально больших значений в этом датасете очень много.

TargetEncoder'ы делаем собственные для каждого фолда и каждой фичи, чтобы избежать утечки данных из таргета и переобучения модели.

In [18]:
# Prediction buffers, all in log1p space and aligned with the row order of `train`.
n_train_rows = len(train)
trained = np.zeros(n_train_rows)   # in-sample predictions
oof = np.zeros(n_train_rows)       # out-of-fold predictions
tested = np.zeros(len(test))       # sized to the hold-out split

# Quintile label of the (log) target; used only to stratify the folds.
bins = pd.qcut(train['market_value_in_eur'], labels=False, q=5)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

# Keyword arguments passed to LGBMRegressor for every fold.
model_parameters = dict(
    objective='tweedie',
    tweedie_variance_power=1.3,
    n_estimators=200,
    verbose=-1,
    reg_alpha=0.1,
    reg_lambda=0.1,
    num_leaves=26,
    early_stopping_rounds=23,
)

# The feature list is identical for every fold: every column except the target.
feat_cols = [col for col in train.columns if col != 'market_value_in_eur']

# High-cardinality categoricals that are target-encoded inside each fold.
target_encoded_cols = ['current_club_id', 'domestic_competition_id',
                       'country_of_citizenship', 'country_of_birth']

for fold_, (train_idx, val_idx) in enumerate(cv.split(train, bins), 1):
    print(f'Training with fold {fold_} started')

    lgb_model = lgb.LGBMRegressor(**model_parameters)

    train_, val = train.iloc[train_idx].copy(), train.iloc[val_idx].copy()

    # One fresh encoder per fold and per feature, fitted on the training part only
    # and then applied to the validation part, so validation targets never reach
    # the encoding.
    fold_encoders = {}
    for col in target_encoded_cols:
        encoder = TargetEncoder(cols=[col], smoothing=10)
        train_[col] = encoder.fit_transform(train_[[col]], train_['market_value_in_eur'])[col]
        val[col] = encoder.transform(val[[col]])[col]
        fold_encoders[col] = encoder

    # Keep the original per-feature names bound to the current fold's encoders.
    te_club, te_com, te_state, te_birth = (fold_encoders[col] for col in target_encoded_cols)

    # The validation part is the eval set; early stopping comes from model_parameters.
    lgb_model.fit(train_[feat_cols], train_['market_value_in_eur'],
                  eval_set=[(val[feat_cols], val['market_value_in_eur'])],
                  eval_metric=['huber'])

    # Each row is in the training part of several folds, so `trained` ends up
    # holding the in-sample prediction of the last fold model fitted on that row.
    trained[train_idx] = lgb_model.predict(train_[feat_cols])

    # Each row is in the validation part of exactly one fold.
    oof[val_idx] = lgb_model.predict(val[feat_cols])

    # Feature-importance dump (uncomment to inspect). It iterates feat_cols, not
    # train.columns, because the latter also contains the target and would shift
    # the labels against the importances.
    #for feat, importance in zip(feat_cols, lgb_model.feature_importances_):
    #  print(f'feature: {feat}, importance: {importance}')
    #print('\n')

# Back-transform from log1p space to euros before scoring.
true = np.expm1(targets)
pred = np.expm1(trained)
pred_oof = np.expm1(oof)

rmsle = mean_squared_log_error(true, pred) ** 0.5
rmsle_oof = mean_squared_log_error(true, pred_oof) ** 0.5

print('Train MAE: ', mean_absolute_error(true, pred))
print('CV MAE: ', mean_absolute_error(true, pred_oof), '\n')
print('Train MAPE: ', mean_absolute_percentage_error(true, pred))
print('CV MAPE: ', mean_absolute_percentage_error(true, pred_oof), '\n')
print('TRAIN RMSLE: ', rmsle)
print('CV RMSLE: ', rmsle_oof)

# Out-of-fold error broken down by quintile of the true market value.
val_df = pd.DataFrame({
    'true': true,
    'pred': pred_oof
})

val_df['quantile'] = pd.qcut(val_df['true'], q=5)

# 'quantile' is categorical, so `observed` is passed explicitly: this avoids the
# pandas FutureWarning about its changing default and skips any empty bin, which
# the metric functions could not score anyway.
for q, group in val_df.groupby('quantile', observed=True):
    print(f"\n{q}:")
    print("  MAE: ", mean_absolute_error(group['true'], group['pred']))
    print("  MAPE:", mean_absolute_percentage_error(group['true'], group['pred']))

Training with fold 1 started
Training with fold 2 started
Training with fold 3 started
Training with fold 4 started
Training with fold 5 started
Train MAE:  463712.8248257569
CV MAE:  579707.792547966 

Train MAPE:  0.5208532708440574
CV MAPE:  0.6350068769666887 

TRAIN RMSLE:  0.6023520151980167
CV RMSLE:  0.7031266601093429

(49999.999, 100000.0]:
  MAE:  77414.5771713096
  MAPE: 1.1451627958554622

(100000.0, 200000.0]:
  MAE:  74465.7951559533
  MAPE: 0.43733734253797246

(200000.0, 350000.0]:
  MAE:  126277.29422144573
  MAPE: 0.4357198433316469

(350000.0, 1000000.0]:
  MAE:  330283.75758527074
  MAPE: 0.5114406513173796

(1000000.0, 30000000.0]:
  MAE:  2550455.9785118834
  MAPE: 0.49731601074150045


  for q, group in val_df.groupby('quantile'):
