In [3]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from colorama import Fore, Style
import lightgbm, xgboost, catboost
import pickle

from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.model_selection import GroupKFold, GroupShuffleSplit, cross_val_score, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures, SplineTransformer
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, BaggingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error,root_mean_squared_error

# Turn pandas' chained-assignment warning into a hard error so accidental
# writes to a temporary copy fail loudly instead of being silently lost.
pd.options.mode.chained_assignment = "raise"

# Registries filled by cross_validate_model(..., save_models=True), keyed by label:
# fitted per-fold models (with their feature list) and out-of-fold predictions.
saved_models, oof_pred = {}, {}

In [4]:
train = pl.read_csv('datasets/train.csv')
print('Shape before dropping columns:', train.shape)

# Boolean mask over the columns: True where a column holds a single distinct value
is_constant = train.select(pl.all().n_unique() == 1).to_numpy().ravel()
constant_columns = np.array(train.columns)[is_constant]
print(len(constant_columns), 'columns are constant. These will be dropped.')

# The constant columns are removed together with the row identifier
drop_columns = [*constant_columns, 'Id']
train = train.drop(drop_columns)
print('Shape after dropping columns:', train.shape)

# Data-quality summary: null cells, fully duplicated rows, two-valued columns
n_missing = train.null_count().to_numpy().sum()
n_duplicates = len(train) - train.n_unique()
n_binary = train.select(pl.all().n_unique() == 2).to_numpy().sum()
print('There are', n_missing, 'missing values.')
print('There are', n_duplicates, 'duplicates.')
print('There are', n_binary, 'binary columns.')

# train

Shape before dropping columns: (233234, 814)
216 columns are constant. These will be dropped.
Shape after dropping columns: (233234, 597)
There are 0 missing values.
There are 0 duplicates.
There are 382 binary columns.


In [5]:
%%writefile preprocess.py

import polars as pl
import pandas as pd

def preprocess(df_polars):
    """Convert the polars dataframe to pandas; extract target and groups if it is the training dataframe

    The function should be applied to training and test datasets. The training
    dataframe (recognised by the presence of the 'utility_agent1' column) must
    be processed first: that call stores the categorical dtypes in the global
    `cat_mapping`, which the test-data call then reuses. If `cat_mapping` does
    not exist yet when test data is processed, the final `astype` fails with a
    NameError.

    Parameters
    df_polars: polars DataFrame (train or test)

    Return values:
    df: pandas DataFrame with all features of shape (n_samples, n_features)
    target: target array of shape (n_samples, ) or None
    groups: grouping array for GroupKFold of shape (n_samples, ) or None
    """
    global cat_mapping

    # Agent names are split by this pattern into four parts per player.
    agent_pattern = r'MCTS-(.*)-(.*)-(.*)-(.*)'
    # (regex capture group, feature suffix, cast to float?)
    agent_parts = [
        (1, 'selection', False),
        (2, 'exploration', True),
        (3, 'playout', False),
        (4, 'bounds', False),
    ]

    # Add eight features extracted from player names (four per player)
    agent_features = []
    for prefix, column in (('p1', 'agent1'), ('p2', 'agent2')):
        for group_index, suffix, is_numeric in agent_parts:
            expr = pl.col(column).str.extract(agent_pattern, group_index).alias(f'{prefix}_{suffix}')
            if is_numeric:
                expr = expr.cast(pl.Float32)
            agent_features.append(expr)

    # Drop GameRulesetName, freetext and target columns.
    # strict=False: the target columns are absent from the test data.
    df = df_polars.with_columns(*agent_features).drop(
        ['GameRulesetName', 'EnglishRules', 'LudRules',
         'num_wins_agent1', 'num_draws_agent1',
         'num_losses_agent1', 'utility_agent1'],
        strict=False
    ).to_pandas()

    if 'utility_agent1' in df_polars.columns: # Processing the training data
        # Extract the target
        target = df_polars.select('utility_agent1').to_numpy().ravel()

        # Extract the groups for the GroupKFold.
        # ravel() turns the (n_samples, 1) column into the documented 1-D array.
        groups = df_polars.select('GameRulesetName').to_numpy().ravel()

        # Set the mapping to categorical dtypes.
        # Categories are sorted so that the category -> code assignment is the
        # same in every process (set() order of strings varies between runs);
        # nulls are excluded because CategoricalDtype rejects null categories.
        cat_mapping = {
            feature: pd.CategoricalDtype(categories=sorted(df[feature].dropna().unique()))
            for feature in df.columns[df.dtypes == object]
        }
    else: # Processing the test data
        target, groups = None, None

    # Convert the strings to categorical
    df = df.astype(cat_mapping)

    return df, target, groups

Overwriting preprocess.py


In [6]:
# Run preprocess.py in the notebook namespace (rather than importing it) so that
# preprocess() and the `cat_mapping` global it sets live among the notebook globals.
# The context manager closes the file handle, which a bare open().read() leaves open.
with open('preprocess.py', 'r') as source_file:
    exec(source_file.read())
train_pd, y, groups = preprocess(train)

In [7]:
# Fixed group-wise folds shared by every call to cross_validate_model.
# The `groups` array returned by preprocess() holds the same GameRulesetName
# values as the polars column; np.ravel guarantees the 1-D shape (n_samples,)
# that scikit-learn documents for `groups`.
crossval_kf = GroupKFold()
folds = list(crossval_kf.split(train_pd, groups=np.ravel(groups)))
    
def cross_validate_model(model, features=train_pd.columns, label='', save_models=False):
    """Cross-validate `model` on the precomputed `folds` and print the RMSE per fold and overall.

    Parameters
    model: unfitted estimator; a fresh clone is fitted for every fold
    features: columns of train_pd used as model input
    label: text appended to the summary line and used as key when saving
    save_models: if True, store the fitted fold models in saved_models[label]
        and the out-of-fold predictions in oof_pred[label]

    Side effects: the out-of-fold predictions (clipped to [-1, 1]) are left in
    the global `oof`. Nothing is returned.
    """
    global oof
    t_start = datetime.datetime.now()
    oof = np.full_like(y, np.nan)
    fitted_models = []
    for fold_no, (train_idx, valid_idx) in enumerate(folds):
        estimator = clone(model)
        estimator.fit(train_pd[features].iloc[train_idx], y[train_idx])
        fold_pred = estimator.predict(train_pd[features].iloc[valid_idx]).clip(-1, 1)
        if save_models:
            fitted_models.append(estimator)
        del estimator
        oof[valid_idx] = fold_pred
        fold_rmse = root_mean_squared_error(y[valid_idx], fold_pred)
        print(f"# Fold {fold_no}: {fold_rmse:=.3f}")

    # Wall-clock duration of the whole loop, rounded to full minutes
    minutes = int(np.round((datetime.datetime.now() - t_start).total_seconds() / 60))
    overall_rmse = root_mean_squared_error(y, oof)
    print(f"{Fore.GREEN}# Overall RMSE={overall_rmse:.3f} {label}"
          f"   {minutes} min{Style.RESET_ALL}")
    if save_models:
        saved_models[label] = dict(features=features, model_list=fitted_models)
        oof_pred[label] = oof


In [8]:
%%time

# All the game features (concepts) have a ComputationTypeId, which is either 'Compiler' or 'Simulation'
concepts = pd.read_csv('datasets/concepts.csv', index_col='Id')
id_columns = ['TypeId', 'DataTypeId', 'ComputationTypeId', 'LeafNode', 'ShowOnWebsite']
concepts[id_columns] = concepts[id_columns].astype(int)
concepts = concepts.replace({'ComputationTypeId': {1: 'Compiler', 2: 'Simulation'}})

# Model input: everything except the raw agent strings, plus four features
# that combine the corresponding settings of player 1 and player 2.
features = [f for f in train_pd.columns if f not in ['agent1', 'agent2']]
X = train_pd[features].copy()
X['p_selection'] = (X.p1_selection.astype(str) + '-' + X.p2_selection.astype(str)).astype('category')
X['p_exploration'] = X.p1_exploration - X.p2_exploration
X['p_playout'] = (X.p1_playout.astype(str) + '-' + X.p2_playout.astype(str)).astype('category')
X['p_bounds'] = (X.p1_bounds.astype(str) + '-' + X.p2_bounds.astype(str)).astype('category')
display(X.head(3))

lgbm_params_fast = {'learning_rate': 0.2, 'colsample_bytree': 0.7, 'verbose': 0}
model = lightgbm.LGBMRegressor(**lgbm_params_fast)

# Only the first group-wise split is evaluated; np.ravel guarantees the 1-D
# shape (n_samples,) that scikit-learn documents for `groups`.
kf = GroupShuffleSplit(n_splits=5, random_state=1)
idx_tr, idx_va = next(kf.split(train_pd, groups=np.ravel(groups)))
X_tr, X_va = X.iloc[idx_tr], X.iloc[idx_va]
y_tr, y_va = y[idx_tr], y[idx_va]

model.fit(X_tr, y_tr)
y_pred = model.predict(X_va)
# root_mean_squared_error replaces mean_squared_error(..., squared=False),
# whose `squared` argument is deprecated and removed in recent scikit-learn.
rmse = root_mean_squared_error(y_va, y_pred)

# random_state makes the shuffles, and therefore good_features, reproducible
result = permutation_importance(model, X_va, y_va, scoring='neg_root_mean_squared_error',
                                n_repeats=2, random_state=1)

print(f"{Fore.GREEN}{Style.BRIGHT}Important features: {(result['importances_mean'] > 0).mean():.0%}   ({rmse=:.3f}){Style.RESET_ALL}")
importance_df = pd.DataFrame({'importance': result['importances_mean'],
                              'std': result['importances_std']}, index=X_va.columns).sort_values('importance', ascending=False)
# Features that are not listed in concepts.csv get no match here; they are labelled 'Player'
importance_df['ComputationTypeId'] = concepts.set_index('Name').ComputationTypeId
importance_df['ComputationTypeId'] = importance_df['ComputationTypeId'].fillna('Player')
display(importance_df.head(50))
print()

# Keep the good features for later (the combined p_* columns exist only in X, not in train_pd)
good_features = list(importance_df.query("importance > 0").index)
good_features = [f for f in good_features if f not in ['p_selection', 'p_exploration', 'p_playout', 'p_bounds']]

# Runtime: a few minutes (recorded wall time: 3min 29s)

Unnamed: 0,Stochastic,Asymmetric,AsymmetricForces,AsymmetricPiecesType,PlayersWithDirections,Cooperation,Team,Shape,SquareShape,HexShape,...,p1_playout,p1_bounds,p2_selection,p2_exploration,p2_playout,p2_bounds,p_selection,p_exploration,p_playout,p_bounds
0,0,0,0,0,0,0,0,1,0,0,...,MAST,False,ProgressiveHistory,0.6,Random200,False,ProgressiveHistory-ProgressiveHistory,-0.5,MAST-Random200,false-false
1,0,0,0,0,0,0,0,1,0,0,...,MAST,False,UCB1GRAVE,0.6,NST,True,ProgressiveHistory-UCB1GRAVE,-0.5,MAST-NST,false-true
2,0,0,0,0,0,0,0,1,0,0,...,MAST,True,UCB1,0.1,NST,False,ProgressiveHistory-UCB1,0.0,MAST-NST,true-false




[32m[1mImportant features: 40%   (rmse=0.446)[0m


Unnamed: 0,importance,std,ComputationTypeId
AdvantageP1,0.163091,0.001489434,Simulation
p_selection,0.037106,0.0005239905,Player
p_playout,0.026058,0.000490664,Player
p2_selection,0.015543,0.000238037,Player
p1_exploration,0.01508,8.896647e-05,Player
p2_exploration,0.013947,0.0001328256,Player
p_exploration,0.010905,0.0001719952,Player
PlayoutsPerSecond,0.008563,6.422172e-05,Simulation
OutcomeUniformity,0.007853,7.178726e-05,Simulation
p1_selection,0.007432,0.0001983472,Player



CPU times: user 15min 33s, sys: 3.66 s, total: 15min 37s
Wall time: 3min 29s


In [9]:
# cb_params = {'grow_policy': 'SymmetricTree', 
#              'n_estimators': 800, 
#              'learning_rate': 0.08617153230342124, 
#              'l2_leaf_reg': 1.0036132233587023, 
#              'max_depth': 10, 
#              'colsample_bylevel': 0.734514897063923, 
#              'subsample': 0.994540769511675, 
#              'random_strength': 0.5393480589423867, 
#              'verbose': False}
# model = catboost.CatBoostRegressor(**cb_params, cat_features= train_pd[good_features].columns[train_pd[good_features].dtypes == 'category'].values)
# cross_validate_model(model, features=good_features, label=f"CatBoost", save_models=True)


In [64]:
# Run DDPM.py in the notebook namespace; presumably it defines prepare_data,
# train_diffusion and predict_diffusion, which later cells call (confirm against DDPM.py).
# The context manager closes the file handle, which a bare open().read() leaves open.
with open('DDPM.py', 'r') as source_file:
    exec(source_file.read())

In [55]:
kf = GroupShuffleSplit(n_splits=5, random_state=1)
oof_predictions = np.zeros_like(y)
models, rmse_scores = [], []

# Smoke test of prepare_data: the loop is left after the first split,
# once one processed training row has been printed for inspection.
for fold, (idx_tr, idx_va) in enumerate(kf.split(train_pd, groups=groups)):
    print(f"\nFold {fold + 1}/5")
    print("-" * 50)
    X_tr, X_va = train_pd.iloc[idx_tr], train_pd.iloc[idx_va]
    y_tr, y_va = y[idx_tr], y[idx_va]
    X_train_processed, X_val_processed, _ = prepare_data(X_tr, y_tr, good_features, val_pd=X_va)
    print(X_train_processed[0])
    break




Fold 1/5
--------------------------------------------------
Final feature dimension: 245
Numerical features: 233
One-hot encoded features: 12
[-8.68539385e-02 -1.11658608e+00 -1.94434868e-01 -2.54796814e-01
 -8.17215535e-01 -6.86522632e-01 -7.82939801e-01  7.19844932e-02
  5.97947713e-01 -8.65652081e-01 -4.88440335e-01 -3.00534112e-01
 -6.47426075e-01 -4.63853804e-01  1.71008189e-01  5.19172942e-01
 -6.94064146e-01 -4.84283315e-01 -4.15705576e-01 -5.51430165e-03
 -5.97016031e-02  1.45577056e-01 -4.58098922e-02 -1.93830494e-01
 -4.80429664e-01 -3.19020148e-02  7.38969544e-03 -1.44789224e-01
  2.12808866e+00 -3.03980860e-01  7.20563240e-01 -4.25650282e-02
  1.56020356e+00 -2.45705505e-01 -4.36082482e-02 -6.76305708e-02
 -1.26578173e+00 -7.54916199e-01 -2.85642219e-01 -3.21832663e-02
  1.63165036e+00  5.00260840e-01  2.08736394e-02 -3.27674587e-02
 -3.82369434e-01 -4.43589413e-03 -1.52146416e-01  1.30869910e+00
  4.79142796e-01 -1.63821876e-02 -9.69807447e-02 -8.61980677e-01
 -9.11901118

In [66]:

import torch

model_params = {
    'hidden_dim': 1024
}


train_params = {
    'batch_size': 4096,
    'n_steps': 200,
    'n_epochs': 200,
    'learning_rate': 1e-3,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

# np.ravel guarantees the 1-D shape (n_samples,) that scikit-learn documents for `groups`.
kf = GroupShuffleSplit(n_splits=5, random_state=1)
n_splits = kf.get_n_splits()

# Results of each fold.
# GroupShuffleSplit draws every validation set independently, so the sets may
# overlap and need not cover all rows. NaN marks rows that were never
# validated; they are excluded from the overall RMSE below instead of being
# scored as a prediction of 0. Rows validated more than once keep the
# prediction of the last fold that contained them.
oof_predictions = np.full(y.shape, np.nan, dtype=float)
models = []
rmse_scores = []

for fold, (idx_tr, idx_va) in enumerate(kf.split(train_pd, groups=np.ravel(groups))):
    print(f"\nFold {fold + 1}/{n_splits}")
    print("-" * 50)

    # Prepare the data of the current fold
    X_tr = train_pd.iloc[idx_tr]
    X_va = train_pd.iloc[idx_va]
    y_tr = y[idx_tr]
    y_va = y[idx_va]

    X_train_processed, X_val_processed, _ = prepare_data(X_tr, y_tr, good_features, val_pd=X_va)

    # Train the model
    model, trainer, val_rmse = train_diffusion(
        X_train_processed,
        y_tr,
        X_val_processed,
        y_va,
        model_params,
        train_params
    )

    # Predict on the device selected in train_params; a hard-coded 'cuda'
    # would not match the CPU fallback configured there.
    val_pred = predict_diffusion(model, trainer, X_val_processed, device=train_params['device'])
    val_pred = val_pred.clip(-1, 1)

    oof_predictions[idx_va] = val_pred
    models.append(model)
    rmse_scores.append(val_rmse)

    fold_rmse = root_mean_squared_error(y_va, val_pred)
    print(f"Fold {fold + 1} Final RMSE: {fold_rmse:.3f}")

# Compute the overall RMSE on the rows that appeared in at least one validation split
validated = ~np.isnan(oof_predictions)
overall_rmse = root_mean_squared_error(y[validated], oof_predictions[validated])
print(f"\nOverall RMSE: {overall_rmse:.3f}")
print(f"Rows with a validation prediction: {validated.mean():.0%}")




Fold 1/5
--------------------------------------------------
