In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the 'datasnaek/chess' dataset through kagglehub and keep the value it
# returns (presumably the local directory holding the files — TODO confirm).
datasnaek_chess_path = kagglehub.dataset_download('datasnaek/chess')

print('Data source import complete.')


In [None]:
# Make XGBoost's scikit-learn style regressor available to the notebook.
# The two commented-out commands below are environment-setup commands kept for
# reference (a RAPIDS/cuML conda install and an XGBoost upgrade); they are not
# executed by this cell.
# !conda install -c rapidsai -c nvidia -c conda-forge -c defaults cuml=23.2 python=3.10 cudatoolkit=11.8 -y
# !pip install xgboost --upgrade
from xgboost import XGBRegressor

# NOTE(review): this message matches the one in the data-import cell, but no
# data source is loaded here — only the import above runs.
print('Data source import complete.')


In [None]:
import kagglehub
import numpy as np
import xgboost as xgb
import cupy as cp
import cudf as cd
import datetime
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from cuml.preprocessing import StandardScaler
from cuml.preprocessing import LabelEncoder
from dask_ml.model_selection import GridSearchCV
from cuml.ensemble import RandomForestRegressor
from cuml.model_selection import train_test_split
from cuml.metrics import mean_squared_error, r2_score
# Resolve the location of games.csv.
# The /kaggle/input path only exists inside a Kaggle kernel. Anywhere else,
# fall back to kagglehub, which downloads the dataset (or reuses its cached
# copy) and returns the directory that contains the files.
KAGGLE_GAMES_CSV = "/kaggle/input/chess/games.csv"

if os.path.exists(KAGGLE_GAMES_CSV):
    data_path = KAGGLE_GAMES_CSV
else:
    data_path = os.path.join(kagglehub.dataset_download("datasnaek/chess"), "games.csv")
print(data_path)

In [None]:
# Read the games CSV into a cuDF DataFrame.
df = cd.read_csv(data_path)

In [None]:
# Summarise the freshly loaded frame (columns and dtypes).
df.info()

# data

In [None]:
# Preview the first rows of a de-duplicated copy of the frame.
# NOTE(review): the result is only displayed, not assigned back, so `df` itself
# still contains any duplicate rows — confirm whether that is intended.
df.drop_duplicates().head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Show the frame summary again before any column is modified.
df.info()

In [None]:
# Cast every boolean column to an integer dtype, then peek at 'rated' to
# check the result.
bool_columns = df.select_dtypes(bool).columns
df = df.astype(dict.fromkeys(bool_columns, 'int'))
df['rated'].head(5)

In [None]:
# Print the skewness of every numeric column, one line per column.
for i in df.select_dtypes(include=cp.number).columns:
    skewness = df[i].skew()
    print(f"Skewness {i} adalah {skewness}")

In [None]:
# Frame summary after the boolean-to-integer cast.
df.info()

# Feature Engineering

In [None]:
# Convert the epoch columns to datetimes.
# The Lichess export stores created_at / last_move_at as Unix epoch
# *milliseconds* (values around 1.5e12); reading them as seconds yields dates
# tens of thousands of years in the future (or an overflow).
# NOTE(review): confirm against df[['created_at', 'last_move_at']].head().
df['created_at'] = cd.to_datetime(df['created_at'], unit='ms')
df['last_move_at'] = cd.to_datetime(df['last_move_at'], unit='ms')

In [None]:
# Elapsed time between game creation and the last move, expressed in seconds.
df['game_duration'] = (df['last_move_at'] - df['created_at']).dt.total_seconds()

In [None]:
# Calendar features taken from the game's creation timestamp:
# the hour component and the weekday name.
df['hour_of_day'] = df['created_at'].dt.hour
df['day_of_week'] = df['created_at'].dt.day_name()

In [None]:
# Rating gap (white minus black) and the mean rating of the two players.
df['rating_diff'] = df['white_rating'] - df['black_rating']
df['avg_rating'] = (df['white_rating'] + df['black_rating'])/2

In [None]:
# Split an increment code such as "15+2" into its two numeric parts:
# the digits before the '+' (base time) and the digits after it (increment).
# expand=False makes str.extract return a Series for the single capture group
# instead of a one-column DataFrame, so a plain column is assigned.
df['base_time'] = df['increment_code'].str.extract(r'^(\d+)\+', expand=False).astype('int')
df['increment_time'] = df['increment_code'].str.extract(r'\+(\d+)$', expand=False).astype('int')

In [None]:
# Split the whitespace-separated move string into a list column and
# record how many entries each list has.
df['moves_list'] = df['moves'].str.split()
df['num_moves'] = df['moves_list'].list.len()

In [None]:
# List the distinct victory_status values present in the data.
df['victory_status'].unique()

In [None]:
# Integer codes for the two categorical outcome columns.
# Values absent from a mapping become null, so this cell must only be run once
# per freshly loaded frame.
WINNER_CODES = {'white': 0, 'black': 1, 'draw': 2}
VICTORY_STATUS_CODES = {'outoftime': 0, 'resign': 1, 'mate': 2, 'draw': 3}

df['winner'] = df['winner'].map(WINNER_CODES)
df['victory_status'] = df['victory_status'].map(VICTORY_STATUS_CODES)

In [None]:
# Integer-encode the opening columns. cd.factorize returns a pair whose first
# element holds the integer codes; only those codes are stored.
OPENING_ENCODINGS = {
    'opening_eco_encode': 'opening_eco',
    'opening_name_encode': 'opening_name',
}
for encoded_col, source_col in OPENING_ENCODINGS.items():
    df[encoded_col] = cd.factorize(df[source_col])[0]

In [None]:
# Remove the raw columns that have already been turned into derived features.
RAW_COLUMNS = [
    'created_at',
    'last_move_at',
    'moves',
    'opening_eco',
    'opening_name',
    'increment_code',
]
df = df.drop(columns=RAW_COLUMNS)

In [None]:
# Frame summary after dropping the raw columns.
df.info()

In [None]:
# Binary flags: 1 when the weekday name is Saturday or Sunday, and
# 1 when the hour of day is 20 or later.
df['is_weekend'] = df['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)
df['is_night_game'] = (df['hour_of_day'] >= 20).astype(int)

In [None]:
def _rating_category(ratings, bounds=(1200, 1600, 2000)):
    """Bucket ratings into ordinal categories.

    A rating's category is the number of `bounds` it strictly exceeds, so with
    the default bounds: <=1200 -> 0, 1201-1600 -> 1, 1601-2000 -> 2, >2000 -> 3.

    Parameters
    ----------
    ratings : Series of numeric ratings.
    bounds : ascending upper bounds (inclusive) of all but the last bucket.

    Returns
    -------
    Integer Series aligned with `ratings`.
    """
    category = (ratings > bounds[0]).astype('int64')
    for bound in bounds[1:]:
        category = category + (ratings > bound).astype('int64')
    return category


# Same bucketing for both players; the helper replaces two copies of the
# identical comparison chain.
df['white_rating_category'] = _rating_category(df['white_rating'])
df['black_rating_category'] = _rating_category(df['black_rating'])

In [None]:
# Keep the entries at index 0 and index 1 of each move list (stored as white's
# and black's first move), then drop the list column itself.
# NOTE(review): a list with fewer than two entries has no element at index 1 —
# confirm how list.get handles that in the installed cuDF version.
df['first_move_white'] = df['moves_list'].list.get(0)
df['first_move_black'] = df['moves_list'].list.get(1)

df.drop(columns=['moves_list'], inplace=True)

In [None]:
# Summarise only the columns that are not numeric yet.
df.select_dtypes(exclude=np.number).info()

In [None]:
# Replace the weekday names with the labels produced by a fitted LabelEncoder.
labelencoder = LabelEncoder()
df['day_of_week'] = labelencoder.fit_transform(df['day_of_week'])

In [None]:
# Replace each player id with the mean number of turns over that player's games.
# NOTE(review): 'turns' is the modelling target and these means are computed on
# the full frame before the train/test split, so the encoded columns carry
# target information from rows that later land in the test set.
for player_col in ('white_id', 'black_id'):
    mean_turns = df.groupby(player_col)['turns'].mean()
    df[player_col] = df[player_col].map(mean_turns)

In [None]:
# Replace each first move with the mean number of turns of the games that
# started with it.
# NOTE(review): the means use the target column on the full frame, i.e. before
# the train/test split.
for move_col in ('first_move_white', 'first_move_black'):
    mean_turns_by_move = df.groupby(move_col)['turns'].mean()
    df[move_col] = df[move_col].map(mean_turns_by_move)

In [None]:
# 'id' identifies a single game. Replacing it with the mean of 'turns' per id
# reproduces the target itself (each group is one game), which would hand the
# answer to the model as a feature. An identifier carries no predictive signal
# of its own, so the column is removed instead of encoded.
df = df.drop(columns=['id'])

In [None]:
# df.drop(columns=[''], inplace=True)


In [None]:
# Frame summary after all feature-engineering steps.
df.info()

# modeling

In [None]:
# Target: number of turns in the game.
# 'num_moves' is the count of entries in the move string, i.e. the same
# quantity as the target, so keeping it as a feature would leak the answer.
# NOTE(review): confirm with (df['num_moves'] == df['turns']).mean().
X = df.drop(columns=['turns', 'num_moves'])
y = df['turns']

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features with statistics learned on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of top-scoring features to keep. The selection below slices with
# `[:K]`, but K was never defined, which raised a NameError.
K = 10

# Manual ANOVA-style relevance score (loosely modelled on f_classif).
# Both objects get a fresh positional index so that grouping X by y pairs rows
# by position rather than by whatever index labels survived the split/scaling.
X_cd = cd.DataFrame(X_train_scaled).reset_index(drop=True)
y_cd = cd.Series(y_train).reset_index(drop=True)

# Restore the feature names if scaling replaced them with positions, so the
# selected features are reported by name.
feature_names = getattr(X_train, 'columns', None)
if feature_names is not None and len(feature_names) == len(X_cd.columns):
    X_cd.columns = list(feature_names)

# Score of a feature = sum over target values of the squared distance between
# the feature's mean within that target group and its overall mean.
grouped = X_cd.groupby(y_cd)
f_scores = {}
for column in X_cd.columns:
    group_means = grouped[column].mean()
    overall_mean = X_cd[column].mean()
    score = float(((group_means - overall_mean) ** 2).sum())
    # A NaN score cannot be ordered reliably; rank such features last.
    f_scores[column] = score if score == score else float('-inf')

# Keep the K highest-scoring features.
selected_features = sorted(f_scores, key=f_scores.get, reverse=True)[:K]
X_selected = X_cd[selected_features]

# Report the chosen feature names (the previous label said "Best Parameters"
# and dumped the whole selected frame).
print("Selected features:", selected_features)

In [None]:
# The original cell left `best_model = ` empty (a SyntaxError) and never
# trained a model. The search below is a small hold-out grid search over
# cuML random forests.
# NOTE(review): the model family and the candidate grid are placeholders —
# adjust them to the intended experiment.
param_grid = [
    {"n_estimators": 100, "max_depth": 8},
    {"n_estimators": 100, "max_depth": 16},
    {"n_estimators": 300, "max_depth": 16},
]

# Carve a validation set out of the training data so that the test split is
# only touched once, for the final score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

best_params, best_val_rmse = param_grid[0], float("inf")
for params in param_grid:
    candidate = RandomForestRegressor(random_state=42, **params)
    candidate.fit(X_fit, y_fit)
    val_rmse = float(mean_squared_error(y_val, candidate.predict(X_val), squared=False))
    if val_rmse < best_val_rmse:
        best_params, best_val_rmse = params, val_rmse

# Refit the winning configuration on the complete training split.
best_model = RandomForestRegressor(random_state=42, **best_params)
best_model.fit(X_train_scaled, y_train)
print("Best Hyperparameters:", best_params)

# Final evaluation on test data
final_preds = best_model.predict(X_test_scaled)
final_rmse = mean_squared_error(y_test, final_preds, squared=False)
print(f"Final RMSE: {final_rmse}")

In [None]:
def _as_host_array(values):
    """Return `values` as a float NumPy array, copying from the device if needed."""
    if hasattr(values, "to_numpy"):
        # cuDF / pandas objects: explicit host conversion (np.array() on a cuDF
        # object is rejected because implicit device-to-host copies are disallowed).
        values = values.to_numpy()
    elif hasattr(values, "get"):
        # CuPy arrays expose .get() for the device-to-host copy.
        values = values.get()
    return np.asarray(values, dtype=float)


def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values. Lists, NumPy arrays, pandas/cuDF
        objects and CuPy arrays are accepted.

    Returns
    -------
    float
        mean(|y_true - y_pred| / |y_true|) * 100. Returns NaN for empty input.
        Targets equal to zero are divided by machine epsilon instead of zero,
        so the result stays finite (very large) rather than inf/NaN.
    """
    y_true = _as_host_array(y_true)
    y_pred = _as_host_array(y_pred)
    if y_true.size == 0:
        return float("nan")
    # Guard the denominator against exact zeros.
    denominator = np.maximum(np.abs(y_true), np.finfo(float).eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

In [None]:
def predict_perform(X_train, X_test, y_train, y_test, model=None):
    """Print R² (train/test) and MAPE (test) for a fitted regressor.

    Parameters
    ----------
    X_train, X_test : feature matrices passed to `model.predict`.
    y_train, y_test : matching targets.
    model : fitted estimator with a `predict` method. Defaults to the
        notebook-level `best_model`.

    Notes
    -----
    The first R² line previously scored the notebook-level `final_preds`
    instead of predictions made from `X_test`; it now reports the same
    prediction-based test score as the "R² Test" line.
    """
    if model is None:
        model = best_model

    def to_host(values):
        # Metrics below run on host arrays: convert cuDF objects and CuPy
        # arrays explicitly (previously only cp.ndarray was handled).
        if hasattr(values, "to_numpy"):
            values = values.to_numpy()
        elif isinstance(values, cp.ndarray):
            values = values.get()
        return np.asarray(values, dtype=np.float64)

    # Predictions
    y_train_pred = to_host(model.predict(X_train))
    y_test_pred = to_host(model.predict(X_test))
    y_train = to_host(y_train)
    y_test = to_host(y_test)

    # Evaluation
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    r2 = r2_test
    MAPE = mean_absolute_percentage_error(y_test, y_test_pred)

    print(f"🔹 R²: {r2:.4f}")
    print(f"🔹 R² Train : {r2_train:.3f}")
    print(f"🔹 R² Test  : {r2_test:.3f}")
    print(f"🔹 MAPE     : {MAPE:.3f}%")

predict_perform(X_train_scaled, X_test_scaled, y_train, y_test)