In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
# !conda install -c rapidsai -c nvidia -c conda-forge -c defaults cuml=23.2 python=3.10 cudatoolkit=11.8 -y
# !pip install xgboost --upgrade
from xgboost import XGBRegressor

print('Data source import complete.')


Pakai kalau kamu punya label (target) yang jelas dan ingin memprediksi sesuatu.

Contoh target yang bisa dipakai dari dataset ini:

Memprediksi siapa yang menang (Winner)
→ fitur: rating putih/hitam, opening, jumlah langkah, dll
→ model: klasifikasi (White, Black, Draw)

Memprediksi apakah game rated atau tidak (Rated)
→ fitur: rating, status, opening, durasi
→ model: klasifikasi (binary)

Memprediksi jumlah langkah (Number of Turns)
→ model: regresi

📌 Model yang cocok:

Random Forest, XGBoost, Logistic Regression, SVM, dll.

In [None]:
import kagglehub
import numpy as np
import xgboost as xgb
import cupy as cp
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
# Path to the games CSV under /kaggle/input (nothing is downloaded in this cell).

data_path = "/kaggle/input/chess/games.csv"
print(data_path)

In [None]:
# Read the CSV located at `data_path` into the working DataFrame.
df = pd.read_csv(data_path)

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr_matrix = (
    df.select_dtypes(include=np.number)
      .corr()
)

sns.heatmap(data=corr_matrix, cmap="coolwarm", annot=True)

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Summary statistics for the numeric columns.
df.select_dtypes(include=np.number).describe()

In [None]:
# Summary statistics for the object-dtype (text) columns.
df.select_dtypes(include='object').describe()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Split the columns by dtype.
chess_numerical = df.select_dtypes(include=np.number)
chess_categorical = df.select_dtypes(include='object')

# One histogram per numeric column, each shown as its own figure.
for column in chess_numerical.columns:
    sns.histplot(data=chess_numerical, x=column)
    plt.show()

In [None]:
# Print the skewness of every numeric column.
for column in chess_numerical.columns:
    print(f"Skewness of {column}: {df[column].skew():.2f}")

# Feature Engineering

In [None]:
# Elapsed value per game: last-move timestamp minus creation timestamp.
elapsed = df['last_move_at'] - df['created_at']

# Cap at the 99th percentile, then compress the range with log1p.
q99 = elapsed.quantile(0.99)
df['duration'] = np.log1p(elapsed.clip(upper=q99))

# The raw timestamp columns are dropped once the duration has been derived.
df.drop(columns=['created_at', 'last_move_at'], inplace=True)
chess_numerical = df.select_dtypes(include=np.number)

# Print the skewness of every numeric column after the transform.
for column in chess_numerical.columns:
    print(f"Skewness of {column}: {df[column].skew():.2f}")

In [None]:
# Log-transform `opening_ply` (log1p keeps zero values finite).
df['opening_ply'] = np.log1p(df['opening_ply'])
chess_numerical = df.select_dtypes(include=np.number)

# Print the skewness of every numeric column, including the transformed
# `opening_ply`. The separate, discarded `.skew()` call was removed because
# this loop already reports the same value.
for column in chess_numerical.columns:
    print(f"Skewness of {column}: {df[column].skew():.2f}")

In [None]:
# Split the "<initial>+<increment>" code on '+' into two integer columns.
# The original `increment_code` column is kept.
time_control = df['increment_code'].str.split('+', expand=True)
df['initial_time'] = time_control[0].astype(int)
df['increment_time'] = time_control[1].astype(int)


In [None]:
# Signed rating difference: positive when `white_rating` exceeds `black_rating`.
# Both rating columns stay in the frame alongside the gap.
df['rating_gap'] = df['white_rating'].sub(df['black_rating'])

In [None]:
# Re-check column names and dtypes now that the engineered columns exist.
df.info()

In [None]:
# Cast every boolean column to integers (False -> 0, True -> 1).
bool_columns = df.select_dtypes('bool').columns
df = df.astype(dict.fromkeys(bool_columns, 'int'))
df.info()

In [None]:
# List the distinct values found in each object-dtype column.
chess_categorical = df.select_dtypes(include='object')
for column in chess_categorical.columns:
    distinct = df[column].unique()
    print(f" Unique values of {column}:\n  {distinct}")

In [None]:
# Integer-encode every object-dtype column.
# Each column gets its own fitted encoder, stored by column name, so the
# integer codes can be mapped back to the original labels later with
# label_encoders[col].inverse_transform(...). Refitting one shared encoder
# would keep only the mapping of the last column.
label_encoders = {}

for col in df.select_dtypes(include='object').columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

df.info()

In [None]:
# Print the skewness of every column of the encoded frame.
for column in df.columns:
    print(f"Skewness of {column}: {df[column].skew():.2f}")

In [None]:
# Skewness is measured on the current DataFrame rather than read from a
# hand-copied table, so the decisions below cannot go stale when earlier
# cells change (the old table still listed columns that had been dropped).
skewness_values = df.skew(numeric_only=True).to_dict()

# np.exp overflows to inf for float64 inputs slightly above 709.
EXP_SAFE_MAX = 700

# Reduce strong skew: log1p for right-skewed columns, exp for left-skewed ones.
# NOTE(review): this loop does not exclude label-encoded columns or `turns`,
# which a later cell uses as the target; confirm that transforming them is
# intended.
for col, skewness in skewness_values.items():
    if skewness > 0.5:  # strong positive skew
        if df[col].min() <= -1:
            # log1p is undefined at or below -1; leave the column untouched.
            print(f"Skipping log transform for {col}: values <= -1 present")
            continue
        print(f"Applying log transform to {col} (skewness: {skewness:.2f})")
        df[col] = np.log1p(df[col])  # log1p so that zeros stay finite
    elif skewness < -0.5:  # strong negative skew
        if df[col].max() > EXP_SAFE_MAX:
            # exp of such values would overflow to inf; leave the column untouched.
            print(f"Skipping exponential transform for {col}: values too large")
            continue
        print(f"Applying exponential transform to {col} (skewness: {skewness:.2f})")
        df[col] = np.exp(df[col])
    else:
        print(f"No transform needed for {col} (skewness: {skewness:.2f})")

In [None]:
# Print the skewness of every column again after the skew-correction step.
for column in df.columns:
    column_skew = df[column].skew()
    print(f"Skewness of {column}: {column_skew:.2f}")

In [None]:
# Target is the `turns` column; the features are every other column.
y = df['turns']
X = df.drop(columns=['turns'])

In [None]:
# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# GPU alternative (not used here): XGBRegressor(tree_method="hist", device="cuda")
# trained on CuPy copies of the scaled arrays.

# Baseline model: a random forest with unrestricted depth.
# (The former `model = XGBRegressor` line bound the class itself, not an
# instance, and was overwritten immediately, so it has been removed.)
model = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1)
model.fit(X_train_scaled, y_train)

# Baseline predictions on the held-out split.
y_pred = model.predict(X_test_scaled)

# Hyperparameter search: 3 * 3 * 3 * 3 = 81 candidates, 5 folds each.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# n_jobs=-1 runs the candidate/fold fits in parallel; the fixed random_state
# keeps the search results the same as a serial run.
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           scoring='r2',
                           cv=5,
                           n_jobs=-1,
                           verbose=1)
grid_search.fit(X_train_scaled, y_train)
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Estimator carrying the best parameter combination found by the grid search.
best_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)

# Final evaluation on test data
final_preds = best_model.predict(X_test_scaled)
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6; taking the square root of the MSE works on every version.
final_rmse = np.sqrt(mean_squared_error(y_test, final_preds))
print(f"Final RMSE: {final_rmse}")

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Entries whose true value is exactly zero are left out of the average,
    because their percentage error is undefined and would otherwise turn the
    whole result into inf or nan.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` over the non-zero
        targets, multiplied by 100. ``nan`` when every target is zero or the
        input is empty.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    nonzero = y_true != 0
    if not nonzero.any():
        return float("nan")

    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

In [None]:
def predict_perform(X_train, X_test, y_train, y_test, estimator=None):
    """Print R² on both splits and MAPE on the test split for a fitted regressor.

    Args:
        X_train: Features scored as the training split.
        X_test: Features scored as the test split.
        y_train: Targets matching ``X_train``.
        y_test: Targets matching ``X_test``.
        estimator: Fitted object exposing ``predict``. Defaults to the
            notebook-level ``model`` so existing calls keep working.

    Returns:
        None. The metrics are printed.
    """
    fitted = model if estimator is None else estimator

    def _to_host(values):
        # CuPy arrays are converted to NumPy via .get(); anything else passes through.
        return values.get() if isinstance(values, cp.ndarray) else values

    y_train_pred = _to_host(fitted.predict(X_train))
    y_test_pred = _to_host(fitted.predict(X_test))
    y_train = _to_host(y_train)
    y_test = _to_host(y_test)

    # All metrics use the predictions made in this call. The headline R² used
    # to read the notebook-level `y_pred`, which may belong to another model.
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    MAPE = mean_absolute_percentage_error(y_test, y_test_pred)

    print(f"🔹 R²: {r2_test:.4f}")
    print(f"🔹 R² Train : {r2_train:.3f}")
    print(f"🔹 R² Test  : {r2_test:.3f}")
    print(f"🔹 MAPE     : {MAPE:.3f}%")

# Scores the baseline `model`; pass estimator=best_model to score the tuned one.
predict_perform(X_train_scaled, X_test_scaled, y_train, y_test)