In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])

y = df["Salary"]
X_ = df.drop(["Salary", "League", "Division", "NewLeague"], axis=1).astype("float64")
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 25, random_state = 42)

In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5


In [4]:
import xgboost as xgb

In [5]:
# xgboost'u performanslı kullanmak için kendi değişkenleri üzerinden kullanmak gerekiyor
DM_train = xgb.DMatrix(data = X_train, label = y_train)
DM_test = xgb.DMatrix(data = X_test, label = y_test)

In [6]:
from xgboost import XGBRegressor

In [8]:
xgb = XGBRegressor().fit(X_train, y_train)

## Tahmin

In [9]:
y_pred = xgb.predict(X_test)

print("Tahmin hatası: ",np.sqrt(mean_squared_error(y_test, y_pred)))

Tahmin hatası:  329.43195380236057


## Model Tuning

In [11]:
xgb_grid = {
    "colsample_bytree": [0.4, 0.5, 0.6, 0.9, 1],
    "n_estimators": [100, 200, 500, 1000],
    "max_depth": [2,3,4,5,6],
    "learning_rate": [0.1, 0.01, 0.5]
}

In [14]:
xgb = XGBRegressor()

xgb_cv = GridSearchCV(xgb, 
                      param_grid = xgb_grid, 
                      cv = 10, 
                      n_jobs = -1, 
                      verbose = 2)

xgb_cv.fit(X_train, y_train)

Fitting 10 folds for each of 300 candidates, totalling 3000 fits
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, lea

[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=500; total time=   0.1s

[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=100; total time=   0.0s

[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=2, n_estimators=200; total time=   0.0s


[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=1000; total time=   0.2s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=1000; total t

[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=1000; total time=   0.2s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=1000; total time=   0.2s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=1000; total time=   0.2s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=1000; total time=   0.3s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=1000; total time=   0.2s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=1000; total time=

[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=500; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=1000; total time=   0.2s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=1000; total time=   0.

[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=6, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=6, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=6, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=6, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=6, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=6, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=6, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=6, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.5, max_depth=6, n_estimators=1000; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0

In [15]:
xgb_cv.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_depth': 2,
 'n_estimators': 500}

In [17]:
# En iyi değerler ile model oluşturma
xgb_tuned = XGBRegressor(learning_rate = 0.1, 
                         max_depth = 2,
                         n_estimators = 500,
                         colsample_bytree = 0.5)

xgb_tuned = xgb_tuned.fit(X_train, y_train)

In [19]:
y_pred = xgb_tuned.predict(X_test)

print("Tahmin hatası: ",np.sqrt(mean_squared_error(y_test, y_pred)))

Tahmin hatası:  304.3503871237262
