In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the Hitters table and work on a copy with incomplete rows removed.
hit = pd.read_csv("Hitters.csv")
df = hit.copy().dropna()

# Target column, plus dummy-encoded versions of the three categorical columns.
y = df["Salary"]
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])

# Remaining columns are cast to float64; one dummy column per categorical
# feature is then appended to form the final design matrix.
X_ = df.drop(columns=['Salary', 'League', 'Division', 'NewLeague']).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Optional one-time setup: uncomment the next line to install xgboost from inside the notebook.
#%pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 1.1 MB/s eta 0:02:14
   ---------------------------------------- 0.8/150.0 MB 1.0 MB/s eta 0:02:23
   ---------------------------------------- 0.8/150.0 MB 1.0 MB/s eta 0:02:23
   ---------------------------------------- 1.0/150.0 MB 967.3 kB/s eta 0:02:34
   ---------------------------------------- 1.3/150.0 MB 944.7 kB/s eta 0:02:38
   ---------------------------------------- 1.3/150.0 MB 944.7 kB/s eta 0:02:38
   ---------------------------------------- 1.6/150.0 MB 942.3 kB/s eta 0:02:38
   ---------------------------------------- 1.8/150.0 MB 940.7 kB/s eta 0:02:38
   --

In [4]:
import xgboost as xgb

# Wrap the train and test splits in xgboost DMatrix objects, attaching the
# targets as labels. NOTE(review): neither object is referenced by the cells
# visible in this section -- confirm whether later cells need them.
DM_train = xgb.DMatrix(X_train, label=y_train)
DM_test = xgb.DMatrix(X_test, label=y_test)

In [7]:
from xgboost import XGBRegressor

# Baseline model: a regressor built with no explicit hyperparameters,
# fitted on the training split.
xgb_model = (
    XGBRegressor()
    .fit(X_train, y_train)
)

In [8]:
# Generate predictions for the held-out test features.
y_pred = xgb_model.predict(X_test)

In [9]:
# Test-set RMSE: square root of the mean squared error between targets and predictions.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

366.3863437634965

MODEL TUNING

In [10]:
# Hyperparameter search space: each key is a regressor parameter name and each
# list holds the values to try (5 * 4 * 5 * 3 = 300 combinations in total).
xgb_grid = dict(
    colsample_bytree=[0.4, 0.5, 0.6, 0.9, 1],
    n_estimators=[100, 200, 500, 1000],
    max_depth=[2, 3, 4, 5, 6],
    learning_rate=[0.1, 0.01, 0.5],
)


In [11]:
# Rebind xgb_model to a fresh, unfitted regressor; it is the estimator handed to the grid search.
xgb_model = XGBRegressor()

In [12]:
# Exhaustive search over xgb_grid with 10-fold cross-validation.
# n_jobs=-1 asks for all available processors; verbose=2 prints progress while fitting.
# No `scoring` is passed, so candidates are ranked by the estimator's own score method.
xgb_cv_model = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [13]:
# Run the cross-validated grid search on the training split only.
xgb_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 300 candidates, totalling 3000 fits


In [14]:
# Display the parameter combination selected by the search.
xgb_cv_model.best_params_

{'colsample_bytree': 0.4,
 'learning_rate': 0.1,
 'max_depth': 6,
 'n_estimators': 100}

In [20]:
# Fit a standalone regressor on the training split using the hyperparameters
# chosen by the grid search. Unpacking best_params_ forwards every tuned key,
# so this cell stays in sync with the search grid if parameters are added to
# or removed from it (a hand-copied key list would silently drop new ones).
xgb_tuned = XGBRegressor(**xgb_cv_model.best_params_).fit(X_train, y_train)

In [21]:
# Predict on the held-out split with the tuned model, then report its test-set RMSE.
y_pred_tuned = xgb_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_tuned))

343.6101991802883