In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,ShuffleSplit,GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn import model_selection

In [2]:
# Load the Hitters data set; work on a copy with incomplete rows removed so
# the raw frame `hit` stays untouched.
hit = pd.read_csv("Hitters.csv")
df = hit.copy().dropna()

# One-hot encode the League / Division / NewLeague columns.
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])

# Target variable.
y = df["Salary"]

# Predictors other than the target and the raw (un-encoded) columns, as float64.
X_ = df.drop(columns=["Salary", "League", "Division", "NewLeague"]).astype("float64")

# Keep a single indicator column per encoded variable and append it to X_.
X = pd.concat(
    [X_, dms[["League_N", "Division_W", "NewLeague_N"]]],
    axis="columns",
)

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [3]:
from catboost import CatBoostRegressor

# Baseline: CatBoost regressor with no hyperparameters overridden.
# Fitting prints one progress line per boosting iteration (see the output below).
catb = CatBoostRegressor()
catb_model = catb.fit(X=X_train, y=y_train)

Learning rate set to 0.031674
0:	learn: 437.6430699	total: 126ms	remaining: 2m 5s
1:	learn: 431.3923642	total: 127ms	remaining: 1m 3s
2:	learn: 424.8820360	total: 129ms	remaining: 42.8s
3:	learn: 418.2514904	total: 130ms	remaining: 32.4s
4:	learn: 412.6394021	total: 132ms	remaining: 26.2s
5:	learn: 406.6247020	total: 133ms	remaining: 22s
6:	learn: 400.5321206	total: 134ms	remaining: 19s
7:	learn: 394.6683437	total: 135ms	remaining: 16.8s
8:	learn: 388.2496484	total: 137ms	remaining: 15s
9:	learn: 382.9448842	total: 138ms	remaining: 13.6s
10:	learn: 377.2600080	total: 139ms	remaining: 12.5s
11:	learn: 372.4829606	total: 140ms	remaining: 11.6s
12:	learn: 366.6823437	total: 142ms	remaining: 10.8s
13:	learn: 362.6076230	total: 143ms	remaining: 10.1s
14:	learn: 358.0107745	total: 144ms	remaining: 9.46s
15:	learn: 353.2802665	total: 145ms	remaining: 8.94s
16:	learn: 348.5646265	total: 147ms	remaining: 8.47s
17:	learn: 343.6407912	total: 148ms	remaining: 8.06s
18:	learn: 339.2363847	total: 14

# Prediction (Tahmin)

In [4]:
# Predict on the held-out rows with the baseline model, then report the
# root mean squared error (square root of the MSE) as the cell output.
y_pred = catb_model.predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

351.194631344607

# Model Tuning

In [6]:
# Hyperparameter grid for the search: 4 iteration counts x 4 learning rates
# x 5 tree depths = 80 candidate combinations.
catb_grid = dict(
    iterations=[200, 500, 1000, 2000],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    depth=[3, 5, 6, 7, 8],
)

In [8]:
# Fresh, unfitted estimator for the search (rebinds `catb`).
catb = CatBoostRegressor()

# Exhaustive 5-fold cross-validated search over `catb_grid`, using all cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than the RMSE reported elsewhere in this
# notebook -- confirm that is the intended selection metric.
catb_cv_model = GridSearchCV(
    estimator=catb,
    param_grid=catb_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [9]:
# Run the grid search on the training split. This is the expensive cell:
# 80 candidates x 5 folds = 400 model fits. The fitted search object is the
# cell's displayed output.
catb_cv_model.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
0:	learn: 422.4143448	total: 1.2ms	remaining: 1.2s
1:	learn: 404.1864276	total: 2.34ms	remaining: 1.17s
2:	learn: 386.3231718	total: 3.35ms	remaining: 1.11s
3:	learn: 370.5548032	total: 4.34ms	remaining: 1.08s
4:	learn: 354.9242038	total: 5.34ms	remaining: 1.06s
5:	learn: 342.3403984	total: 6.27ms	remaining: 1.04s
6:	learn: 328.2370070	total: 7.26ms	remaining: 1.03s
7:	learn: 317.5056526	total: 8.28ms	remaining: 1.03s
8:	learn: 306.6243511	total: 9.28ms	remaining: 1.02s
9:	learn: 297.3147023	total: 10.2ms	remaining: 1.01s
10:	learn: 288.3685892	total: 11.2ms	remaining: 1.01s
11:	learn: 281.0996220	total: 12.2ms	remaining: 1s
12:	learn: 273.2254898	total: 13.2ms	remaining: 1s
13:	learn: 266.9003385	total: 14.2ms	remaining: 1s
14:	learn: 261.9092500	total: 15.2ms	remaining: 998ms
15:	learn: 256.2637350	total: 16.2ms	remaining: 995ms
16:	learn: 250.3667935	total: 17.1ms	remaining: 991ms
17:	learn: 244.8631098	total: 18.1ms	rema

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostRegressor object at 0x0000027D5D757820>,
             n_jobs=-1,
             param_grid={'depth': [3, 5, 6, 7, 8],
                         'iterations': [200, 500, 1000, 2000],
                         'learning_rate': [0.01, 0.03, 0.05, 0.1]},
             verbose=2)

In [10]:
# Show the hyperparameter combination the grid search selected.
catb_cv_model.best_params_

{'depth': 5, 'iterations': 1000, 'learning_rate': 0.1}

In [11]:
# Build the final model directly from the grid-search result instead of
# hand-copying the values shown by `best_params_`: hard-coded numbers silently
# go stale whenever the grid, the data, or the split changes and the search is
# re-run. For the recorded run this expands to
# iterations=1000, learning_rate=0.1, depth=5.
# NOTE(review): with GridSearchCV's default refit behaviour,
# `catb_cv_model.best_estimator_` should already hold an equivalent model
# fitted on X_train -- this explicit refit is kept to preserve the cell's
# existing behaviour and output.
catb_tuned = CatBoostRegressor(**catb_cv_model.best_params_)
catb_tuned = catb_tuned.fit(X_train, y_train)

0:	learn: 422.4143448	total: 2.11ms	remaining: 2.11s
1:	learn: 404.1864276	total: 4.07ms	remaining: 2.03s
2:	learn: 386.3231718	total: 6.22ms	remaining: 2.07s
3:	learn: 370.5548032	total: 8.33ms	remaining: 2.07s
4:	learn: 354.9242038	total: 10.5ms	remaining: 2.09s
5:	learn: 342.3403984	total: 13.3ms	remaining: 2.21s
6:	learn: 328.2370070	total: 15.5ms	remaining: 2.2s
7:	learn: 317.5056526	total: 17.6ms	remaining: 2.18s
8:	learn: 306.6243511	total: 19.3ms	remaining: 2.12s
9:	learn: 297.3147023	total: 20.8ms	remaining: 2.06s
10:	learn: 288.3685892	total: 22.2ms	remaining: 2s
11:	learn: 281.0996220	total: 23.6ms	remaining: 1.94s
12:	learn: 273.2254898	total: 25.3ms	remaining: 1.92s
13:	learn: 266.9003385	total: 27ms	remaining: 1.9s
14:	learn: 261.9092500	total: 28.8ms	remaining: 1.89s
15:	learn: 256.2637350	total: 30.5ms	remaining: 1.88s
16:	learn: 250.3667935	total: 31.9ms	remaining: 1.84s
17:	learn: 244.8631098	total: 33.2ms	remaining: 1.81s
18:	learn: 240.1540669	total: 34.4ms	remainin

In [12]:
# Predict on the held-out rows with the tuned model (rebinds `y_pred`), then
# report the root mean squared error as the cell output.
y_pred = catb_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

356.665762904938