In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# Load the raw Hitters table; `df` is a cleaned copy with every incomplete row removed.
hit = pd.read_csv("Hitters.csv")
df = hit.copy().dropna()

# One-hot encode the three categorical columns.
categorical_cols = ["League", "Division", "NewLeague"]
dms = pd.get_dummies(df[categorical_cols])

# Target is the salary; the numeric predictors are everything except the target
# and the raw categorical columns, cast to float64.
y = df["Salary"]
X_ = df.drop(columns=["Salary", *categorical_cols]).astype("float64")

# Keep a single indicator column per categorical variable and append it to the numeric part.
kept_dummies = ["League_N", "Division_W", "NewLeague_N"]
X = pd.concat([X_, dms[kept_dummies]], axis="columns")

# NOTE(review): an integer `test_size` is an absolute row count, so this holds out
# exactly 25 rows (not 25% of the data). Use 0.25 if a proportional split was intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=25,
    random_state=42,
)

In [8]:
# One-time setup (run once in the environment, then leave commented out):
# conda install -c conda-forge lightgbm

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/batuhanbilgili/opt/anaconda3

  added / updated specs:
    - lightgbm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2022.12.7  |       h4653dfc_0         142 KB  conda-forge
    certifi-2022.12.7          |     pyhd8ed1ab_0         147 KB  conda-forge
    lightgbm-3.3.5             |   py39h313beb8_0         978 KB
    openssl-1.1.1t             |       h03a7124_0         1.5 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.8 MB

The following NEW packages will be INSTALLED:

  lightgbm           pkgs/main/osx-arm64::lightgbm-3.3.5-py39h313beb8_0 

The following packages will be SUPERSEDED by a higher-prio

In [3]:
from lightgbm import LGBMRegressor

In [4]:
# Baseline model: LightGBM regressor with all-default hyperparameters,
# trained on the training split.
lgbm = LGBMRegressor()
lgbm.fit(X=X_train, y=y_train)

## Tahmin

In [6]:
# Predict salaries for the held-out rows with the baseline model.
# `num_iteration` is intentionally not passed: the baseline was fitted without early
# stopping, so there is no best iteration to select, and `predict` already falls back
# to the best iteration on its own whenever one exists.
y_pred = lgbm.predict(X_test)

In [7]:
# Root-mean-squared error of the baseline predictions on the test split.
baseline_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print("Tahmin hatası: ", baseline_rmse)

Tahmin hatası:  275.676322780286


## Model Tuning

In [10]:
# Candidate hyperparameters for the exhaustive search:
# 4 learning rates x 8 depths x 6 ensemble sizes = 192 combinations.
lgbm_grid = dict(
    learning_rate=[0.01, 0.1, 0.5, 1],
    max_depth=list(range(1, 9)),
    n_estimators=[20, 40, 100, 200, 500, 1000],
)

# A fresh, unfitted estimator used only as the template for the search. It is bound to
# its own name (`lgmb`), so the already-fitted baseline `lgbm` is left untouched.
lgmb = LGBMRegressor()

# 10-fold cross-validated grid search on all available cores, with per-fit progress output.
# No `scoring` is given, so candidates are ranked by the estimator's own `score` method
# (R^2 for scikit-learn style regressors), not by RMSE.
lgbm_cv = GridSearchCV(
    estimator=lgmb,
    param_grid=lgbm_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [11]:
# Run the full grid search on the training split (192 candidates x 10 folds = 1920 fits).
lgbm_cv.fit(X=X_train, y=y_train)

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


In [12]:
# Display the hyperparameter combination that scored best in the cross-validated search.
lgbm_cv.best_params_

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 1000}

In [13]:
# Build the tuned model directly from the grid-search winner instead of re-typing the
# values by hand, so it cannot drift out of sync if the grid, the data or the split changes.
# In the recorded run the winner was learning_rate=0.01, max_depth=2, n_estimators=1000.
lgbm_tuned = LGBMRegressor(**lgbm_cv.best_params_)

# Train the tuned model on the training split.
lgbm_tuned.fit(X_train, y_train)

In [14]:
# Predict the held-out rows with the tuned model (this overwrites the baseline `y_pred`).
y_pred = lgbm_tuned.predict(X=X_test)

# Root-mean-squared error of the tuned predictions on the test split.
# NOTE(review): the test split has only 25 rows, so comparisons against the
# baseline error are noisy.
tuned_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print("Tahmin hatası: ", tuned_rmse)

Tahmin hatası:  286.9430670495092
