In [31]:
import pandas as pd
import numpy as np
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

#### To load a LibSVM (zero-based) text file or a LightGBM binary file into Dataset:

In [32]:
train_data = lgb.Dataset('train.svm.bin')

In [33]:
train_data

<lightgbm.basic.Dataset at 0x20a5badfbc8>

#### To load a numpy array into Dataset:

In [34]:
data = np.random.rand(500, 10)  # 500 entities, each contains 10 features
label = np.random.randint(2, size=500)  # binary target
train_data = lgb.Dataset(data, label=label)

In [35]:
data.shape

(500, 10)

In [36]:
label.shape

(500,)

In [37]:
train_data

<lightgbm.basic.Dataset at 0x20a5b71ba88>

#### To load a scipy.sparse.csr_matrix array into Dataset:

In [38]:
import scipy
csr = scipy.sparse.csr_matrix((data, (row, col)))
train_data = lgb.Dataset(csr)

NameError: name 'row' is not defined

In [39]:
train_data = lgb.Dataset('train.svm.txt')
train_data.save_binary('train.bin')

LightGBMError: Cannot open data file train.svm.txt

In [40]:
validation_data = lgb.Dataset('validation.svm', reference=train_data)

In [41]:
train_data = lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3'])

In [42]:
train_data

<lightgbm.basic.Dataset at 0x20a5b70e048>

In [43]:
validation_data.categorical_feature

'auto'

In [44]:
#booster parameters:
param = {'num_leaves': 31, 'objective': 'binary'}
param['metric'] = 'auc'

In [45]:
param

{'num_leaves': 31, 'objective': 'binary', 'metric': 'auc'}

In [46]:
param['metric'] = ['auc', 'binary_logloss']

In [47]:
param

{'num_leaves': 31, 'objective': 'binary', 'metric': ['auc', 'binary_logloss']}

In [48]:
#training:
num_round = 10
bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data])

ValueError: Length of feature_name(3) and num_feature(10) don't match

# Gradient Boosting with Scikit-Learn

In [49]:
# check scikit-learn version
import sklearn
print(sklearn.__version__)

0.23.2


### Classification dataset

In [50]:
# test classification dataset
from sklearn.datasets import make_classification
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=0)
# summarize the dataset
print(X.shape, y.shape)

(1000, 10) (1000,)


### Regression dataset

In [55]:
# test regression dataset
from sklearn.datasets import make_regression
# define dataset
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, random_state=1)
# summarize the dataset
print(X.shape, y.shape)

(1000, 10) (1000,)


### Gradient Boosting Machine for Classification

In [57]:
# gradient boosting for classification in scikit-learn
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
# evaluate the model
model = GradientBoostingClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
model = GradientBoostingClassifier()
model.fit(X, y)
# make a single prediction
row = [[2.56999479, -0.13019997, 3.16075093, -4.35936352, -1.61271951, -1.39352057, -2.48924933, -1.93094078, 3.26130366, 2.05692145]]
yhat = model.predict(row)
print('Prediction: %d' % yhat[0])

Accuracy: 0.914 (0.026)
Prediction: 1


### Gradient Boosting Machine for Regression

In [60]:
# gradient boosting for regression in scikit-learn
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot
# define dataset
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, random_state=1)
# evaluate the model
model = GradientBoostingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
model = GradientBoostingRegressor()
model.fit(X, y)
# make a single prediction
row = [[2.02220122, 0.31563495, 0.82797464, -0.30620401, 0.16003707, -1.44411381, 0.87616892, -0.50446586, 0.23009474, 0.76201118]]
yhat = model.predict(row)
print('Prediction: %.3f' % yhat[0])

MAE: -11.874 (1.121)
Prediction: -80.661


# Gradient Boosting With LightGBM

In [63]:
# check lightgbm version
import lightgbm
print(lightgbm.__version__)

3.0.0


### LightGBM for Classification

In [64]:
# lightgbm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
# evaluate the model
model = LGBMClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
model = LGBMClassifier()
model.fit(X, y)
# make a single prediction
row = [[2.56999479, -0.13019997, 3.16075093, -4.35936352, -1.61271951, -1.39352057, -2.48924933, -1.93094078, 3.26130366, 2.05692145]]
yhat = model.predict(row)
print('Prediction: %d' % yhat[0])

Accuracy: 0.934 (0.021)
Prediction: 1


### LightGBM for Regression

In [65]:
# lightgbm for regression
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot
# define dataset
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, random_state=1)
# evaluate the model
model = LGBMRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
model = LGBMRegressor()
model.fit(X, y)
# make a single prediction
row = [[2.02220122, 0.31563495, 0.82797464, -0.30620401, 0.16003707, -1.44411381, 0.87616892, -0.50446586, 0.23009474, 0.76201118]]
yhat = model.predict(row)
print('Prediction: %.3f' % yhat[0])

MAE: -12.739 (1.408)
Prediction: -82.040
