In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier


In [2]:
def CreateDataFrame(N):
 columns = ['a','b','c','y']
 df = pd.DataFrame(columns=columns)
 for i in range(N):
  a = np.random.randint(10)
  b = np.random.randint(20)
  c = np.random.randint(5)
  y = "normal"
  if((a+b+c)>25):
   y="high"
  elif((a+b+c)<12):
   y= "low"

  df.loc[i]= [a, b, c, y]
 return df

In [3]:
df = CreateDataFrame(200)
df.head()


Unnamed: 0,a,b,c,y
0,1,17,0,normal
1,9,11,1,normal
2,8,5,1,normal
3,6,16,3,normal
4,2,6,0,low


In [5]:
X = df[["a","b","c"]]
Y = df[["y"]]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, Y, random_state=0)

In [6]:
# build and train GradientBoostingClassifier model
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=1)
gbc.fit(Xtrain, np.ravel(ytrain, order='C'))
ypred = gbc.predict(Xtest)
print(gbc.score(Xtest, ytest))
print(confusion_matrix(ytest, ypred)) 

0.94
[[ 3  0  1]
 [ 0  9  1]
 [ 0  1 35]]


In [7]:
# find optimal learning rate value
learning_rate =  [0.01, 0.05, 0.1, 0.5, 1];
for n in learning_rate:
 gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=n, max_depth=1)
 gbc.fit(Xtrain, np.ravel(ytrain, order='C'))
 ypred = gbc.predict(Xtest)
 acc=gbc.score(Xtest, ytest) 
 print("Learning rate: ",n, "  Accuracy: ", acc)
 print("Confusion matrix:")
 print(confusion_matrix(ytest, ypred))

Learning rate:  0.01   Accuracy:  0.76
Confusion matrix:
[[ 0  0  4]
 [ 0  6  4]
 [ 0  4 32]]
Learning rate:  0.05   Accuracy:  0.84
Confusion matrix:
[[ 2  0  2]
 [ 0  6  4]
 [ 1  1 34]]
Learning rate:  0.1   Accuracy:  0.88
Confusion matrix:
[[ 2  0  2]
 [ 0  8  2]
 [ 1  1 34]]
Learning rate:  0.5   Accuracy:  0.94
Confusion matrix:
[[ 3  0  1]
 [ 0  9  1]
 [ 0  1 35]]
Learning rate:  1   Accuracy:  0.94
Confusion matrix:
[[ 4  0  0]
 [ 0  9  1]
 [ 0  2 34]]


In [8]:
# find optimal number of estimators
estimators =  [10,50,100,200,500];
for e in estimators:
 gbc = GradientBoostingClassifier(n_estimators=e, learning_rate=1, max_depth=1)
 gbc.fit(Xtrain, np.ravel(ytrain, order='C'))
 ypred = gbc.predict(Xtest)
 acc=gbc.score(Xtest, ytest) 
 print("Number of estimators: ",e, "  Accuracy: ", acc)
 print("Confusion matrix:")
 print(confusion_matrix(ytest, ypred)) 

Number of estimators:  10   Accuracy:  0.9
Confusion matrix:
[[ 3  0  1]
 [ 0  9  1]
 [ 1  2 33]]
Number of estimators:  50   Accuracy:  0.92
Confusion matrix:
[[ 3  0  1]
 [ 0  9  1]
 [ 0  2 34]]
Number of estimators:  100   Accuracy:  0.94
Confusion matrix:
[[ 4  0  0]
 [ 0  9  1]
 [ 0  2 34]]
Number of estimators:  200   Accuracy:  0.94
Confusion matrix:
[[ 4  0  0]
 [ 0  9  1]
 [ 0  2 34]]
Number of estimators:  500   Accuracy:  0.94
Confusion matrix:
[[ 4  0  0]
 [ 0  9  1]
 [ 0  2 34]]
