In [3]:
import json
import lightgbm as lgb
import pandas as pd
import scipy.io as scio
from sklearn.metrics import auc, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import matplotlib as mpl

In [4]:
%matplotlib inline
plt.rcParams['figure.figsize'] = 16,6
plt.rcParams['xtick.color'] = 'w'  
plt.rcParams['ytick.color'] = 'w'  
mpl.style.use('ggplot')

In [5]:
data_path = r"../data/data.mat"
print("load data")
data = scio.loadmat(data_path)

X_train = data['datainptrain']
y_train = data['dataouttrain']
X_test = data['datainptest']
y_test = data['dataouttest']

load data


In [6]:
y_test.shape

(4165, 5)

In [7]:
list(y_test[1])

[1, 0, 0, 0, 0]

In [8]:
#change labels from one-hot to number
y_train_new = [[] for i in range(y_train.shape[0])]
for i in range(y_train.shape[0]):
    if list(y_train[i]) == [1,0,0,0,0]:
        y_train_new[i] = 1
    if list(y_train[i]) == [0,1,0,0,0]:
        y_train_new[i] = 2
    if list(y_train[i]) == [0,0,1,0,0]:
        y_train_new[i] = 3
    if list(y_train[i]) == [0,0,0,1,0]:
        y_train_new[i] = 4    
    if list(y_train[i]) == [0,0,0,0,1]:
        y_train_new[i] = 5

y_test_new = [[] for i in range(y_test.shape[0])]
for i in range(y_test.shape[0]):
    if list(y_test[i]) == [1,0,0,0,0]:
        y_test_new[i] = 1
    if list(y_test[i]) == [0,1,0,0,0]:
        y_test_new[i] = 2
    if list(y_test[i]) == [0,0,1,0,0]:
        y_test_new[i] = 3
    if list(y_test[i]) == [0,0,0,1,0]:
        y_test_new[i] = 4    
    if list(y_test[i]) == [0,0,0,0,1]:
        y_test_new[i] = 5
        
del y_test
del y_train
y_test = y_test_new
y_train = y_train_new

In [14]:
?lgb.LGBMClassifier

In [17]:
estimator = lgb.LGBMClassifier(
    learning_rate=0.125, metric='l1', n_estimators=20, num_leaves=38, objective='multiclass')

param_grid = {
    'n_estimators': [x for x in range(20, 36, 2)],
    'learning_rate': [0.10, 0.125, 0.15, 0.175, 0.2]
}
gridsearch = GridSearchCV(estimator, param_grid)

gridsearch.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=['xentlambda'])

LightGBMError: Multiclass objective and metrics don't match

In [None]:
print('Best parameters found by grid search are:', gridsearch.best_params_)

In [None]:

gbm = lgb.LGBMClassifier(learning_rate = 0.125, metric = 'l1', 
                        n_estimators = 20)


gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=['auc', 'binary_logloss'],
early_stopping_rounds=5)


In [None]:
ax = lgb.plot_importance(gbm, height = 0.4, 
                         max_num_features = 25, 
                         xlim = (0,100), ylim = (0,23), 
                         figsize = (10,6))
plt.show()


In [None]:
# For each feature of our dataset, the result of the following
# code snippet contains numbers of times a feature is used in a model.
sorted(gbm.feature_importances_,reverse=True)

In [None]:
# The code below aims to find where at least 85% 
temp = 0 
total = sum(gbm.feature_importances_)
for feature in sorted(gbm.feature_importances_, reverse=True):
    temp+=feature
    if temp/total >= 0.85:
        print(feature,temp/total) # stop when we 
        break

In [None]:
y_pred_prob = gbm.predict_proba(X_test)[:, 1]
auc_roc_0=str(roc_auc_score(y_test, y_pred_prob)) # store AUC score without dimensionality reduction
print('AUC without dimensionality reduction: \n' + auc_roc_0)

In [None]:
#We can choose to drop the last 6 features from in our new model to reduce dimensionality, and thus save training time and space

X = X.drop(['SEX','BILL_AMT5','MARRIAGE','EDUCATION','BILL_AMT6','BILL_AMT4'], axis=1)

In [None]:
# Remake our test/train set with our reduced dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=21)

reduc_estimator = lgb.LGBMClassifier(learning_rate = 0.125, metric = 'l1', 
                        n_estimators = 20, num_leaves = 38)

# Parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [x for x in range(20, 36, 2)],
    'learning_rate': [0.10, 0.125, 0.15, 0.175, 0.2]}

gridsearch = GridSearchCV(reduc_estimator, param_grid)

gridsearch.fit(X_train, y_train,
        eval_set = [(X_test, y_test)],
        eval_metric = ['auc', 'binary_logloss'],
        early_stopping_rounds = 5)
print('Best parameters found by grid search are:', gridsearch.best_params_)



In [None]:
gbm = lgb.LGBMClassifier(learning_rate = 0.1, metric = 'l1', 
                        n_estimators = 20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=['auc', 'binary_logloss'],
early_stopping_rounds=5)

In [None]:
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
print('The accuracy of prediction is:', accuracy_score(y_test, y_pred))
print('The roc_auc_score of prediction is:', roc_auc_score(y_test, y_pred))
print('The null acccuracy is:', max(y_test.mean(), 1 - y_test.mean()))

In [None]:
y_pred_prob = gbm.predict_proba(X_test)[:, 1]

In [None]:
y_pred_prob

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title('ROC curve for credit card defaulting classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.grid(True)

In [None]:
auc_roc_1 = str(roc_auc_score(y_test, y_pred_prob))
print('AUC with dimensionality reduction: \n' + auc_roc_1)
print('AUC without dimensionality reduction: \n' + auc_roc_0)

In [None]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test, y_pred))