In [1]:
import os
import pickle
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score

In [2]:
# Model input columns, and the full set of columns to read from the CSV
# (the inputs plus RACE, which later cells use as the prediction target).
features = ['GENDER', 'AGE', 'OFFENSE', 'FACILITY', 'DETAINER', 'SENTENCE DAYS']
cols = ['RACE', *features]

# Load only the columns listed in `cols`.
df = pd.read_csv('individuals.csv', usecols=cols)

In [3]:
# Draw the training sample. A fixed seed makes the sampled rows -- and therefore
# the integer codes assigned below -- reproducible under Restart & Run All.
samp = df.sample(20000, random_state=0)

# Integer-encode the categorical columns. pd.factorize numbers distinct values
# from 0 in order of first appearance and uses -1 for missing values, so after
# the +1 shift real categories are 1..k and missing values become 0.
# NOTE: the codes depend on the rows in this particular sample; a different
# sample factorized separately is not guaranteed to get the same codes.
for col in ['RACE', 'GENDER', 'OFFENSE', 'DETAINER', 'FACILITY']:
    samp[col] = pd.factorize(samp[col])[0] + 1

# Target is the encoded RACE column; inputs are the columns listed in `features`.
samp_y = samp['RACE']
samp_X = samp[features]

In [5]:
# Grid of L1 (reg_alpha) and L2 (reg_lambda) regularisation strengths:
# 5 x 5 = 25 candidates, each evaluated with 3-fold cross-validation.
param_grid = {
    'reg_alpha': np.linspace(.1, 1, 5),
    'reg_lambda': np.linspace(.1, 1, 5),
}

# Make sure the output directory exists *before* the long-running search, so a
# missing directory cannot throw away the fitted result at pickling time.
os.makedirs('./pickles', exist_ok=True)

base_clf = lgb.LGBMClassifier(boosting_type='gbdt', learning_rate=0.23, n_estimators=100)
grid = GridSearchCV(base_clf, param_grid, cv=3, verbose=3)

# `s` / `e` are wall-clock stamps (seconds) around the search; a later cell
# reports the elapsed time from them.
s = time.time()
grid.fit(samp_X, samp_y)
e = time.time()

# Persist the whole fitted search (cv_results_, best_params_, ...).
with open('./pickles/LGBM_fitted_grid1.pickle', 'wb') as f:
    pickle.dump(grid, f)

# With the default refit=True, GridSearchCV has already refit the best
# parameter combination on all of samp_X / samp_y, so best_estimator_ is ready
# to use and does not need to be fitted again.
clf = grid.best_estimator_

with open('./pickles/LGBMClf1.pickle', 'wb') as f:
    pickle.dump(clf, f)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] reg_alpha=0.1, reg_lambda=0.1 ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... reg_alpha=0.1, reg_lambda=0.1, score=0.559, total= 1.2min
[CV] reg_alpha=0.1, reg_lambda=0.1 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


[CV] ....... reg_alpha=0.1, reg_lambda=0.1, score=0.576, total=   1.5s
[CV] reg_alpha=0.1, reg_lambda=0.1 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV] ....... reg_alpha=0.1, reg_lambda=0.1, score=0.575, total=   1.3s
[CV] reg_alpha=0.1, reg_lambda=0.325 .................................
[CV] ..... reg_alpha=0.1, reg_lambda=0.325, score=0.562, total=  35.7s
[CV] reg_alpha=0.1, reg_lambda=0.325 .................................
[CV] ..... reg_alpha=0.1, reg_lambda=0.325, score=0.570, total= 1.5min
[CV] reg_alpha=0.1, reg_lambda=0.325 .................................
[CV] ..... reg_alpha=0.1, reg_lambda=0.325, score=0.571, total=  54.5s
[CV] reg_alpha=0.1, reg_lambda=0.55 ..................................
[CV] ...... reg_alpha=0.1, reg_lambda=0.55, score=0.560, total= 1.6min
[CV] reg_alpha=0.1, reg_lambda=0.55 ..................................
[CV] ...... reg_alpha=0.1, reg_lambda=0.55, score=0.567, total= 1.5min
[CV] reg_alpha=0.1, reg_lambda=0.55 ..................................
[CV] ...... reg_alpha=0.1, reg_lambda=0.55, score=0.574, total=  39.3s
[CV] reg_alpha=0.1, reg_lambda=0.775 .................................
[CV] .

[CV] ....... reg_alpha=1.0, reg_lambda=0.1, score=0.556, total=   0.6s
[CV] reg_alpha=1.0, reg_lambda=0.1 ...................................
[CV] ....... reg_alpha=1.0, reg_lambda=0.1, score=0.566, total=   0.7s
[CV] reg_alpha=1.0, reg_lambda=0.1 ...................................
[CV] ....... reg_alpha=1.0, reg_lambda=0.1, score=0.564, total=   0.6s
[CV] reg_alpha=1.0, reg_lambda=0.325 .................................
[CV] ..... reg_alpha=1.0, reg_lambda=0.325, score=0.560, total=   0.6s
[CV] reg_alpha=1.0, reg_lambda=0.325 .................................
[CV] ..... reg_alpha=1.0, reg_lambda=0.325, score=0.572, total=   0.6s
[CV] reg_alpha=1.0, reg_lambda=0.325 .................................
[CV] ..... reg_alpha=1.0, reg_lambda=0.325, score=0.564, total=   0.7s
[CV] reg_alpha=1.0, reg_lambda=0.55 ..................................
[CV] ...... reg_alpha=1.0, reg_lambda=0.55, score=0.557, total=   0.7s
[CV] reg_alpha=1.0, reg_lambda=0.55 ..................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  8.8min finished


In [None]:
# Report the grid-search duration. `s` and `e` are time.time() stamps (seconds)
# taken before and after the fit, so dividing by 60*60 converts to hours.
print(f'time was {(e-s)/(60*60)} hours')

In [None]:
# Draw a larger sample to score the saved classifier on. The seed makes the
# sample reproducible under Restart & Run All.
samp = df.sample(200000, random_state=1)

# Integer-encode the categorical columns (factorize codes start at 0 with -1
# for missing values; the +1 shift maps missing to 0 and categories to 1..k).
# NOTE(review): factorize assigns codes in order of first appearance within
# *this* sample, so they are not guaranteed to match the codes the classifier
# was trained on. For a meaningful score the training-time category -> code
# mapping should be saved and reused here -- TODO.
# NOTE(review): this sample is drawn from the same `df` as the training sample,
# so it may contain rows the model was fitted on.
for col in ['RACE', 'GENDER', 'OFFENSE', 'DETAINER', 'FACILITY']:
    samp[col] = pd.factorize(samp[col])[0] + 1

samp_y = samp['RACE']
samp_X = samp[features]

# Open read-only in binary mode. A write mode such as 'wb+' truncates the file
# on open, which would destroy the saved model and make pickle.load raise
# EOFError. (pickle.load executes arbitrary code; only load trusted files.)
with open('./pickles/LGBMClf1.pickle', 'rb') as f:
    clf = pickle.load(f)

# Last expression of the cell so the mean accuracy is displayed.
clf.score(samp_X, samp_y)

In [9]:
# Show the hyper-parameters of whatever estimator `clf` currently refers to.
# NOTE(review): the recorded output below reports n_estimators=1000 and
# reg_alpha=reg_lambda=0.0, which does not match a classifier built with
# n_estimators=100 and searched over reg_alpha/reg_lambda in [0.1, 1]. The
# output looks stale (produced by a different `clf`); re-run after
# Restart & Run All to confirm.
clf.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.23,
 'max_depth': 0,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}