In [61]:
import numpy as np
import math
import os
from sklearn.model_selection import train_test_split
import cv2
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score
import random

Part 1:
Random Forest Model trained on 13,000 shapes from Honda

In [2]:
shapes = np.load('all_shapes.npy')
'''
'shapes' is a variable identical to the 'all_shapes' variable from labels.py. 
I saved the 'all_shapes' variable in a numpy file so that I could call it from within this notebook.
'''

In [3]:
def maxdim(shapes):
    maxh = 0
    maxw = 0
    for shape in shapes:
        if shape.h > maxh:
            maxh = shape.h
        if shape.w > maxw:
            maxw = shape.w
    return(maxh,maxw)

In [7]:
shapes1 = np.copy(shapes)
maxh,maxw = maxdim(shapes1)
for shape in shapes1:
    shape.pad(maxh,maxw)
    #shape.flatten()

In [8]:
#padding and flattening shapes from all_shapes.
X1 = []
for shape in shapes1:
    h, w = shape.cropped.shape
    padh = (maxh - h) / 2
    padw = (maxw - w) / 2
    padshape = np.pad(shape.cropped, ((math.floor(padh), math.ceil(padh)), (math.floor(padw), math.ceil(padw))), 'constant', constant_values = 255)
    X1.append(cv2.bitwise_not(padshape).flatten())
X1 = np.array(X1)

In [13]:
#creating labels for shapes.
y = []
for shape in shapes1:
    y.append(shape.label[0])
y = np.uint8(y)

In [38]:
#train test split on shapes and their respective labels
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X1, y, test_size=0.20, random_state=34)

In [18]:
#Hypertuning random forest model and fitting to training data
rfc = RandomForestClassifier(random_state=0) 
'''
param_grid = { 
    'n_estimators': [50, 100, 600],
    'max_depth': [3, 5, 10, 20, 40, 80, 100],
    'max_features': ['auto', 'sqrt', 'log2']
}
'''
param_grid = { 
    'n_estimators': [50],
    'max_depth': [3, 5, 10, 20, 50],
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 8)
CV_rfc.fit(Xs_train, ys_train)
print(CV_rfc.best_params_)
'''
for a variety of possible max_depths, output the one that produces the model with the best accuracy.
'''

{'max_depth': 50, 'n_estimators': 50}


In [39]:
'''
Testing our best model. 
'''
clf = RandomForestClassifier(n_estimators=100, max_depth=50, random_state=0)
#.813
clf.fit(Xs_train, ys_train)
clf.score(Xs_test, ys_test)

0.9142530487804879

Part 2:
LightGBM and Gradient Boosting Fiddling. Inconsequential. LightGBM cant do multiclass classification, and Gradient Boosting takes too long.

In [28]:
ldata = lgb.Dataset(Xs_train, label=ys_train)
gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [501], # Updated from 'seed'
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }
mdl = lgb.LightGBMClassifier()
grid = GridSearchCV(mdl, gridParams,
                    verbose=0,
                    cv=4,
                    n_jobs=2)
grid.fit(ldata, ys_train)

AttributeError: module 'lightgbm' has no attribute 'LightGBMClassifier'

In [24]:
ldata = lgb.Dataset(Xs_train, label=ys_train)

In [30]:
testdata = lgb.Dataset(Xs_test, label=ys_test)

In [27]:
#validation_data = ldata.create_valid('validation.svm')
validation_data = lgb.Dataset('validation.svm', reference=ldata)

In [31]:
param = {'num_leaves':150, 'objective':'binary','max_depth':7,'learning_rate':.05,'max_bin':200}
num_round = 10
bst = lgb.train(param, ldata, num_round, valid_sets=[testdata])

[1]	valid_0's binary_logloss: 0.316781
[2]	valid_0's binary_logloss: 0.300947
[3]	valid_0's binary_logloss: 0.287444
[4]	valid_0's binary_logloss: 0.274985
[5]	valid_0's binary_logloss: 0.264928
[6]	valid_0's binary_logloss: 0.255964
[7]	valid_0's binary_logloss: 0.247799
[8]	valid_0's binary_logloss: 0.240517
[9]	valid_0's binary_logloss: 0.233739
[10]	valid_0's binary_logloss: 0.227335


In [53]:
ys_train.shape

(10493,)

In [41]:
gbclf = GradientBoostingClassifier(learning_rate=.07, n_estimators=100, max_depth=10, random_state=0)
#.813
gbclf.fit(Xs_train, ys_train)
gbclf.score(Xs_test, ys_test)

KeyboardInterrupt: 

Part 3:
Random Forest Model built from small shapes. These small shapes (~100,000) were taken from an rcc run.

In [50]:
shapes0 = os.listdir("./0")
shapes1 = os.listdir("./1")
shapes2 = os.listdir("./2")
shapes3 = os.listdir("./3")
'''
I copied the directories from /project2/meng29511/Group4/small_labels/ from midway into my local computer. 
Here I am finding the file locations for all these images
'''

In [54]:
Xsmall,ysmall = [], []
for i, shapes in enumerate([shapes0, shapes1, shapes2, shapes3]):
    for loc in shapes:
        img = cv2.imread('./{}/'.format(i) + loc, -1)
        Xsmall.append(img)
        ysmall.append(i)
'''
getting images for each file location
'''

In [55]:
len(shapes0)

23815

In [58]:
np.array(Xsmall).shape

(112997,)

In [70]:
Xsmall1 = np.copy(Xsmall)
ysmall1 = np.copy(ysmall)

In [72]:
random.Random(4).shuffle(Xsmall1)
random.Random(4).shuffle(ysmall1)

In [103]:
subXsmall = Xsmall1[:30000]
subysmall = ysmall1[:30000]

In [86]:
maxl = 0
for img in Xsmall1:
    if len(img) > maxl:
        maxl = len(img)
#rows = [shape.padded for shape in shapes1]

In [153]:
Xsmall1 = np.array(Xsmall1)

In [161]:
l = newXsmall.reshape(newXsmall.shape[0], newXsmall.shape[1])
l.shape

(112997, 147)

In [98]:
newXsmall = []
for img in Xsmall1:
    padl = maxl - len(img)
    imgpad = np.pad(img, (0, padl), 'constant', constant_values=(255))
    newXsmall.append(cv2.bitwise_not(imgpad))

In [120]:
newXsmall = np.array(newXsmall)
subXsmall = newXsmall.reshape(newXsmall.shape[0], newXsmall.shape[1])[:30000]

In [162]:
X_train, X_test, y_train, y_test = train_test_split(newXsmall.reshape(newXsmall.shape[0], newXsmall.shape[1]), ysmall1, test_size=0.20, random_state=89)

In [163]:
clf = RandomForestClassifier(n_estimators=100, max_depth=50, random_state=55)
#.813
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9884513274336283

In [113]:
m = X_train.reshape(X_train.shape[0], X_train.shape[1])

In [125]:
for 
print(clf.predict(X_test[30:31]))
print(y_test[30:31])

[0]
[0]


In [142]:
X_test[11814:11815]

array([], shape=(0, 147), dtype=uint8)

In [144]:
test = random.sample(range(0, 6000), 10)
print(test)
for i in test:
    print(clf.predict(X_test[i:i+1]))
    print(y_test[i:i+1])
    print("-------------------------")

[5044, 1502, 4104, 5846, 4615, 1389, 3467, 913, 3286, 294]
[1]
[1]
-------------------------
[3]
[3]
-------------------------
[3]
[3]
-------------------------
[3]
[3]
-------------------------
[3]
[3]
-------------------------
[0]
[0]
-------------------------
[1]
[1]
-------------------------
[0]
[0]
-------------------------
[3]
[3]
-------------------------
[1]
[1]
-------------------------
