Traffic sign recognition system

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', None)
from glob import glob
import time
from IPython.display import display

import cv2
from cv2 import imread
from cv2 import resize as Resize
from skimage.io import imread
from skimage.transform import resize
from sklearn.cluster import MiniBatchKMeans

from sklearn.model_selection import train_test_split
from sklearn import metrics as metrics
from sklearn.model_selection import cross_validate
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Seed NumPy's global random generator; add_noise() draws from it through
# np.random.normal, so the injected noise is repeatable for a fixed run order.
np.random.seed(42)

### ML functions

In [None]:
def myTimer(func):
    '''
    Decorator that reports how long each call to ``func`` takes.

    The wrapped function is called with the caller's positional and keyword
    arguments, the elapsed time is printed as 'computation time = <seconds> s'
    and the return value of ``func`` is handed back unchanged.
    If ``func`` raises, the exception propagates and nothing is printed.
    '''
    import functools  # local import: not provided by the notebook's import cell

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not disturbed by
        # adjustments of the system clock during long fits.
        t = time.perf_counter()
        ret = func(*args, **kwargs)
        print('computation time = %s s' % (time.perf_counter() - t))
        return ret
    return wrapper

@myTimer
def doML(alg, settings, XTrain, YTrain, XTest, YTest, title=None):
    '''
    Fit one classifier and print its classification report.

    alg            : estimator class (not an instance); built as alg(**settings)
    settings       : dict of keyword arguments for the estimator constructor
    XTrain, YTrain : samples and labels the estimator is fitted on
    XTest, YTest   : samples and labels the printed report is computed on
    title          : optional heading; a separator line is printed when omitted
    Returns the fitted estimator.
    '''
    separator = '==========================================================================='

    # Heading, followed by what is about to be trained and with which settings.
    print('====   %s   ====' % title if title else separator)
    print(alg)
    print(settings)

    model = alg(**settings)
    model.fit(XTrain, YTrain)

    predictions = model.predict(XTest)
    print(metrics.classification_report(YTest, predictions))
    print(separator)
    return model

@myTimer
def doCV(alg, grid, X, Y, title=None):
    '''
    Run a GridSearchCV over ``grid`` and display the table of results.

    alg   : estimator class (not an instance); instantiated with its defaults
    grid  : parameter grid handed to GridSearchCV
    X, Y  : samples and labels used for the search
    title : optional heading; a separator line is printed when omitted
    Returns the fitted GridSearchCV object.
    '''
    heading = '%s' % title if title else '==========================================================================='
    print(heading)

    search = GridSearchCV(alg(), grid)
    search.fit(X, Y)

    # One row per parameter combination, rendered as a rich table.
    display(pd.DataFrame(search.cv_results_))
    return search

### Pre-Processing Pipeline
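Each image goes through the following steps (see `transformIMG`):

1. read one row of the pandas DataFrame and load the image it references
2. increase the brightness
3. crop to the annotated region of interest (ROI)
4. add Gaussian noise
5. rescale to `nuShape`
6. return the grayscale or colour image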

In [None]:
def increase_brightness(img, value=20):
    '''
    Shift the brightness of an RGB image by ``value``, saturating at 0 and 255.

    The image is converted to HSV, ``value`` is added to the V channel and the
    result is converted back to RGB. A negative ``value`` darkens the image.

    img   : RGB image; assumed to be 8-bit (uint8) -- TODO confirm for other inputs
    value : offset added to the V channel
    Returns the adjusted image in RGB.
    '''
    hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    h, s, v = cv2.split(hsv)

    # Add in a wider signed type and clip back. In-place uint8 arithmetic only
    # works for non-negative offsets; this form also saturates at 0 instead of
    # wrapping around when value is negative.
    v = np.clip(v.astype(np.int32) + value, 0, 255).astype(np.uint8)

    final_hsv = cv2.merge((h, s, v))
    return cv2.cvtColor(final_hsv, cv2.COLOR_HSV2RGB)

def add_noise(img, mean=0, sigma=5):
    '''
    Perturb an image with additive Gaussian noise.

    img   : array of pixel values
    mean  : mean of the noise distribution
    sigma : standard deviation of the noise distribution
    Returns a uint8 array of the same shape, with values clipped to [0, 255].
    '''
    # One independent sample per pixel, drawn from NumPy's global generator.
    noise = np.random.normal(mean, sigma, img.shape)
    perturbed = img + noise
    return np.clip(perturbed, 0, 255).astype(np.uint8)

def transformIMG(pandasRow, newShape=(50, 50), color=False, brightness=40, noiseSigma=50):
    '''
    Load and pre-process the image referenced by one row of the data table.

    Steps: read file -> RGB -> brighten -> crop to the region of interest ->
    add Gaussian noise -> resize -> optionally convert to grayscale.

    pandasRow  : row providing 'Path', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2'
    newShape   : target size, passed to cv2.resize as dsize (OpenCV reads dsize
                 as (width, height); irrelevant for square targets)
    color      : if true return the RGB image, otherwise the grayscale image
    brightness : offset handed to increase_brightness
    noiseSigma : standard deviation handed to add_noise
    Raises FileNotFoundError if the image file cannot be read.
    '''
    path = '../input/gtsrb-german-traffic-sign/' + pandasRow['Path']

    # Read explicitly with OpenCV. The bare name `imread` is imported from both
    # cv2 and skimage.io in this notebook, and the later skimage import (which
    # yields RGB) wins; applying BGR->RGB to that swapped red and blue, so every
    # later "RGB" step, including the gray weighting, saw the wrong channels.
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        raise FileNotFoundError('could not read image: %s' % path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    imgBright = increase_brightness(img, value=brightness)
    imgROI = imgBright[pandasRow['Roi.Y1']:pandasRow['Roi.Y2'], pandasRow['Roi.X1']:pandasRow['Roi.X2']]

    # Adding noise to disrupt features
    imgNoisy = add_noise(imgROI, sigma=noiseSigma)

    imgTrafo = cv2.resize(imgNoisy, newShape, interpolation=cv2.INTER_LINEAR)
    if color:
        return imgTrafo
    return cv2.cvtColor(imgTrafo, cv2.COLOR_RGB2GRAY)

def prepData(pdDataFrame, nuShape, col=False):
    '''
    Pre-process every image listed in pdDataFrame and collect features and labels.

    Only pixels inside a centred circular mask are kept. The mask is built
    from nuShape[0] alone, i.e. a square target size is assumed.

    pdDataFrame : table with one image per row, including a 'ClassId' column
    nuShape     : target image size handed to transformIMG
    col         : keep colour information if true, otherwise use grayscale
    Returns (X, Y) as numpy arrays. For grayscale each entry of X is the 1-D
    vector of masked pixels; for colour each entry has one row per masked
    pixel and one column per channel.
    '''
    side = nuShape[0]
    rows, cols = np.ogrid[0:side, 0:side]
    rows, cols = rows - int(.5*side), cols - int(.5*side)
    # Boolean disc of diameter `side`; identical for every image.
    inside = rows**2 + cols**2 <= .25*side**2

    features, labels = [], []
    for k in range(pdDataFrame.shape[0]):
        record = pdDataFrame.iloc[k]
        img = transformIMG(record, newShape=nuShape, color=col)
        # Boolean indexing over the two spatial axes works for both the
        # 2-D grayscale and the 3-D colour image.
        features.append(img[inside])
        labels.append(record['ClassId'])
    return np.asarray(features), np.asarray(labels)

### Load full data set

In [None]:
# Target image size used by the pre-processing.
nuShape = (20,20)
# Training table: one row per image.
data = pd.read_csv('../input/gtsrb-german-traffic-sign/Train.csv')
X, Y = prepData(pdDataFrame=data, nuShape=nuShape)

### Reduce feature dimensionality using PCA

In [None]:
# Project the pixel features in X onto 100 principal components.
# NOTE(review): the PCA is fitted on every sample in X, so any evaluation that
# consumes XPCA (e.g. cross-validation on XPCA) uses components that were
# learned partly from its own validation samples.
myPCA = PCA(n_components=100)
XPCA = myPCA.fit_transform(X)

In [None]:
# Split the raw pixel features first and fit the PCA on the training part only,
# so the held-out samples do not influence the projection they are scored with.
# random_state and the number of samples are unchanged, so the same rows end up
# in the training and test parts as when splitting XPCA directly.
XRawTrain, XRawTest, YFTrain, YFTest = train_test_split(X, Y, test_size=.55, random_state=42)
splitPCA = PCA(n_components=myPCA.n_components).fit(XRawTrain)
XFTrain, XFTest = splitPCA.transform(XRawTrain), splitPCA.transform(XRawTest)

### quick check of random forest and MLP: computation time and first scores with default settings

In [None]:
# Default random forest; only the number of parallel jobs is set.
settingsRFC = dict(n_jobs=8)
o = doML(
    RandomForestClassifier, settingsRFC,
    XFTrain, YFTrain, XFTest, YFTest,
    title='Random Forest Classifier',
)

====   Random Forest Classifier   ====
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
{'n_jobs': 8}
              precision    recall  f1-score   support

           0       1.00      0.20      0.33       112
           1       0.69      0.78      0.73      1215
           2       0.64      0.75      0.69      1231
           3       0.58      0.49      0.53       767
           4       0.65      0.70      0.67      1109
           5       0.42      0.47      0.44      1027
           6       0.68      0.84      0.75       226
           7       0.68      0.54      0.60       797
           8       0.52      0.46      0.49       794
           9       0.95      0.85      0.89       860
          10       0.72      0.90      0.80      1067
          11       0.71      0.86      0.78       673
          12       0.78      0.94      0.85      1168
          13       0.89      0.95      0.92      1151
          14       0.99      0.94      0.96       429
          15       0.96 

In [None]:
# Default MLP: no constructor arguments, no title (a separator line is printed instead).
settingsMLP = dict()
o = doML(MLPClassifier, settingsMLP, XTrain=XFTrain, YTrain=YFTrain, XTest=XFTest, YTest=YFTest)

<class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>
{}
              precision    recall  f1-score   support

           0       0.79      0.72      0.76       112
           1       0.79      0.76      0.77      1215
           2       0.69      0.79      0.73      1231
           3       0.58      0.63      0.60       767
           4       0.70      0.72      0.71      1109
           5       0.57      0.49      0.53      1027
           6       0.75      0.76      0.75       226
           7       0.67      0.62      0.64       797
           8       0.63      0.58      0.61       794
           9       0.89      0.85      0.87       860
          10       0.81      0.86      0.84      1067
          11       0.81      0.83      0.82       673
          12       0.89      0.86      0.88      1168
          13       0.93      0.94      0.94      1151
          14       0.99      0.95      0.97       429
          15       0.82      0.82      0.82       333
        

### quick CV of all methods using default settings

In [None]:
# Single-point grid: cross-validates the default random forest.
gridRFC = dict(n_jobs=[8])
o = doCV(alg=RandomForestClassifier, grid=gridRFC, X=XPCA, Y=Y, title='Random Forest')

Random Forest


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_jobs,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,59.951654,3.778309,0.415855,0.071491,8,{'n_jobs': 8},0.616297,0.682351,0.641928,0.635042,0.668155,0.648755,0.023636,1


computation time = 375.4058954715729 s


In [None]:
# Empty grid: cross-validates the MLP with its default settings.
gridMLP = dict()
o = doCV(alg=MLPClassifier, grid=gridMLP, X=XPCA, Y=Y, title='MLP')

MLP


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,56.206551,11.236441,0.032982,0.007687,{},0.684392,0.750701,0.711426,0.725835,0.74238,0.722947,0.023561,1


computation time = 334.5479383468628 s


### Best RFC after Grid Search

In [None]:
# Despite its name this is a flat settings dict (one value per parameter),
# passed to doML as constructor arguments.
gridRFC = {
    'n_estimators': 200,
    'max_depth': None,
    'min_samples_split': 10,
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': False,
    'oob_score': False,
    'class_weight': 'balanced',
    'ccp_alpha': 0.0,
    'max_samples': None,
}
o = doML(
    RandomForestClassifier, gridRFC,
    XFTrain, YFTrain, XFTest, YFTest,
    title='Best Random Forest',
)

====   Best Random Forest   ====
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
{'n_estimators': 200, 'max_depth': None, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'bootstrap': False, 'oob_score': False, 'class_weight': 'balanced', 'ccp_alpha': 0.0, 'max_samples': None}
              precision    recall  f1-score   support

           0       0.90      0.41      0.56       112
           1       0.80      0.75      0.78      1215
           2       0.78      0.71      0.74      1231
           3       0.61      0.60      0.60       767
           4       0.73      0.73      0.73      1109
           5       0.51      0.47      0.49      1027
           6       0.71      0.93      0.80       226
           7       0.75      0.62      0.67       797
           8       0.54      0.61      0.57       794
           9       0.95      0.86      0.90       860
          10       0.79      0.89      0.84      1067
         

In [None]:
# Single-point grid: cross-validates exactly one random-forest configuration.
gridRFC = {
    'n_estimators': [200],
    'max_depth': [None],
    'min_samples_split': [10],
    'max_features': ['sqrt'],
    'max_leaf_nodes': [None],
    'min_impurity_decrease': [0.0],
    'bootstrap': [False],
    'oob_score': [False],
    'class_weight': ['balanced'],
    'ccp_alpha': [0.0],
    'max_samples': [None],
}
o = doCV(alg=RandomForestClassifier, grid=gridRFC, X=XPCA, Y=Y, title='best RFC')

best RFC


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_ccp_alpha,param_class_weight,param_max_depth,param_max_features,param_max_leaf_nodes,param_max_samples,param_min_impurity_decrease,param_min_samples_split,param_n_estimators,param_oob_score,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,259.529514,3.179032,0.821562,0.030516,False,0.0,balanced,,sqrt,,,0.0,10,200,False,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.650854,0.704795,0.658761,0.660418,0.690728,0.673111,0.020864,1


computation time = 1632.4902772903442 s


### best MLP after Grid Search

In [None]:
# Single-point grid: cross-validates one MLP configuration.
# (500,) with the trailing comma is a one-element tuple, i.e. one hidden layer
# of 500 units; the former (500) was just the integer 500 in parentheses.
gridMLP = {
    'hidden_layer_sizes': [(500,)],
    'activation': ['relu'],
    'max_iter': [2000],
    'solver': ['adam'],
    'batch_size': [100],
    'learning_rate': ['constant'],
}
o = doCV(MLPClassifier, gridMLP, XPCA, Y, title='MLP')

NameError: name 'doCV' is not defined

In [None]:
# MLP configuration evaluated on the hold-out split.
# (500,) with the trailing comma is a one-element tuple, i.e. one hidden layer
# of 500 units; the former (500) was just the integer 500 in parentheses.
settingsMLP = {
    'hidden_layer_sizes': (500,),
    'activation': 'relu',
    'max_iter': 2000,
    'solver': 'adam',
    'batch_size': 100,
    'learning_rate': 'constant',
}
res = doML(MLPClassifier, settingsMLP, XFTrain, YFTrain, XFTest, YFTest, title='best MLP')

In [None]:
# Refit the MLP configuration on the complete data set.
# (500,) with the trailing comma is a one-element tuple, i.e. one hidden layer
# of 500 units; the former (500) was just the integer 500 in parentheses.
settingsMLP = {
    'hidden_layer_sizes': (500,),
    'activation': 'relu',
    'max_iter': 2000,
    'solver': 'adam',
    'batch_size': 100,
    'learning_rate': 'constant',
}
# The same data is passed as training and test set, so the printed report is
# training-set performance, not a generalisation estimate. The title says so
# to keep this output distinguishable from the hold-out evaluation.
bestMLP = doML(MLPClassifier, settingsMLP, XPCA, Y, XPCA, Y,
               title='best MLP (refit on all data; report is training-set performance)')