In [102]:
# Imports required for Machine Learning
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout, Conv2D, MaxPooling2D
import numpy as np
import keras.utils
import keras

# Reading from table-structured data easily
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Libraries we use for image / data augmentation
from keras.preprocessing.image import ImageDataGenerator
from random import randint

%matplotlib inline

In [103]:
# Every CSV used by this notebook lives in the same directory, so the shared
# prefix is spelled once and the individual file names are appended to it.
dataDir = "/Users/CollinHeist/Dropbox/Machine Learning/data/UI Data Science/"

trainData   = dataDir + "trainData.csv"    # training images, one flattened image per row
trainLabels = dataDir + "trainLabels.csv"  # class label for each training row
testData    = dataDir + "testData.csv"     # test expressions, one flattened image per row
correctData = dataDir + "correct.csv"      # previously verified answers
outputPath  = dataDir + "output.csv"       # where the predictions are written

In [104]:
def obtainTrainLabels(path):
    """Read the training labels CSV and return them as a 2D numpy array.

    Parameters:
        path: location of a CSV file that contains an "index" column plus the
              label column(s).

    Returns:
        A numpy array with one row per CSV row and one column per remaining
        CSV column (the "index" column is dropped).

    Raises:
        KeyError: if the CSV has no "index" column.
    """
    df = pd.read_csv(path)
    df = df.drop(["index"], axis = 1) # Drop the counting column

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same
    # ndarray and exists in both old and new pandas releases.
    return df.values

In [105]:
def obtainTestData(path):
    """Load the CSV at `path` and return it as a dataframe without its "index" column."""
    rawFrame = pd.read_csv(path)
    return rawFrame.drop(labels = ["index"], axis = "columns")

In [106]:
def obtainData(path):
    """Load the CSV at `path` and return it as a dataframe without its "index" column."""
    withIndex = pd.read_csv(path)
    withoutIndex = withIndex.drop(labels = ["index"], axis = "columns")
    return withoutIndex

In [107]:
def formatTestData(dataFrame):
    """Turn each dataframe row into a 2D 24x120 image of a full expression.

    Parameters:
        dataFrame: dataframe whose rows each hold 24 * 120 = 2880 values.

    Returns:
        A list with one numpy array of shape (24, 120) per dataframe row.

    Raises:
        ValueError: if a row cannot be reshaped to 24x120.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same
    # ndarray and exists in both old and new pandas releases.
    testMatrix = dataFrame.values

    # One 24x120 array per row
    return [row.reshape(24, 120) for row in testMatrix]

In [108]:
def formatData(dataFrame):
    """Turn each dataframe row into a 2D 24x24 image of a single symbol.

    Parameters:
        dataFrame: dataframe whose rows each hold 24 * 24 = 576 values.

    Returns:
        A list with one numpy array of shape (24, 24) per dataframe row.

    Raises:
        ValueError: if a row cannot be reshaped to 24x24.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same
    # ndarray and exists in both old and new pandas releases.
    trainMatrix = dataFrame.values

    # One 24x24 array per row
    return [row.reshape(24, 24) for row in trainMatrix]

In [109]:
def splitDigitLists(trainMatrix, trainLabels):
    """Separate the operator symbols (+, -, =) from the digits.

    Every sample whose label is >= 10 is treated as a symbol, so that a
    separate model can be trained on symbols only.

    Parameters:
        trainMatrix: list of per-sample image arrays. It is MODIFIED IN PLACE:
                     the symbol entries are deleted, leaving only the digits.
        trainLabels: numpy array of labels indexed as trainLabels[row][0].

    Returns:
        A tuple (sList, sLabels, newLabels) of numpy arrays:
            sList     - image data of the symbol samples
            sLabels   - symbol labels shifted down by 10 (10, 11, 12 -> 0, 1, 2)
            newLabels - labels of the remaining (digit) samples, shape (n, 1)
    """
    # Row positions of every label that marks a symbol
    indexList = [position[0]
                 for position, label in np.ndenumerate(trainLabels)
                 if label >= 10]

    # Grab the data of all the symbols and put them in lists
    symbolList   = [trainMatrix[index] for index in indexList]
    symbolLabels = [trainLabels[index][0] for index in indexList]

    # Labels of everything that is not a symbol, to override the old trainLabels.
    # A set keeps the membership test O(1); scanning the list made this loop O(n^2).
    symbolRows = set(indexList)
    newTrainLabels = [trainLabels[i][0]
                      for i in range(trainLabels.shape[0])
                      if i not in symbolRows]
    newLabels = np.array(newTrainLabels).reshape(len(newTrainLabels), 1)

    # Delete the symbol arrays from the trainMatrix
    # Erase the latter elements first so the earlier indexes stay valid
    for index in reversed(indexList):
        del trainMatrix[index]

    # Convert the lists of labels and image data to numpy arrays
    sList   = np.array(symbolList)
    sLabels = np.array(symbolLabels)
    sLabels = np.subtract(sLabels, 10) # Shift the 10, 11, and 12 values to 0, 1, 2 for one hot encoding

    return (sList, sLabels, newLabels)

In [250]:
def createModel(isDigit):
    """Build and compile a convolutional Sequential model for 24x24x1 images.

    Parameters:
        isDigit: truthy to build the digit classifier (one extra convolution
                 stage and a 10-way softmax output); falsy to build the symbol
                 classifier (3-way softmax output).

    Returns:
        The model, compiled with categorical cross-entropy loss, an SGD
        optimizer (lr = 0.0075) and the "accuracy" metric.
    """
    poolArgs = dict(pool_size = (2, 2), strides = (2, 2))
    net = Sequential()

    # Input stage: the only convolution that is given no `padding` argument
    net.add(Conv2D(64, kernel_size = (5, 5), strides = (1, 1), activation = "relu",
                   input_shape = (24, 24, 1)))
    net.add(MaxPooling2D(**poolArgs))
    net.add(Dropout(0.375))

    # Remaining stages as (filters, kernel size, dropout rate). They use 'same'
    # padding so that only the pooling layers shrink the image.
    stageSpecs = [(128, (5, 5), 0.425),
                  (256, (3, 3), 0.375)]
    if isDigit:
        # The digit model gets one additional stage
        stageSpecs.append((128, (3, 3), 0.275))

    for filters, kernel, dropRate in stageSpecs:
        net.add(Conv2D(filters, kernel_size = kernel, activation = "relu", padding = "same"))
        net.add(MaxPooling2D(**poolArgs))
        net.add(Dropout(dropRate))

    # Classifier head
    net.add(Flatten())
    net.add(Dense(1024, activation = "relu"))
    net.add(Dropout(0.185))

    # The final layer is either 10 classes or 3 classes, depending on if it's a digit or not
    classCount = 10 if isDigit else 3
    net.add(Dense(classCount, activation = "softmax"))

    net.compile(loss = keras.losses.categorical_crossentropy,
                optimizer = keras.optimizers.SGD(lr = 0.0075),
                metrics = ["accuracy"])

    return net

In [243]:
def augmentImages(trainDigits, probability):
    """Overlay a dimmed, randomly chosen training digit behind some of the images.

    This is done to more closely resemble the test data, which is artificially
    altered with background numbers to decrease the accuracy of the model.

    Parameters:
        trainDigits: indexable collection of image arrays. It is MODIFIED IN
                     PLACE. The clamping below treats pixel values as lying in
                     [0, 1].
        probability: chance, in percent (0-100), that any given image is altered.
                     0 alters nothing and 100 alters every image.

    Returns:
        The same `trainDigits` object that was passed in.
    """
    # Local import: the notebook's import cell only brings in `randint`
    from random import random

    inputLength = len(trainDigits)
    for i, image in enumerate(trainDigits):
        # Draw directly against the percentage. The previous test compared a
        # random *index* with a threshold using a strict '>', so index 0 was never
        # altered (even at 100%) and small inputs were heavily biased.
        if random() * 100.0 >= probability:
            continue

        # Grab a random image from the training data. Because the collection is
        # edited in place, this may be an image that was already altered.
        bgImage = trainDigits[randint(0, inputLength - 1)]

        # Reduce the intensity of the background image - Make sure there are no negatives
        bgImage = np.maximum(np.subtract(bgImage, 175.0 / 255.0), 0)

        # Add the background image to the foreground image
        # During this addition, pixel values can become > 1, so cap them at 1
        trainDigits[i] = np.minimum(np.add(bgImage, image), 1)

    return trainDigits

In [246]:
# Training values
# Load the training images as a dataframe, then as a list of 24x24 arrays
dataframe = obtainData(trainData)
digitList = formatData(dataframe)
# Labels as a numpy matrix, one row per training image
digitLabels = obtainTrainLabels(trainLabels)
# Split the data into a list of symbols and a list of digits for separate training
# NOTE: splitDigitLists deletes the symbol images from digitList in place, so after
# this call digitList holds digits only and digitLabels is rebound to the digit labels.
# Re-running this line alone (without the lines above it) works on mismatched data.
symbolList, symbolLabels, digitLabels = splitDigitLists(digitList, digitLabels)
# Testing values
# Each test row becomes one 24x120 image of a whole expression
testDF = obtainTestData(testData)
testDL = formatTestData(testDF)

In [247]:
# Hyperparameters and the training data definition(s)
# Convert training data to shape (numberSamples, xSize, ySize, numChannels)
x_train_digits  = np.asarray(digitList).reshape(len(digitList), 24, 24, 1)
x_train_symbols = np.asarray(symbolList).reshape(len(symbolList), 24, 24, 1)
# Convert y_train to a one-hot encoded vector for each class - 10 for digits, 3 for symbols
y_train_digits  = keras.utils.to_categorical(digitLabels, 10)
y_train_symbols = keras.utils.to_categorical(symbolLabels, 3)

# Percentage chance that a digit image gets a background digit blended into it.
# augmentImages edits the array it is given in place and returns that same array.
augmentedProbability = 50
x_train_digits = augmentImages(x_train_digits, augmentedProbability)

# Training hyperparameters: samples per gradient update and number of epochs
batch_size = 64
epochs = 75

In [None]:
# Create our models, and then train them on the training data
# Uses the above-defined hyperparameters (batch_size and epochs)
dModel = createModel(True) # Convolutional network for recognizing digits
sModel = createModel(False) # Convolutional network for recognizing symbols

# Using an ImageGenerator, modify the data during the training process
# The generator randomly zooms in / out, rotates and shifts the image
# NOTE: the generator is only consumed by the commented-out fit_generator call
# below; the active training calls use the arrays directly.
zoomRange = .100   # 0-1 value that represents how much the image can be scaled
rotationRange = 24 # Maximum degree value at which to rotate the image
shiftVal = .0625    # Maximum percentage at which the image can be shifted
datagen = ImageDataGenerator(zoom_range = zoomRange,
                             rotation_range = rotationRange, 
                             width_shift_range = shiftVal, height_shift_range = shiftVal)

datagen.fit(x_train_digits)

# In-line train the model while generating new data with the ImageGenerator
# dHistory = dModel.fit_generator(
#            datagen.flow(x_train_digits, y_train_digits,
#                         batch_size = batch_size),
#            steps_per_epoch = len(x_train_digits) / batch_size,
#            epochs = epochs,
#            verbose = 1)

# The symbol model is trained for fewer epochs than the digit model
symbolEpochs = 20

# Store the history of the training in 'sHistory' / 'dHistory'
sHistory = sModel.fit(x_train_symbols, y_train_symbols,
                      batch_size = batch_size, epochs = symbolEpochs,
                      verbose = 1)
# Read the `epochs` hyperparameter instead of repeating its value as a literal,
# so that changing the hyperparameter actually changes the training run
dHistory = dModel.fit(x_train_digits, y_train_digits,
                      batch_size = batch_size, epochs = epochs,
                      verbose = 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75

In [252]:
# Look at each sub-image in each full sized equation
# Every test image is 24x120: five 24x24 tiles laid out as
# digit, symbol, digit, symbol, digit. Each tile is classified, the five guesses
# are joined into a Python expression (e.g. "3+4==7") and the expression is
# evaluated to decide whether the equation holds (1) or not (0).
d = ["+", "-", "=="] # Reference dictionary for the symbols, use == so expr() can be used
testEvaluations = [] # A list of all of our guesses - in numerical order

# Read the old, already correct values
correctEvals = []
# cDF = pd.read_csv(correctData, names = ["index", "values"])
# cDF = cDF.drop("index", axis = 1)
# try:
#     cDF = cDF.iloc[1:] # Remove first row of leading 0
#     correctList = cDF.values.tolist()
#     correctList = [x[0] for x in correctList]
# except IndexError:
#     correctList = []
# startIndex = cDF.shape[0]

evalList = []

# We use the digit model for all digits (i.e. the 1st, 3rd, and 5th 'image')
# and use the symbol model for the symbols (2nd and 4th 'image')
for c, image in enumerate(testDL):
    # Zero out faint pixels (edits the image stored in testDL in place)
    image[image < .25] = 0
    
    char1 = image[:24,   :24].reshape(1, 24, 24, 1)
    pred1 = dModel.predict_classes(char1)

    char2 = image[:24, 24:48].reshape(1, 24, 24, 1)
    pred2 = sModel.predict_classes(char2)

    char3 = image[:24, 48:72].reshape(1, 24, 24, 1)
    pred3 = dModel.predict_classes(char3)

    char4 = image[:24, 72:96].reshape(1, 24, 24, 1)
    # If the previous symbol was arithmetic (+ or -), we know the next will be an = sign.
    # The old test `pred2 is 0 or 1` was always true because of the bare `or 1`,
    # so the symbol model was never consulted for the fourth tile.
    if pred2[0] in (0, 1):
        pred4 = np.array([2])
    else: 
        pred4 = sModel.predict_classes(char4)

    char5 = image[:24, 96:  ].reshape(1, 24, 24, 1)
    pred5 = dModel.predict_classes(char5)

    # Make one array containing each guess
    guess = np.concatenate((pred1, pred2, pred3, pred4, pred5))
    # Convert each element to either its string equivalent, or the equiv. sign
    expr  = [d[x] if (i in (1, 3)) else str(x) for i, x in enumerate(guess)]
    expr = ''.join(expr) # Convert the array to one string
        
    # Evaluate the single expression - If there was a syntax error, guess a 0
    # NOTE(review): eval() is only given text assembled from the model predictions
    # above (digits and entries of `d`); never feed it externally supplied strings.
    try: # print (expr, eval(expr))
        testEvaluations.append(1 if eval(expr) else 0)
    except SyntaxError:
        testEvaluations.append(0)
    if c % 100 == 0:
        print ("Evluated.. %i" % c)

print (evalList)

# correctList.extend(correctEvals)
# print ("Corrected {:.3%} of all the data.".format(len(correctList) / 20000.0))
# correctDF = pd.DataFrame(correctList)
# correctDF.to_csv(correctData)

dtf = pd.DataFrame(testEvaluations)
# dtf.update(correctDF) # Change the first 'n' values to the corrected ones
dtf = dtf.astype(int) # Recast the datatype as an integer
dtf.to_csv(outputPath) # Change top row to ["index", "label"]

Evluated.. 0
Evluated.. 100
Evluated.. 200
Evluated.. 300
Evluated.. 400
Evluated.. 500
Evluated.. 600
Evluated.. 700
Evluated.. 800
Evluated.. 900
Evluated.. 1000
Evluated.. 1100
Evluated.. 1200
Evluated.. 1300
Evluated.. 1400
Evluated.. 1500
Evluated.. 1600
Evluated.. 1700
Evluated.. 1800
Evluated.. 1900
Evluated.. 2000
Evluated.. 2100
Evluated.. 2200
Evluated.. 2300
Evluated.. 2400
Evluated.. 2500
Evluated.. 2600
Evluated.. 2700
Evluated.. 2800
Evluated.. 2900
Evluated.. 3000
Evluated.. 3100
Evluated.. 3200
Evluated.. 3300
Evluated.. 3400
Evluated.. 3500
Evluated.. 3600
Evluated.. 3700
Evluated.. 3800
Evluated.. 3900
Evluated.. 4000
Evluated.. 4100
Evluated.. 4200
Evluated.. 4300
Evluated.. 4400
Evluated.. 4500
Evluated.. 4600
Evluated.. 4700
Evluated.. 4800
Evluated.. 4900
Evluated.. 5000
Evluated.. 5100
Evluated.. 5200
Evluated.. 5300
Evluated.. 5400
Evluated.. 5500
Evluated.. 5600
Evluated.. 5700
Evluated.. 5800
Evluated.. 5900
Evluated.. 6000
Evluated.. 6100
Evluated.. 6200
Evlu

In [None]:
# Preview the first nine training digits in a 3x3 grid.
# The augmentation settings are reassigned here; the generator that would
# consume them is left commented out.
## shiftVal = .125
rotationRange = 38
zoomRange = .185
# datagen = ImageDataGenerator(zoom_range = zoomRange,
#                              rotation_range = rotationRange, 
#                              width_shift_range = shiftVal, height_shift_range = shiftVal)
# datagen.fit(x_train_digits)
if len(x_train_digits) > 0:
    grayMap = plt.get_cmap('gray')
    for slot in range(9):
        # Grid position slot + 1 of a 3-row by 3-column layout
        plt.subplot(3, 3, slot + 1)
        plt.imshow(x_train_digits[slot].reshape(24, 24), cmap = grayMap)
    plt.show()