The cell below imports all necessary libraries and defines some global constants.  

In [56]:
from matplotlib.image import imread
from sklearn.model_selection import train_test_split
from sklearn.cluster import MiniBatchKMeans
import numpy as np
import pandas as pd

# each image dimension is (128, 384). After segmentation the following will be the dimensions of each character.
charHeight = 128
charWidth = 128
numImages = 50000

vectorLength = 16384

imgPath = "SoML-50/SoML-50/data/"
csvPath = "SoML-50/SoML-50/annotations.csv"

def getPathOfImg (index):
    return (imgPath + str (index) + ".jpg")

def getLabelOfImg (df,index):
    return (df.loc[df['Image'] == (str(index) + '.jpg')]['Label'].values[0])

def getValueOfImg (df, index):
    return (int(df.loc[df['Image'] == (str(index) + '.jpg')]['Value'].values[0]))


Now we divide the data set into training and testing sets

In [57]:

df = pd.read_csv (csvPath)

print (getLabelOfImg (df,5))
print (getValueOfImg (df,6))

prefix
4


The below cell takes the image number and returns the numpy array of length 3 - Ready for applying K-means clustering algorithm.

In [58]:
def vectoriseImg (image):
    return np.reshape (image, (1,-1))

def getSegmentedVectors (df,index):
    """ This function returns a numpy array of the three character images of shape (128,128) present in index.jpg after converting them into vectors 
    as required by K-Means algorithm. You cannot directly apply K-Means to image Matrices. First need to vectorise image matrix. 
    Also, the operator is always present at [0] and other two operands at [1] and [2] in the order in which the operator has to be applied."""
    
    image = imread (getPathOfImg(index))
    label = getLabelOfImg (df, index)
    if (label == 'prefix'):
        charArray = np.array ([image[:, 0:charWidth],image[:, charWidth:(2*charWidth)],image[:, (2*charWidth):]])
    elif (label == 'postfix'):
        charArray = np.array ([image[:, (2*charWidth):],image[:, 0:charWidth],image[:, charWidth:(2*charWidth)]])
    else:
        charArray = np.array ([image[:, charWidth:(2*charWidth)],image[:, 0:charWidth],image[:, (2*charWidth):]])

    ans = np.array ([vectoriseImg (charArray[i]) for i in range (3)])
    return ans

print ((getSegmentedVectors (df, 6).shape))

(3, 1, 16384)


Now create numpy matrices on which KMeans class object will cluster the rows. We create two numpy arrays, one for the operand vectors and one for the operator vectors. 

Note that we initialize the arrays with the required shape at the beginning itself. We should not do append to numpy arrays as they are stored in contiguous blocks of memory and whole array needs to be copied again and again in order to append. Source: https://stackoverflow.com/questions/568962/how-do-i-create-an-empty-array-matrix-in-numpy

I faced with a problem here - I cannot initialize 2 arrays as big as len(train_set) * vectorLength = ~ 45000 * 16000 =~ 10^9. My entire 16 GB ram was not enough and the laptop kept on freezing. Hence I chose the mini batch training which took mini batches of size 10 images at a time and trained the k means clustering model.



In [59]:
# our matrices on which mini batch KMeans clustering will work are declared. 
operators = np.empty (shape = [100, vectorLength], dtype = int)
operands = np.empty (shape = [200, vectorLength], dtype = int)

# now initialize KMeans class object. 
operatorCluster = MiniBatchKMeans(n_clusters = 4, random_state=0, batch_size = 100)
operandCluster = MiniBatchKMeans(n_clusters = 10, random_state=0, batch_size = 200)

for i in range (1,len(train_set)-100, 100):
    for j in range (i,i+100):
        segments = getSegmentedVectors (df, train_set[j])
        operators[j-i] , operands[j-i],operands[j-i + 100] = segments[0],segments[1],segments[2]
    operatorCluster = operatorCluster.partial_fit(operators)
    operandCluster = operandCluster.partial_fit(operands)
    
# now our we have trained our clustering model. This cell took approximately 190 seconds to run.

In [60]:
sorted_df = df.sort_values (by = ['Value'])
#print (sorted_df.head(5))

sorted_df.describe()

Unnamed: 0,Value
count,50000.0
mean,8.98578
std,14.079506
min,-9.0
25%,0.0
50%,5.0
75%,12.0
max,81.0


# Now first filter out all values which are negative. mark the cluster which has maximum of those operators as a subtraction symbol

-9 : 0 and 9 confirmed and - symbol confirmed
81 : 9 and * confirmed
above 18: multiply confirmed :

1 * 1 = 1
numbers predicted - 0, 1, 5, 7, 8, 9
25  
49
64
81

30 : 5 * 6  (6 predicted)
32 : 4 * 8 (4 predicted)
54 : 4 * 9 (4 predicted)
14 : 7 * 2 (2 predicted)

numbers remaining : 3



11
13
17



# Then, filter out all operator vectors for which image has value above 18. mark the cluster which has maximum of those operators as a multiplication symbol

In [61]:
# test_df = sorted_df.apply (lambda x : True if x['Value'] > 18 else False, axis = 1)
# print (test_df[sorted_df['Value'] > 18])
def getIndex(ss): 
    return int (ss[:-4])

count = [0,0,0,0]
for index, row in sorted_df.iterrows():     
    #if (row['Value'] > 18): 
    imgIndex = getIndex(row['Image']) 
    segments = getSegmentedVectors (df,imgIndex)
    count[operatorCluster.predict (segments[0])[0]] += 1

print (count)


[4820, 5315, 25910, 13955]


In [None]:
[4820, 5315, 25910, 13955]

In [64]:
num11 = df.apply(lambda x : True
            if x['Value'] == 11 else False, axis = 1)
num11 = len(df[num11 == True].index)

num17 = df.apply(lambda x : True
            if x['Value'] == 17 else False, axis = 1)
num17 = len(df[num17 == True].index)

num13 = df.apply(lambda x : True
            if x['Value'] == 13 else False, axis = 1)
num13 = len(df[num13 == True].index)

numNeg = df.apply(lambda x : True
            if x['Value'] > 18 else False, axis = 1)
numNeg = len(df[numNeg == True].index)

print (numNeg)

6150
