In [4]:
from keras.utils import to_categorical  #transform the 1d output array consisting of integer labels to 2d array. each integer is converted to 1d array. eg: [0,1,2] is converted to [100,010,001].
from keras_preprocessing.image import load_img #load image in PIL form (Python Imaging Library) for processing. gray scale possible here
from keras.models import Sequential #images are processed through layers. this is used to initiate sequential layers
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D #different layers for image to go through
""" 
IN ORDER
1. conv2d : does convolution operations on input/image by applying filters/kernels to it to extract features : edges/textures etc.
            I/P: [[0,1,2,3],[1,2,3,4],[2,3,4,5],[3,4,5,6]]
            Our kernel/filter is a smaller matrix which will move over this I/P matrix and compute the dot product among the overlaps and sum those dot products to get current configuration answer. this way, we get a 'feature map'
            Kernel: [[1,0],[0,0]]
            Feature map outputted : [[0,1,2],[1,2,3],[2,3,4]]
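2. maxpooling2d : reduces the spatial dimensions and hence the computation by taking the max value in a window called pool size
                  If pool size is 2*2 (default), the stride also defaults to 2, so the maximum is taken over NON OVERLAPPING 2*2 windows
                  On the 3*3 feature map above only one complete 2*2 window fits, so the real O/P would be [[2]]
                  To keep a larger example for the next steps, slide the window with stride 1 (overlapping) instead. O/P from feature map : [[2,3],[3,4]]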
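3. dropout : randomly sets a fraction (0 to 1) of the incoming values to 0 during training to prevent overfitting - fitting the noise as well, which gives great results on training data and trash on new data due to overreliance on a particular set of neurons
             Will randomly drop some values from O/P of max pooling layer. eg: if 0.5 is given as parameter, then every value is dropped with probability 0.5 (about 50% of the values)
             O/P of this can be: [[2,0],[0,4]] (Keras also scales the kept values by 1/(1-rate), giving [[4,0],[0,8]] here; the scaling is ignored below to keep the example simple). Nothing is dropped when predicting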
4. flatten : flattens multi dimensional tensor to 1d tensor/array. So, convolution layer -> fully connected layer where each layer neuron is connected to every previous layer neuron. just concatenated each row
             O/P: [2,0,0,4]
5. dense : each input neuron is connected to every output neuron. that is, classification happens on the basis of extracted features 
           The dense layer is a fully connected layer. every neuron of the layer is connected to every neuron of the previous layer. 
           So, number of outputs of the layer (for the last layer: number of labels) : number of neurons in dense layer
           Every neuron in P (O/P from flatten layer) is attached to every single dense layer neuron with an associated weight.
           So, every dense layer neuron i has a weighted sum: zi = sum over all neurons j in P of [weight of edge (j,i) * value of neuron j] + bias of dense layer neuron i
           Now, finally, an activation function is applied to every dense layer neuron: ai = f(zi), where f can be RELU etc.
           O/P is the 1d vector [ai] i from 1 to N (no. of neurons in dense layer)
"""
import os #for accessing directories
import pandas as pd #for dataframe
import numpy as np #for converting grayscale PIL image to numpy array
from sklearn.preprocessing import LabelEncoder #for converting output strings to integers
from tqdm.notebook import tqdm #for progress bar

In [4]:
#root folders of the training and test images, relative to the notebook's working directory
#each of them is expected to contain one subfolder per label (see createdf, which uses the subfolder name as the label)
train_dir = 'images/train'
test_dir = 'images/test'

In [5]:
def createdf(dir):
    """Collect image paths and their labels from a folder of per-label subfolders.

    The expected layout is ``dir/<label>/<image file>``: the name of every
    subfolder directly inside ``dir`` is used as the label of each entry in it.

    Args:
        dir: Path of the root folder (e.g. the train or the test directory).

    Returns:
        A tuple ``(x, y)`` of two equally long lists. ``x`` holds the path of
        every image, ``y`` the label (subfolder name) of the image at the same
        index. Entries are ordered by label and then by file name.
    """
    x = []  # image paths
    y = []  # labels of images = moods
    # os.listdir returns entries in arbitrary order; sorting keeps the rows reproducible between runs.
    for label in sorted(os.listdir(dir)):
        label_dir = os.path.join(dir, label)  # path.join is used as we need to access the subdirectory
        # Skip stray files lying next to the label folders (e.g. .DS_Store, Thumbs.db):
        # listing them as if they were folders would raise NotADirectoryError.
        if not os.path.isdir(label_dir):
            continue
        for image in sorted(os.listdir(label_dir)):
            x.append(os.path.join(label_dir, image))
            y.append(label)
    return x, y

In [6]:
#build the train and test tables in one step each: column 'image' = file path, column 'label' = mood
train_paths, train_labels = createdf(train_dir)
test_paths, test_labels = createdf(test_dir)
train = pd.DataFrame({'image': train_paths, 'label': train_labels})
test = pd.DataFrame({'image': test_paths, 'label': test_labels})

In [18]:
def extract_features(images, target_size=(48, 48)):
    """Load images in grayscale and stack them into one array for the CNN.

    Every image is read in grayscale mode, i.e. as a single-channel image:
    one integer per pixel giving its intensity, where 0 means black and 255
    means white. (A coloured image would need three channels: R, G and B.)
    Each image therefore becomes a height*width matrix, and the whole set is
    returned as height*width*1 matrices stacked along a leading batch axis.

    Args:
        images: Iterable of image file paths (e.g. the 'image' column of the
            train/test dataframe).
        target_size: ``(height, width)`` handed to ``load_img`` so that images
            of another size are resized on load instead of breaking the
            reshape below. Defaults to the 48*48 size of this dataset.

    Returns:
        Array of shape ``(number of images, height, width, 1)`` holding the
        raw pixel intensities (not yet scaled to 0..1).
    """
    height, width = target_size
    features = []
    for image in tqdm(images):   #tqdm is used for progress bar
        img = load_img(image, color_mode='grayscale', target_size=target_size)  #PIL image, single channel
        features.append(np.array(img))  #convert the image into a height*width matrix and collect it
    features = np.array(features)  #list of matrices -> one array with the images along axis 0
    #add the explicit channel axis expected by Conv2D; len(features) is the number of images
    return features.reshape(len(features), height, width, 1)

In [19]:
#load every training image listed in the 'image' column; the trailing expression displays the resulting array shape
train_features=extract_features(train['image'])
train_features.shape

  0%|          | 0/28821 [00:00<?, ?it/s]

(28821, 48, 48, 1)

In [20]:
#same for the test images; the trailing expression displays the resulting array shape
test_features=extract_features(test['image'])
test_features.shape

  0%|          | 0/7066 [00:00<?, ?it/s]

(7066, 48, 48, 1)

In [22]:
# divide by 255 so that every number lies between 0 to 1 for numerical stability and better training
# cast to float32 first: dividing an integer pixel array by a Python float yields float64, which
# doubles the memory of both arrays without any benefit for training
x_train = train_features.astype('float32') / 255.0
x_test = test_features.astype('float32') / 255.0

In [23]:
# Label encoder converts labels which are strings to integers.
# eg: if a,b,c,d are the labels/outputs then they are converted to 0,1,2,3
# The distinct labels are mapped to 0 .. number_of_unique_labels-1 (in sorted order).
# Fitting comes first so that the encoder learns this mapping. After that, any other
# sequence made of the same labels can be converted:
# eg: fitting on [a,b,c,d] gives a->0, b->1 and so on
#     [a,b,b] is then transformed to [0,1,1].
le = LabelEncoder()
le.fit(train['label'])

In [24]:
#convert the string labels to integers using the label encoder fitted on the training labels
train_label_ids = le.transform(train['label'])
test_label_ids = le.transform(test['label'])

# One hot encoding with to_categorical:
# integer labels [0,1,2] become [[1,0,0],[0,1,0],[0,0,1]].
# num_classes is the length of every one hot row (3 in the example above).
# It must be >= the number of distinct labels, otherwise the largest label has no column;
# if it is bigger, the extra columns simply stay 0 (zero padding).
y_train = to_categorical(train_label_ids, num_classes=7)
y_test = to_categorical(test_label_ids, num_classes=7)

In [26]:
model = Sequential() #initiates the model so that a stack of layers can be added to this model using add() function

# ---------------------------------------------------------------------------------------------
# convolutional layers
# ---------------------------------------------------------------------------------------------
# Four identical stages of Conv2D -> MaxPooling2D -> Dropout are stacked. Every stage builds upon
# the output of the previous one (hierarchical learning): the lower stages learn basic features
# like edges and textures, the upper stages combine these into more complex patterns. That is why
# the number of filters goes 128 -> 256 -> 512 -> 512.
#
# Conv2D       : the number of filters is the number of 3x3 kernels which independently go over the
#                input. Every kernel gives one 2d feature map; the maps are stacked along the
#                channel axis to form the 3d output of the layer.
# ReLU         : Rectified Linear Unit, ReLU(x) = max(0,x). eg: if the current sum of dot products is
#                negative, 0 is placed there. This is what introduces non linearity into the network
#                (the function itself is continuous, it only has a kink at 0).
# MaxPooling2D : keeps the maximum of every 2x2 window, halving height and width.
# Dropout(0.4) : during training every value is set to 0 with probability 0.4 to reduce overfitting.
#
# With the Keras defaults (no padding, stride 1 for Conv2D) the spatial size shrinks as
# 48 -> 46 -> 23 -> 21 -> 10 -> 8 -> 4 -> 2 -> 1, so the last stage outputs 1x1x512.
conv_filters = (128, 256, 512, 512)
for stage_index, n_filters in enumerate(conv_filters):
    # Only the first layer is told the image size: 48*48*1
    # (number of pixels in length * in height * number of channels).
    input_kwargs = {'input_shape': (48,48,1)} if stage_index == 0 else {}
    model.add(Conv2D(n_filters, kernel_size=(3,3), activation='relu', **input_kwargs))
    model.add(MaxPooling2D(pool_size=(2,2))) #2x2 window used in max pooling layer
    model.add(Dropout(0.4))

# flatten the output of the last stage into a 1d tensor/array so that it can feed the dense layers
model.add(Flatten())

# ---------------------------------------------------------------------------------------------
# fully connected layers
# ---------------------------------------------------------------------------------------------
# Every neuron of a dense layer is connected to all neurons of the previous layer.
# First 512 neurons, then 256 (fewer) so that the features are condensed step by step towards the
# decision. ReLU is used for non-linearity; these hidden layers output activations, not
# probabilities. The dropout value before the output layer is reduced (0.3) so that more of the
# learned information is available for the decision.
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

# ---------------------------------------------------------------------------------------------
# output layer
# ---------------------------------------------------------------------------------------------
# Takes as input the output of the last hidden layer and has 7 neurons, one for each of the
# 7 labels/outputs we need. It computes one raw score (logit) per class; softmax converts these
# scores into probabilities (they sum to 1).
model.add(Dense(7, activation='softmax'))

In [28]:
# compile() configures the learning process:
# - optimizer: ADAM (Adaptive Moment Estimation) adapts the step size (learning rate) of every
#   parameter individually for improved convergence
# - loss: categorical cross-entropy measures the difference between the predicted output
#   (softmax probabilities) and the actual output (one hot encoded label)
# - metrics: accuracy reports the fraction of correctly predicted labels so that training can be monitored
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# x_train and y_train are used to train the model. 128 images are fed to the model at once before
# the parameters are updated, and up to 100 passes (epochs) are run over the whole dataset.
# The validation data is used to see the current accuracy on data the model is not trained on.
#
# 1. An epoch starts
# 2. For every batch of 128 images:
#    a. y_pred is calculated from the batch of x_train
#    b. y_pred is compared with the matching y_train and the loss is calculated using categorical_crossentropy
#    c. the gradient of the loss is calculated using backward propagation
#    d. the parameters are updated using ADAM
# 3. The epoch ends once every batch has been used
# 4. The validation data is used to report the current loss/accuracy
# 5. Training can be stopped in between (interrupting the cell) to prevent overfitting
#
# NOTE(review): the test set doubles as validation data here, so the reported val_accuracy is
# not an independent estimate if it is used to decide when to stop.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=100,
    validation_data=(x_test, y_test),
)

Epoch 1/100
226/226 ━━━━━━━━━━━━━━━━━━━━ 314s 1s/step - accuracy: 0.2389 - loss: 1.8394 - val_accuracy: 0.2583 - val_loss: 1.8143
Epoch 2/100
226/226 ━━━━━━━━━━━━━━━━━━━━ 318s 1s/step - accuracy: 0.2554 - loss: 1.8065 - val_accuracy: 0.2620 - val_loss: 1.7832
Epoch 3/100
226/226 ━━━━━━━━━━━━━━━━━━━━ 309s 1s/step - accuracy: 0.2696 - loss: 1.7570 - val_accuracy: 0.3446 - val_loss: 1.6429
Epoch 4/100
226/226 ━━━━━━━━━━━━━━━━━━━━ 312s 1s/step - accuracy: 0.3403 - loss: 1.6503 - val_accuracy: 0.4258 - val_loss: 1.4972
Epoch 5/100
226/226 ━━━━━━━━━━━━━━━━━━━━ 313s 1s/step - accuracy: 0.3886 - loss: 1.5547 - val_accuracy: 0.4558 - val_loss: 1.3998
Epoch 6/100
226/226 ━━━━━━━━━━━━━━━━━━━━ 319s 1s/step - accuracy: 0.4252 - loss: 1.4796 - val_accuracy: 0.4890 - val_loss: 1.3328
Epoch 7/100

KeyboardInterrupt: 

In [31]:
model_json = model.to_json() #converts model architecture (layers, settings, but not the trained weights) to a JSON string
with open("emotiondetector.json",'w') as json_file: 
    json_file.write(model_json)
model.save("emotiondetector.h5") #saves both architecture and weights (parameters) in one HDF5 file (a format suited to numeric data); the loading cell later reads the weights from this file



In [1]:
from keras.models import model_from_json
#rebuild the network from the saved architecture, then restore the trained weights into it
#the context manager closes the file even if reading fails
with open("emotiondetector.json", "r") as json_file:
    model_json = json_file.read()
model = model_from_json(model_json)
#emotiondetector.h5 was written with model.save(); only the weights stored in it are loaded here
model.load_weights("emotiondetector.h5")

In [7]:
#class names in the index order of the model output (position i = class i)
#the order must match the integers assigned by the LabelEncoder during training, which follow the sorted (alphabetical) label names
label = ['angry','disgust','fear','happy','neutral','sad','surprise'] #only provide in alphabetical order

In [5]:
def ef(image, target_size=(48, 48)):
    """Load one image file as a normalised single-image batch for the model.

    Args:
        image: Path of the image file.
        target_size: ``(height, width)`` handed to ``load_img`` so that an
            image of another size is resized on load instead of breaking the
            reshape below. Defaults to the 48*48 input size of the model.

    Returns:
        Array of shape ``(1, height, width, 1)`` with the pixel intensities
        scaled from 0..255 to 0..1, the same scaling used for the training data.
    """
    height, width = target_size
    # color_mode='grayscale' replaces the deprecated grayscale=True flag and matches extract_features
    img = load_img(image, color_mode='grayscale', target_size=target_size)
    feature = np.array(img).reshape(1, height, width, 1)  #batch of one single-channel image
    return feature/255.0

In [9]:
#sanity check on a single test image whose true label (its folder name) is 'disgust'
image = 'images/test/disgust/533.jpg'
print("original image is of disgust")
img = ef(image)  #normalised batch holding just this image
pred = model.predict(img)  #gives an array of probabilities pertaining to each class/label
pred_label = label[pred.argmax()]  #pred.argmax() gives the index of the highest probability; label maps that index to the class name
print("model prediction is ",pred_label)

original image is of disgust
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
model prediction is  disgust
