## Image Processing and Modeling 

### Goal: Use machine learning to predict facial expressions 

    Objectives:
    1) Read image data and store in dataframe 
    2) Reduce size of images and grayscale to lower data points
    3) Try to isolate humna face
    4) Run three classification models: 
        - Random Forest
        - XGClassifier
        - Support vector machine
    5) See whether data augmentation has any positive affect on the models

In [46]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from skimage.filters import prewitt_h,prewitt_v
from sklearn import svm
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

import random
from scipy import ndarray
import skimage as sk
from skimage import transform
from skimage import util


import cv2
import os
from PIL import Image
import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
import pickle

### Dataframe Creation

    Reading Image Directory and Info into dataframe

In [47]:
# Image Data is broken down into 2 groups, each with 35 male and 35 female particpants
# Each particpant make 7 different facial expresions, and have 5 different head angles 
# for each expression ranging from 90 degress to the left to 90 degress to the right

# Each image file is marked with each of the following catergories
# These lists are created to loop through the image names and create a dataframe from them
Group = ['A','B']
Gender = ['F','M']
Identity = ['01','02','03','04','05','06','07','08','09','10',
            '11','12','13','14','15','16','17','18','19','20',
            '21','22','23','24','25','26','27','28','29','30',
            '31','32','33','34','35']
Expression = ['AF','AN','DI','HA','NE','SA','SU']
Angle = ['FL','HL','S','HR','FR']


In [48]:
# looping through all files to create a dataframe

compile_list = []

for group in Group:
    for gender in Gender:
        for identity in Identity:
            for expression in Expression:
                for angle in Angle:
                    # file is directory of image, use format in combination with the loops to call upon 
                    # every image file
                    file = "/Users/Cianan/Downloads/KDEF_and_AKDEF/KDEF/{}{}{}/{}{}{}{}{}.JPG"
                    stitch_file = file.format(group,gender,identity,
                                              group,gender,identity,
                                              expression,angle)
                    # pic iden is a list created to store all info about a given image
                    pic_iden = [group,gender,identity,expression,angle,stitch_file]
                    compile_list.append(pic_iden)

In [49]:
# dataframe is created
image_df = pd.DataFrame(compile_list, columns = ['Group','Gender',
                                                 'Identity','Expression',
                                                 'Head_Angle','Image_Directory'])
image_df.head()

Unnamed: 0,Group,Gender,Identity,Expression,Head_Angle,Image_Directory
0,A,F,1,AF,FL,/Users/Cianan/Downloads/KDEF_and_AKDEF/KDEF/AF...
1,A,F,1,AF,HL,/Users/Cianan/Downloads/KDEF_and_AKDEF/KDEF/AF...
2,A,F,1,AF,S,/Users/Cianan/Downloads/KDEF_and_AKDEF/KDEF/AF...
3,A,F,1,AF,HR,/Users/Cianan/Downloads/KDEF_and_AKDEF/KDEF/AF...
4,A,F,1,AF,FR,/Users/Cianan/Downloads/KDEF_and_AKDEF/KDEF/AF...


In [50]:
# there are some images that are of unequal size or that are missing
# this is used to sort that out 

bad_data = []
for count in range(image_df.shape[0]):
    image = cv2.imread(image_df.Image_Directory[count])
    # try statement cathes missing images
    try: 
        # if statement catches images that arent 781 by 581
        if len(image) != 762:
            bad_data.append(count)
    except:
        bad_data.append(count)

In [51]:
# removes all missing images and images of differnt size from dataframe
# there is only 4 images being sorted out
image_sorted = image_df.drop(bad_data).reset_index()

In [109]:
#image = cv2.imread(image_df.Image_Directory[1])

### Image feature selection and Variable Creation for machine learning models

    - Creating dependent and independent variables for train test
    - Processing images so that they can be read by our models

In [53]:
# creating y variables
# changing from string to int so that they can be processed by XGClassifier
dependent = []

for i in range(len(image_sorted)):
    if image_sorted.Expression[i] == 'AF':
        dependent.append(0)
    elif image_sorted.Expression[i] == 'AN':
        dependent.append(1)
    elif image_sorted.Expression[i] == 'DI':
        dependent.append(2)
    elif image_sorted.Expression[i] == 'HA':
        dependent.append(3)
    elif image_sorted.Expression[i] == 'NE':
        dependent.append(4)
    elif image_sorted.Expression[i] == 'SA':
        dependent.append(5)
    elif image_sorted.Expression[i] == 'SU':
        dependent.append(6)

In [54]:
# creating x variables

independent = []
for i in range(len(image_sorted)):
    # reads image and grayscales, so each pixel goes from an array of 3 to 1
    image = cv2.imread(image_sorted.Image_Directory[i],cv2.IMREAD_GRAYSCALE)
    # reduces size of image fro, 562x762 to 281x381 reducing the number of pixel by 4
    img = cv2.resize(image, (281, 381))
    # croping sides of image that are empty space, so just the face is showing
    crop = iaa.Crop(px=(30))
    img = crop.augment_image(img)
    img = img.T
    # turns to 1D array so the models can process the image
    img = img.ravel()
    independent.append(img)

### Running Models

In [34]:
# These are functoins created for the purpose of Data Augmentation 

def rotation(img):
    rot_deg = random.uniform(-30, 30)
    img1 = sk.transform.rotate(img, rot_deg)
    return img1

"""
Rotation introduces a random rotation from -30 to 30 degress into the image
"""
    
def noise(img):
    img1 = sk.util.random_noise(img)
    return img1
    
"""
Noise introduces random noise into the image
"""
    
def flip(img):
    flip_hr=iaa.Flipud(p=1.0)
    img1= flip_hr.augment_image(img)
    return img1

"""
Flip cuases the image to flip upside down
"""

def scale(img):
    scale_im=iaa.Affine(scale={"x": (1.5, 1.0), "y": (1.5, 1.0)})
    img1 =scale_im.augment_image(img)
    return img1

"""
scale changes the scale of the x and y axis by some number scalar between 1.5 and 1 
"""

'\nscale changes the scale of the x and y axis by some number scalar between 1.5 and 1 \n'

In [95]:
# splits data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(independent, dependent,test_size=0.2, random_state=42)

# used to save train and test sets for use in other notebooks

# pickle.dump(x_train, open('x_trainc', 'wb'))
# pickle.dump(x_test, open('x_testc', 'wb'))
# pickle.dump(y_train, open('y_trainc', 'wb'))
# pickle.dump(y_test, open('y_testc', 'wb'))

In [56]:
len(x_train[0])

107061

### Classification Models

In [18]:
def models_scores(metric, data_aug, identifer, x_train, y_train, x_test, y_test):
    
    # Checks if i want to use data augmentation
    if data_aug == 'yes':
        for i in range(len(x_train)):
            # reshapes array from 1d to 2d so that I can apply changes to image
            img = np.reshape(x_train[i], (281, 381))
            # randomly chooses function to transform image
            key = random.choice([rotation, scale, flip, noise])
            image = key(img)
            image = image.T
            image = image.ravel()
            x_train.append(image)
        # this is for later in the code so that y_train doesnt double in size 
        # every time data augmentation is used
        reshape_y = len(y_train)
        y_train = y_train + y_train   
    
    if metric == 'xgb': # runs XGClassifier
        
        # XGC needs data to be in an array 
        y_train = np.array(y_train)
        y_test = np.array(y_test)
        x_train = np.array(x_train)
        x_test = np.array(x_test)
        
        boost = XGBClassifier(XGBClassifier(max_depth=4, learning_rate=0.8, n_estimators=100, 
                                            objective="multi:softprob", verbose = 1))
        
        eval_set = [(x_train,y_train),(x_test,y_test)] 
        results = boost.fit(x_train, y_train, eval_set = eval_set, verbose=True, early_stopping_rounds=5)
        
    elif metric == 'rf': # runs random forest
        
        rfc = RandomForestClassifier(n_estimators = 400, max_features= 'sqrt', random_state = 42, verbose = 1,n_jobs=-1)
        results = rfc.fit(x_train, y_train)
        accuracy = accuracy_score(results.predict(x_test), y_test)
        return('Accuracy:', accuracy) 
        
    elif metric == 'svm': # runs support vector machine
        
        lin_svm = svm.NuSVC(random_state = 42, verbose = 1, probability = True)
        results = lin_svm.fit(x_train,y_train)
        accuracy = accuracy_score(results.predict(x_test), y_test)
        return('Accuracy:', accuracy)
    
    if data_aug == 'yes': # reset y_train to original length
        del y_train[-n:]

    # creates filename and saves model 
    file = metric + '_' + data_aug + '_' + identifer
    pickle.dump(results, open(file, 'wb'))
    
    """
    Function specifies what model I want to run, whether or not I want to use data augmentation
    and saves the model when done 
    """
        

In [19]:
"""
When using model scoresThe first input is 
    'rf' if you want random forest
    'xgb' if you want XGClassifier
    'svm' if you want support vector machine
    
Second input can be 'yes' if you want data augmentation
    or anything else if you don't but 'no' is prefered for naming convention
    
Third input is used at the end of the file name to the model when its going to be saved,
so if you make a change you can put 'no crop' 

The last four inputs are just the training and test data but these dont need to be touched
"""

models_scores('rf', 'no', 'crop4_cv', x_train, y_train, x_test, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   59.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.1s finished


('Accuracy:', 0.610204081632653)

### Model Improvement

In [53]:
# Uses grid search to find best models
rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=100, oob_score = True, min_samples_split = 2, min_samples_leaf = 1 ) 

param_grid = { 
    'min_samples_split': [5, 10],
    "min_samples_leaf": [1, 2, 4]
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(x_train, y_train)
print(CV_rfc.best_params_)


{'min_samples_leaf': 4, 'min_samples_split': 5}


In [94]:
# Used to test different hyperparameters for XGClassifier
xgb = XGBClassifier(max_depth=4, learning_rate=0.8, n_estimators=100, objective="multi:softprob", verbose = 1)

In [120]:
eval_set=[(np.array(x_train),np.array(y_train)),(np.array(x_test),np.array(y_test))] #tracking train/validation error as we go
dok = xgb.fit(np.array(x_train), np.array(y_train),eval_set = eval_set, verbose=True, early_stopping_rounds=5)

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-merror:0.45735	validation_1-merror:0.58163
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.35010	validation_1-merror:0.55816
[2]	validation_0-merror:0.26379	validation_1-merror:0.53061
[3]	validation_0-merror:0.20352	validation_1-merror:0.50510
[4]	validation_0-merror:0.15424	validation_1-merror:0.48775
[5]	validation_0-merror:0.12283	validation_1-merror:0.48571
[6]	validation_0-merror:0.09091	validation_1-merror:0.45510
[7]	validation_0-merror:0.06665	validation_1-merror:0.44286
[8]	validation_0-merror:0.04648	validation_1-merror:0.44184
[9]	validation_

## CNN

In [110]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import plot_model
from tensorflow.keras import datasets, layers, models


from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras.layers import BatchNormalization, Flatten

from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [112]:
#image_sorted.Expression
# define example
values = array(image_sorted.Expression)
#print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)
# invert first example
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

[0 0 0 ... 6 6 6]
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
['AF']


In [113]:
x_train, x_test, y_train, y_test = train_test_split(independent, onehot_encoded,test_size=0.2, random_state=42)

In [117]:
model = Sequential()
# hidden layer
# model.add(Conv2D(32, kernel_size = (3, 3), activation='relu', input_shape=(107061,1)))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(BatchNormalization())
# model.add(Conv2D(64, kernel_size=(3,3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(BatchNormalization())
# model.add(Conv2D(64, kernel_size=(3,3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(BatchNormalization())
# model.add(Conv2D(96, kernel_size=(3,3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(BatchNormalization())
# model.add(Conv2D(32, kernel_size=(3,3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(BatchNormalization())
# model.add(Dropout(0.2))
model.add(Dense(10000, input_shape=(107061,), activation='relu'))
model.add(Dense(10000, activation='relu'))
# output layer
#model.add(Flatten())
model.add(Dense(7, activation='softmax'))

In [118]:
model.summary()

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 10000)             1070620000
_________________________________________________________________
dense_18 (Dense)             (None, 10000)             100010000 
_________________________________________________________________
dense_19 (Dense)             (None, 7)                 70007     
Total params: 1,170,700,007
Trainable params: 1,170,700,007
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
# training the model for 10 epochs
x_test1= np.array(x_test)
y_test1= np.array(y_test)
x_train1= np.array(x_train)
y_train1= np.array(y_train)

# y_train1 = tf.keras.utils.to_categorical(y_train1, 7)
# y_test1 = tf.keras.utils.to_categorical(y_test1, 7)
# x_train1 = tf.keras.utils.to_categorical(y_train1, 7)
# x_test1 = tf.keras.utils.to_categorical(y_test1, 7)
model.fit(x_train1, y_train1, epochs=20, validation_data=(x_test1, y_test1))

Epoch 1/20
