In [1]:
# this notebook processes the mat files and produces the processed test data files
# it extracts image path, age, gender, and the location of the face inside the image


import numpy as np
import matplotlib.pyplot as plt
import os
import scipy.io
import datetime
import pandas as pd
import json
from pymatreader import read_mat
#from tensorflow.keras.preprocessing import image
#import cv2

In [2]:
# Read the IMDB and Wiki .mat metadata files with pymatreader's read_mat and keep
# only the entry stored under each file's top-level key ('imdb' / 'wiki').
# Later cells index these results by field name (e.g. "full_path", "dob").
imdb_data = read_mat('./imdb.mat')['imdb']
wiki_data = read_mat('./wiki.mat')['wiki']

In [3]:
# Convert a MATLAB-style serial day number into a Python datetime.
def matDate_to_datetime(num_days):
    """Return the datetime that lies ``int(num_days) - 367`` days after 0001-01-01.

    Any fractional part of ``num_days`` is dropped by ``int()``, so the result
    always falls on midnight.

    Raises whatever ``int()`` raises for values it cannot convert (e.g. NaN),
    and OverflowError when the result is outside the range datetime supports.
    """
    epoch = datetime.datetime(1, 1, 1)
    # 367 is the offset between the serial day count and the 0001-01-01 epoch.
    offset_days = int(num_days) - 367
    return epoch + datetime.timedelta(days=offset_days)

# Age in whole years between two full dates (e.g. a birth date and a photo date).
def year_diff(d1, d2):
    """Return the number of complete years between ``d1`` and ``d2``.

    The arguments may be given in either order; the result is never negative.
    Both must be mutually comparable and expose ``year``, ``month`` and ``day``.
    """
    earlier, later = sorted((d1, d2))
    years = later.year - earlier.year
    # The last year only counts once the anniversary (month, day) has been reached.
    if (later.month, later.day) < (earlier.month, earlier.day):
        years -= 1
    return years

# Age from bare years: photo_taken is stored as a year only, so this is the
# helper the processing loops use.
def year_minus(d1, d2):
    """Return ``d1 - d2`` (e.g. photo year minus birth year)."""
    return d1 - d2

In [5]:
# Process the imdb and wiki mat files: for every usable entry extract gender, age,
# face location, image path and face score, and remember the paths of rejected
# entries so the images can be deleted later.

def _extract_face_records(dataset, rejected_paths):
    """Build the output rows for one dataset and collect the rejected image paths.

    Parameters
    ----------
    dataset : mapping
        Indexed by field name. The fields read are "full_path", "photo_taken",
        "dob", "face_score", "face_location" and "gender", each indexed by entry.
    rejected_paths : list
        Extended in place with ``dataset["full_path"][i]`` for every entry that
        is skipped (either filtered out or failed to process).

    Returns
    -------
    tuple
        ``(records, failed, filtered)`` where ``records`` is the list of
        ``[gender, age, face_location, full_path, face_score]`` rows (gender,
        age and face_score stringified), ``failed`` counts entries for which
        processing raised an exception, and ``filtered`` counts entries whose
        face_score prints as "-inf" or whose computed age exceeds 130.
    """
    records = []
    failed = 0
    filtered = 0
    for i, path in enumerate(dataset["full_path"]):
        try:
            birth_year = matDate_to_datetime(dataset["dob"][i]).year
            age = year_minus(dataset["photo_taken"][i], birth_year)
            face_score = str(dataset['face_score'][i])
            if face_score == "-inf" or age > 130:
                filtered += 1
                rejected_paths.append(path)
                continue
            records.append([str(dataset["gender"][i]),
                            str(age),
                            [int(y) for y in dataset["face_location"][i]],
                            path,
                            face_score])
        except Exception:
            # Any failure (unconvertible dob, bad face_location value, ...) lands
            # here; the caller reports this count under the "Bad DOB" label.
            rejected_paths.append(path)
            failed += 1
    return records, failed, filtered


# NOTE: the "imbd" key is misspelled, but it is written to
# corrupted_outputdata.json as-is, so it is kept for compatibility.
corruptedList = {"wiki":[],"imbd": []}

PicsDict, counter, corrupted = _extract_face_records(imdb_data, corruptedList["imbd"])

print("Imdb Bad DOB:",counter)
print("Imdb corrupted data entry:",corrupted)

with open('imdb_outputdata.json', 'w') as outfile:
    json.dump(PicsDict, outfile)

# Same extraction for the wiki data. Rejected paths are taken from wiki_data
# itself; previously they were read from imdb_data at the same index, which put
# unrelated IMDB images on the deletion list.
PicsDict, counter, corrupted = _extract_face_records(wiki_data, corruptedList["wiki"])

print("wiki Bad DOB:",counter)
print("wiki corrupted data entry:",corrupted)

with open('wiki_outputdata.json', 'w') as outfile:
    json.dump(PicsDict, outfile)

#Create a file with all of the corrupted images so we can delete them later
with open('corrupted_outputdata.json', 'w') as outfile:
    json.dump(corruptedList, outfile)

Imdb Bad DOB: 129
Imdb corrupted data entry: 62289
wiki Bad DOB: 0
wiki corrupted data entry: 18030


In [None]:
def crop_image(image, face_loc):
    """Return the sub-region of ``image`` selected by ``face_loc``.

    ``face_loc[1]:face_loc[3]`` slices the first axis and
    ``face_loc[0]:face_loc[2]`` the second. Presumably ``face_loc`` is
    ``[x1, y1, x2, y2]`` and ``image`` is indexed (row, column) - TODO confirm.
    """
    row_span = slice(face_loc[1], face_loc[3])
    col_span = slice(face_loc[0], face_loc[2])
    return image[row_span, col_span]
# Visual sanity check: crop the first wiki image to its face box and display it.
face_loc = [int(y) for y in wiki_data["face_location"][0]]
print(face_loc)
# cv2 is not imported in this notebook (its import is commented out), so calling
# cv2.imread raised NameError on a fresh kernel. plt.imread needs no extra import
# and returns colour images in RGB order, which is what plt.imshow expects.
img = plt.imread(wiki_data["full_path"][0])
img = crop_image(img,face_loc)
plt.imshow(img)
plt.show()

In [None]:
# Ad-hoc check: for IMDB entries 25..199, print True when the face_score is not
# negative infinity and False when it is.
# NOTE(review): `math` is imported here but not used in this cell.
import math
for x in range(25,200):
    print((imdb_data['face_score'][x]) != float('-inf'))

In [6]:
#Delete the corrupted images
# The imports are repeated here so this clean-up cell can be run on its own.
import os
import json

# Read the list written by the processing cell; the context manager makes sure
# the file handle is closed again.
with open("corrupted_outputdata.json") as infile:
    JsonObj = json.load(infile)

# JsonObj maps a dataset name to the list of image paths to remove.
for dataset in JsonObj:
    for file in JsonObj[dataset]:
        try:
            os.remove(str(file))
        except FileNotFoundError:
            # Already gone (e.g. this cell was run before) - nothing to do.
            pass
        except OSError as err:
            # Stay best-effort, but do not hide real problems such as missing
            # permissions; unlike a bare except, Ctrl-C still interrupts the loop.
            print("Could not delete", file, "-", err)
