# Data Creation

---

### Managing AFDB Data

In [None]:
import os
def _list_image_paths(root, nested=True):
    """Collect file paths found under ``root``.

    Args:
        root: Folder to scan.
        nested: When True, ``root`` is expected to hold one level of
            sub-folders and the files inside each sub-folder are returned.
            When False, the entries directly inside ``root`` are returned.

    Returns:
        A list of ``root/...`` path strings. ``.DS_Store`` entries are
        skipped. Entries are visited in sorted order so the result is the
        same on every run (``os.listdir`` itself returns arbitrary order).
    """
    paths = []
    for entry in sorted(os.listdir(root)):
        if entry == ".DS_Store":
            continue
        entry_path = root + "/" + entry
        if not nested:
            paths.append(entry_path)
            continue
        if not os.path.isdir(entry_path):
            # A stray file next to the sub-folders would make os.listdir raise.
            continue
        for name in sorted(os.listdir(entry_path)):
            if name != ".DS_Store":
                paths.append(entry_path + "/" + name)
    return paths


# NOTE(review): the two AFDB paths are absolute (filesystem root) while the
# other two are relative to the working directory — confirm this is intended.
face_dest_arr = _list_image_paths("/AFDB_face_dataset")
mask_dest_arr = _list_image_paths("/AFDB_masked_face_dataset")
# "single2-0" holds its images directly, without a sub-folder level.
group_mask = _list_image_paths("single2-0", nested=False)
rwmfd_mask = _list_image_paths("RWMFD_part_2_pro")
len(mask_dest_arr),len(face_dest_arr),len(rwmfd_mask),len(group_mask)

### Converting MaskPascalVOC Annotated Images to Face Images

In [None]:
import xml.etree.ElementTree as ET
xml_path = "archive/annotations/"
images_path = "archive/images/"

# Parallel lists: position k in every list describes the same annotated object.
filenames, labels = [], []
ymins, ymaxs = [], []
xmins, xmaxs = [], []

for annotation_name in os.listdir(xml_path):
    if annotation_name != ".DS_Store":
        annotation_root = ET.parse(xml_path + annotation_name).getroot()
        image_name = annotation_root.find('filename').text
        # Each <object> element carries one class name and one bounding box.
        for voc_object in annotation_root.findall('./object'):
            object_label = voc_object.find('name').text
            box = voc_object.find('bndbox')
            # Coordinates are kept as the raw XML text; they are converted later.
            coords = {tag: box.find(tag).text for tag in ('xmin', 'ymin', 'xmax', 'ymax')}
            filenames.append(images_path + image_name)
            labels.append(object_label)
            ymins.append(coords['ymin'])
            ymaxs.append(coords['ymax'])
            xmins.append(coords['xmin'])
            xmaxs.append(coords['xmax'])

import pandas as pd
import tqdm
import cv2

# One row per annotated object: source image, class label and bounding box.
df = pd.DataFrame({'filename':filenames,'label':labels,'ymin':ymins,'ymax':ymaxs,'xmin':xmins,'xmax':xmaxs})

# cv2.imwrite does not create missing folders (it only reports failure),
# so make sure one output folder exists per label before writing crops.
for label_dir in df['label'].unique():
    os.makedirs(label_dir, exist_ok=True)

for i in tqdm.tqdm(range(len(df))):
    obj = df.iloc[i]
    im = cv2.imread(obj.filename)
    if im is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        print("Skipping unreadable image:", obj.filename)
        continue
    # Clamp to the image origin: a negative coordinate would otherwise be
    # interpreted as an index counted from the end of the array.
    ymin = max(int(obj.ymin), 0)
    xmin = max(int(obj.xmin), 0)
    face = im[ymin:int(obj.ymax), xmin:int(obj.xmax)]
    if face.size == 0:
        print("Skipping empty bounding box in:", obj.filename)
        continue
    # The row index keeps output names unique across all annotated objects.
    cv2.imwrite(obj.label+"/"+str(i).zfill(5)+".jpg",face)


# Data Pre-Processing

---

### Extracting Faces from Images

In [None]:
from retinaface import RetinaFace

def write_faces(image_path,image_count,prefix):
    """Detect the faces in one image and save each crop under ``FACE-MASK/``.

    Crops are written as
    ``FACE-MASK/<prefix><image_count, 3 digits>_<face number, 3 digits>.jpg``,
    with face numbers starting at 1.

    The function is best-effort: an unreadable image, a detector failure or a
    bad crop is reported and skipped instead of stopping the whole run.

    Args:
        image_path: Path of the image to process.
        image_count: Index of the image, used in the output file name.
        prefix: String placed in front of the output file name.

    Returns:
        None.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        print("Skipping unreadable image:", image_path)
        return

    try:
        obj = RetinaFace.detect_faces(image_path)
    except Exception as exc:
        print("Face detection failed for", image_path, "-", exc)
        return

    if not isinstance(obj, dict):
        # Only a dict can be indexed by face key below; anything else is
        # treated as "no usable detections".
        return

    # cv2.imwrite does not create the destination folder on its own.
    os.makedirs("FACE-MASK", exist_ok=True)

    face_count=0
    for key in obj:
        # Counted before the crop is attempted so numbering matches the
        # detector's ordering even when a face is skipped.
        face_count+=1
        try:
            facial_area = obj[key]["facial_area"]
            # facial_area is indexed as [x1, y1, x2, y2]: rows first, then columns.
            face = image[facial_area[1]: facial_area[3], facial_area[0]: facial_area[2]]
            if face.size == 0:
                continue
            cv2.imwrite("FACE-MASK/"+prefix+str(image_count).zfill(3)+"_"+str(face_count).zfill(3)+".jpg",face)
        except Exception as exc:
            print("Could not save face", face_count, "of", image_path, "-", exc)

# Run face extraction over both image collections; the prefix keeps the
# output file names of the two collections apart.
for source_prefix, source_paths in (("group", group_mask), ("rwmfd", rwmfd_mask)):
    for image_count, image_path in enumerate(tqdm.tqdm(source_paths)):
        write_faces(image_path, image_count=image_count, prefix=source_prefix)

In [None]:
# Empty accumulators for the two classes and for the merged dataset.
w_masked_images, w_masked_labels = [], []
masked_images, masked_labels = [], []
images, labels = [], []


def _folder_entries(folder):
    """Return ``folder/<name>`` for every entry of ``folder`` except .DS_Store files."""
    entries = []
    for name in os.listdir(folder):
        if name.endswith(".DS_Store"):
            continue
        entries.append(folder + "/" + name)
    return entries


with_mask = _folder_entries("with_mask")
without_mask = _folder_entries("without_mask")
masked_face = _folder_entries("FACE-MASK")

### Merging All the Datasets

In [None]:
# Build every list from scratch instead of extending lists created in an
# earlier cell, so re-running this cell does not append duplicates.

# Images without a mask.
w_masked_images = face_dest_arr + without_mask

# Images with a mask.
masked_images = mask_dest_arr + masked_face + with_mask

# Label convention: 0 = no mask, 1 = mask.
w_masked_labels = [0] * len(w_masked_images)
masked_labels = [1] * len(masked_images)

len(w_masked_images),len(masked_images),len(w_masked_labels),len(masked_labels)

#### Handling the Imbalanced Dataset

In [None]:
import random

# Dedicated seeded generator so the sampled subset (and every file derived
# from it) is the same on each run.
balance_rng = random.Random(42)


def _downsample(paths, path_labels, size, rng):
    """Return ``paths``/``path_labels`` reduced to ``size`` randomly chosen items.

    Lists that already have ``size`` items or fewer are returned unchanged.
    The same indices are applied to both lists so they stay aligned.
    """
    if len(paths) <= size:
        return paths, path_labels
    keep = rng.sample(range(len(paths)), size)
    return [paths[i] for i in keep], [path_labels[i] for i in keep]


# Shrink whichever class is larger down to the size of the smaller one.
# Sampling only the unmasked class would raise ValueError whenever the
# masked class happens to be the larger of the two.
class_size = min(len(w_masked_images), len(masked_images))
w_masked_images, w_masked_labels = _downsample(w_masked_images, w_masked_labels, class_size, balance_rng)
masked_images, masked_labels = _downsample(masked_images, masked_labels, class_size, balance_rng)

len(w_masked_images),len(masked_images),len(w_masked_labels),len(masked_labels)

In [None]:
# Assign fresh lists (unmasked first, then masked) rather than extending
# existing ones: re-running the cell then cannot duplicate entries, and no
# stale content of an earlier `images`/`labels` variable can leak in.
images = w_masked_images + masked_images
labels = w_masked_labels + masked_labels

In [None]:
import pandas as pd
# Two columns: the source path of each image and its 0/1 label.
dataset_columns = {"image_path": images, "labels": labels}
df = pd.DataFrame(dataset_columns)
df.to_csv("dataset.csv", index=False)

### Writing the dataset images to a single location

In [None]:
import cv2
# cv2.imwrite does not create missing folders (it only reports failure),
# so make sure both class folders exist before copying.
for out_dir in ("Mask-Dataset/Mask", "Mask-Dataset/No_Mask"):
    os.makedirs(out_dir, exist_ok=True)

for i in range(len(df)):
    img = cv2.imread(df.image_path[i])
    if img is None:
        # cv2.imread returns None for an unreadable file; passing that on to
        # cv2.imwrite would abort the whole loop.
        print("Skipping unreadable image:", df.image_path[i])
        continue
    # Truthy label (1) -> Mask folder, falsy label (0) -> No_Mask folder.
    if(df.labels[i]):
        cv2.imwrite("Mask-Dataset/Mask/"+str(i)+".jpg",img)
    else:
        cv2.imwrite("Mask-Dataset/No_Mask/"+str(i)+".jpg",img)

### Creating Dataset.csv

In [None]:
# Re-list the images that were actually written to each class folder.
mask_d = []
for name in os.listdir("Mask-Dataset/Mask"):
    if name.endswith(".DS_Store"):
        continue
    mask_d.append("Mask-Dataset/Mask/" + name)

no_mask_d = []
for name in os.listdir("Mask-Dataset/No_Mask"):
    if name.endswith(".DS_Store"):
        continue
    no_mask_d.append("Mask-Dataset/No_Mask/" + name)

# Masked images come first (label 1), followed by unmasked images (label 0).
images = mask_d + no_mask_d
labels = [1] * len(mask_d) + [0] * len(no_mask_d)

print(len(images),len(labels))

import pandas as pd
dataset_columns = {"image_path": images, "labels": labels}
df = pd.DataFrame(dataset_columns)
df.to_csv("dataset.csv", index=False)

### Splitting Train and Test Files

In [None]:
import numpy as np
# Seed both sources of randomness so the same rows land in train/test on
# every run over the same dataset.csv.
split_rng = np.random.default_rng(42)

# Each row independently goes to train with probability 0.8, so the split is
# approximately (not exactly) 80/20.
msk = split_rng.random(len(df)) < 0.8

# Shuffle the rows so the two classes are interleaved in the output files.
ds = df.sample(frac=1, random_state=42)

# A boolean NumPy array selects rows by position, independent of the
# shuffled index labels.
train = ds[msk]
test = ds[~msk]

train.to_csv("train.csv",index=False)
test.to_csv("test.csv",index=False)