# **Exploratory Data Analysis (EDA)**

The Face Mask dataset contains 853 images belonging to 3 classes, together with their bounding boxes in PASCAL VOC format. With this dataset, it is possible to train a model that detects people who are wearing a mask, not wearing one, or wearing one improperly.

The classes are:

- With mask;
- Without mask;
- Mask worn incorrectly.

In [1]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import ast
import os

In [None]:
def _regular_files(directory):
    """Return the names of the entries in *directory* that are regular files."""
    names = []
    for entry in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, entry)):
            names.append(entry)
    return names


# Images and their annotation files live in two sibling folders.
folder_path = "dataset/images"
images_files = _regular_files(folder_path)
folder_path = "dataset/annotations"
annotation_files = _regular_files(folder_path)
print(f"There are {len(images_files)} images and {len(annotation_files)} annotation files")

In [3]:

columns = ['img_name', 'id', 'boxes', 'label', 'width', 'height']
# Corner coordinates read from each <bndbox> element, in the order they are stored.
bbox_fields = ('xmin', 'ymin', 'xmax', 'ymax')

# Collect one record per annotation file and build the DataFrame once at the
# end; enlarging a DataFrame row by row with .loc copies data on every insert.
records = []
for annotation_file in annotation_files:
    path = "dataset/annotations/" + annotation_file
    root = ET.parse(path).getroot()

    img_name = root.find('filename').text
    # File name up to the first dot (named image_id to avoid shadowing the builtin id).
    image_id = img_name.split('.')[0]
    image_width = int(root.find('size/width').text)
    image_height = int(root.find('size/height').text)

    # Extract the details of each annotated object: one bounding box and one
    # class name per <object> element, kept in two parallel lists.
    objects = []
    labels = []
    for obj in root.findall('object'):
        bbox = obj.find('bndbox')
        objects.append({field: int(bbox.find(field).text) for field in bbox_fields})
        labels.append(obj.find('name').text)

    records.append((img_name, image_id, objects, labels, image_width, image_height))

# One row per annotation file, indexed 0..n-1 in the order of annotation_files.
dataset = pd.DataFrame(records, columns=columns)


In [None]:
# Preview the first rows of the table built from the annotation files.
dataset.head()

In [None]:
# Folder prefix that is prepended to every image file name.
TRAIN_PATH = 'dataset/images/'
# Derive each image's path from its file name.
dataset['path'] = dataset['img_name'].map(lambda name: TRAIN_PATH + str(name))
# Preview the table with the new column.
dataset.head()

In [None]:
# Numeric class id assigned to each annotation class name.
label_mapping = {
    'with_mask': 0,
    'without_mask': 1,
    'mask_weared_incorrect': 2
}


def map_values_to_labels(input_list, label_mapping):
    """Translate every value of *input_list* through *label_mapping*.

    Returns a new list with ``label_mapping[value]`` for each value, in the
    same order. Raises ``KeyError`` if a value is not a key of the mapping.
    """
    return [label_mapping[value] for value in input_list]


def mapping(row):
    """Return the numeric class ids for the ``label`` list of one dataset row."""
    return map_values_to_labels(row.label, label_mapping)


# Per-image numeric labels, parallel to the 'label' and 'boxes' lists.
dataset['numeric_labels'] = dataset.apply(mapping, axis=1)

# Annotation counts and the boolean flags derived from them.
dataset['n_annotations'] = dataset['boxes'].apply(len)
dataset['has_annotations'] = dataset['n_annotations'] > 0
dataset['has_2_or_more_annotations'] = dataset['n_annotations'] >= 2
dataset['doesnt_have_annotations'] = dataset['n_annotations'] == 0
dataset.head()

In [None]:
# One row per image name with a 0/1 flag telling whether any of its rows has
# annotations; this table is what the train/validation split is stratified on.
df_split = (
    dataset
    .groupby("img_name")[['has_annotations']]
    .max()
    .astype(int)
    .reset_index()
)
df_split.head()

In [8]:
def _safe_ratio(part, whole):
    """Return ``part / whole``, or NaN when *whole* is zero."""
    return part / whole if whole else float("nan")


def _count_annotated(frame):
    """Return how many rows of *frame* have a truthy ``has_annotations`` value."""
    return int(frame['has_annotations'].astype(bool).sum())


def analize_split(df_train, df_val, df):
    """Print summary statistics of a train/validation split.

    Args:
        df_train: rows assigned to the training subset.
        df_val: rows assigned to the validation subset.
        df: the full table both subsets were taken from.

    All three frames must have the ``has_annotations`` and ``n_annotations``
    columns. Ratios whose denominator is zero (empty ``df``, or no annotated
    rows at all) are printed as ``nan`` instead of raising ZeroDivisionError.
    """
    total_rows = len(df)
    total_annotated = _count_annotated(df)

    # Share of all rows that landed in each subset.
    print(f"   Train images                 : {_safe_ratio(len(df_train), total_rows):.3f}")
    print(f"   Val   images                 : {_safe_ratio(len(df_val), total_rows):.3f}")
    print()
    # Share of the annotated rows that landed in each subset.
    print(f"   Train images with annotations: {_safe_ratio(_count_annotated(df_train), total_annotated):.3f}")
    print(f"   Val   images with annotations: {_safe_ratio(_count_annotated(df_val), total_annotated):.3f}")
    print()
    # Average number of annotations per row in each subset.
    print(f"   Train mean annotations       : {df_train['n_annotations'].mean():.3f}")
    print(f"   Val   mean annotations       : {df_val['n_annotations'].mean():.3f}")
    print()

In [None]:
# Create the output folder for the split files. Unlike the `!mkdir` shell
# call, this works on every platform and is a no-op when the folder exists.
os.makedirs("train-validation-split", exist_ok=True)

In [None]:
# Train validation split
from sklearn.model_selection import train_test_split

split_dir = "train-validation-split"
# Make sure the output folder exists so that writing the CSV files does not
# depend on a separate mkdir step having been run first.
os.makedirs(split_dir, exist_ok=True)

for test_size in [0.05, 0.1, 0.2]:
    print(f"Generating train-validation split with {test_size*100}% validation")
    # Split the image names, stratified on the has_annotations flag; the fixed
    # random_state keeps the split reproducible.
    df_train_idx, df_val_idx = train_test_split(
        df_split['img_name'],
        stratify=df_split["has_annotations"],
        test_size=test_size,
        random_state=42,
    )
    # Flag every row whose image name was drawn into the training subset.
    dataset['is_train'] = dataset['img_name'].isin(df_train_idx)
    df_train, df_val = dataset[dataset['is_train']], dataset[~dataset['is_train']]

    analize_split(df_train, df_val, dataset)

    # Save to file: the CSV holds the whole dataset, and the is_train column
    # tells the training rows from the validation rows.
    f_name = f"{split_dir}/train-{test_size}.csv"
    print(f"Saving file to {f_name}")
    dataset.to_csv(f_name, index=False)
    print()

In [None]:
# Show example
from yolo_utils import load_image, show_img

# The split cell writes its CSV files under train-validation-split/, so the
# file has to be read from that folder rather than the working directory.
dataset = pd.read_csv('train-validation-split/train-0.1.csv')
i = 99
example = dataset.iloc[i]
img = load_image(example['path'])

# The list-valued columns were written to CSV as text, so they are parsed back
# into Python objects here.
bounding_boxes = ast.literal_eval(example['boxes'])
bboxes = [[box['xmin'], box['ymin'], box['xmax'], box['ymax']] for box in bounding_boxes]
names = ast.literal_eval(example['label'])
labels = ast.literal_eval(example['numeric_labels'])
display(show_img(img, bboxes=bboxes, names=names, labels=labels, confs=None, bbox_format='voc_pascal', show_classes=True))

In [None]:
# Show example detected by Yolo
import torch
from yolo_utils import load_image, predict, show_img

# Load the custom checkpoint through torch.hub from the local 'yolov5' source.
ckpt_path = 'weights/face_mask_yolov5l'
model = torch.hub.load(
    'yolov5',
    'custom',
    path=ckpt_path,
    source='local',
    force_reload=True,
)

# NMS confidence threshold and NMS IoU threshold.
conf = 0.5
iou = 0.5
model.conf, model.iou = conf, iou

# Run the model on one image and draw the predicted boxes.
path = "yolo_dataset/images/valid/maksssksksss696.png"
img = load_image(path)
bboxes, confis, names, labels = predict(model, img, size=640)
display(
    show_img(
        img,
        bboxes=bboxes,
        names=names,
        labels=labels,
        confs=None,
        bbox_format='voc_pascal',
        show_classes=True,
    )
)