In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# suppress backend warnings
%matplotlib tk
# change the backend according to your OS

# Algorithm

Let's build an algorithm to

- **Proximity detection**: Detect if you're too close or too far from the computer screen. This feature will **take care of your eyes**
- **Posture detection**: Detect if you're in a bad posture (not sitting correctly while using the computer, or the computer is in a bad place)
    + By checking if your face is within the camera range
    + By checking how much area of the camera size and screen size is your face occupying

## Proximity detection

This feature consists of 2 phases

- Camera calibration
- Distance calculation

### Camera calibration

This calibration method is very simple and not very accurate,
but it is good enough for the purposes of the application.

The following image illustrates the basics of an ideal pinhole camera (that's why this method is inaccurate: it does not take into account lens distortion or other effects)

![Camera model](resources/cam_model.jpg)

**Note**: The calculated focal length $f$ may not be the **real** focal length. Remember, this algorithm is **not precise**.

From the image above, the only variables we may know are:

- $H_P$: Can be calculated by simply identifying the top and bottom of the projected rule
- $H_R$ and $d$: We can ask the user to take some pictures of their face at some specified distances (just in the calibration phase), and also ask for their face height (or simply use the average of 23cm).

With all that we can obtain the focal length by triangle similarity:

$
\begin{equation}
    \frac{f}{H_P} = \frac{d}{H_R}
\end{equation}
$

$
\begin{equation}
    f = \frac{H_P d}{H_R}
\end{equation}
$

But why do we want to compute the _focal length_ if we want to know the users' proximity/distance to the computer?

Because once we know the $f$ parameter, we can simply solve the equation for $d$:

$
\begin{equation}
    d = f \cdot \frac{H_R}{H_P}
\end{equation}
$

Or,

$
\begin{equation}
    f = d \cdot \frac{H_P}{H_R}
\end{equation}
$

Overall:

- Compute $f$ in the "calibration" phase.
- Later on, use $f$ to calculate the distance from the camera to the face of the user.

### Implementation

OpenCV will only detect the face starting from the eyebrows to the end of the lips.

It will not detect the forehead, hair or chin, as shown in the following figure

![face.jpg](resources/face.jpg)

The green rectangle encloses the "face" detected by OpenCV.

The average height of the rectangle in real life is $12$ cm, but we can ask the user for their own measurement to increase the precision.

In [2]:
# Load global stuff
# NOTE: despite the variable name, the file name suggests this is an LBP cascade.
haar_cascade_face = cv2.CascadeClassifier("lbpcascade_frontalface_improved.xml")
if haar_cascade_face.empty():
    # An unloaded classifier would only fail later, inside detectMultiScale,
    # with a much less helpful error message; fail here instead.
    raise RuntimeError(
        "Could not load 'lbpcascade_frontalface_improved.xml'; "
        "make sure the file is in the notebook's working directory"
    )
face_height_cm = 12 # the user can change this value

In [3]:
# Just a util function to show and detect the user face
def take_face_pic(video):
    """Show the camera feed until the user confirms a frame with exactly one face.

    Every frame read from ``video`` is passed through the face cascade and
    displayed in the "original" window. When exactly one face is detected it is
    outlined in green; otherwise a warning text is drawn on the frame. The
    function returns once the user presses 'q' or 'm' on a frame in which
    exactly one face was detected.

    Args:
        video: capture object whose ``read()`` returns ``(ok, frame)``
            (e.g. ``cv2.VideoCapture``).

    Returns:
        The ``(x, y, w, h)`` rectangle of the face detected in the frame that
        was being processed when the key press was registered.

    Raises:
        RuntimeError: if a frame cannot be read from ``video``.
    """
    while True:
        grabbed, frame = video.read()
        if not grabbed:
            # Without this check a failed read would hand None to cvtColor.
            raise RuntimeError("Could not read a frame from the camera")

        key_pressed = cv2.waitKey(1) & 0xFF
        detected_face_rect = None

        # face detection
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces_rects = haar_cascade_face.detectMultiScale(gray_frame, scaleFactor=1.2, minNeighbors=5)

        if len(faces_rects) == 1:  # draw the rectangle
            detected_face_rect = faces_rects[0]
            (x, y, w, h) = detected_face_rect
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        else:
            message = "Multiple faces detected" if len(faces_rects) > 1 else "No face detected"
            # frame.shape is (rows, cols, channels), i.e. the height comes first
            frame_height = frame.shape[0]
            cv2.putText(frame, message, (0, frame_height // 2 - 50), cv2.FONT_HERSHEY_PLAIN, 4,
                        (0, 0, 255))

        cv2.imshow("original", frame)

        # A measurement can only be taken on a frame with exactly one face
        can_quit = detected_face_rect is not None
        if can_quit and (key_pressed == ord('q') or key_pressed == ord('m')):  # q: quit, m: measure
            return detected_face_rect

### Distance calculation

The following code is the implementation of the algorithm. In summary the code will do the following:

- Take pictures at some specific distances to calculate the focal length $f = d \cdot \frac{H_P}{H_R}$. At each distance $f$ will be slightly different, so take the average value.
- Repeat that process to get more values of $f$ (the more values we have, the more precise $f$ will be, and therefore the more precise the results will be)

**Note 1**: This only works for values close to ~40cm. The real function is non-linear and the approximation is linear. But that is good because SpineWare will warn the user if he is too close (let's say < 50cm) to the screen

**Note 2**: Just remember $f$ is not the real focal length, it is just the ideal focal length.

In [4]:
def get_focal_length(video=None):
    """Estimate the (ideal pinhole) focal length from face pictures at known distances.

    The user is asked to place their face at each distance in ``distances``
    and confirm a measurement; this is repeated for 3 rounds. For each round
    the focal length is the mean of ``face_height_px * distance / face_height_cm``
    over the distances, and the returned value is the mean over the rounds.

    Args:
        video: capture object passed to ``take_face_pic``. If ``None``, the
            default camera (index 0) is opened here and released before
            returning.

    Returns:
        The averaged focal length.
    """
    owns_video = video is None
    if owns_video:
        video = cv2.VideoCapture(0)

    distances = [30, 40, 50, 60, 70]
    focal_lengths = []

    try:
        for _ in range(3):
            # Start every round with an empty list: otherwise zip() below keeps
            # pairing the distances with the measurements of the first round
            # only, and the later rounds are silently ignored.
            face_heights_px = []

            for distance in distances:
                print("\n-----------")
                print(f"Take a picture of your face at a distance of {distance}cm from the camera")
                print("Then, press 'm' (or 'q') while your face is outlined in green to take the measurement")

                (_, _, _, face_height_px) = take_face_pic(video)

                face_heights_px.append(face_height_px)

            # compute an average value for the focal length of this round
            focal_length_values = [
                face_height_px * distance / face_height_cm
                for distance, face_height_px in zip(distances, face_heights_px)
            ]
            focal_lengths.append(sum(focal_length_values) / len(focal_length_values))
    finally:
        # Only release the camera if it was opened here, even on error
        if owns_video:
            video.release()

    return sum(focal_lengths) / len(focal_lengths)

In [5]:
# Function to calculate the distance in real time
def real_time_distance(video, focal_length):
    """Display the camera feed annotated with the estimated face distance.

    For every frame with exactly one detected face, the distance is computed as
    ``focal_length * face_height_cm / h`` (``h`` being the height in pixels of
    the detected rectangle) and drawn on the frame. Otherwise a warning text is
    drawn. The loop ends when the user presses 'q' or 'm'.

    Args:
        video: capture object whose ``read()`` returns ``(ok, frame)``
            (e.g. ``cv2.VideoCapture``).
        focal_length: focal length as computed by the calibration phase.

    Raises:
        RuntimeError: if a frame cannot be read from ``video``.
    """
    while True:
        grabbed, frame = video.read()
        if not grabbed:
            # Without this check a failed read would hand None to cvtColor.
            raise RuntimeError("Could not read a frame from the camera")

        key_pressed = cv2.waitKey(1) & 0xFF

        # face detection
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces_rects = haar_cascade_face.detectMultiScale(gray_frame, scaleFactor=1.2, minNeighbors=5)

        if len(faces_rects) == 1:  # draw the rectangle
            (x, y, w, h) = faces_rects[0]
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            distance = focal_length * face_height_cm / h
            cv2.putText(frame, str(distance // 1) + "cm", (20, 50), cv2.FONT_HERSHEY_PLAIN, 4, (255, 0, 0))
        else:
            message = "Multiple faces detected" if len(faces_rects) > 1 else "No face detected"
            # frame.shape is (rows, cols, channels), i.e. the height comes first
            frame_height = frame.shape[0]
            cv2.putText(frame, message, (0, frame_height // 2 - 50), cv2.FONT_HERSHEY_PLAIN, 4,
                        (0, 0, 255))

        cv2.imshow("original", frame)

        if key_pressed == ord('q') or key_pressed == ord('m'):  # q: quit, m: measure
            break

In [11]:
video = cv2.VideoCapture(0)

try:
    # Previously calibrated value; replace the literal with
    # get_focal_length(video) to run the calibration phase instead.
    focal_length = 721  # get_focal_length(video)
    print(f"Focal length is: {focal_length}")

    # Use the variable (not a second literal) so a recalibrated value is honoured
    real_time_distance(video, focal_length)
finally:
    # Free the camera and close the preview window even if something fails
    video.release()
    cv2.destroyAllWindows()

Focal length is: 721


With $f$ calculated we can substitute values in the equation

$
\begin{equation}
    d = f \frac{H_R}{H_P}
\end{equation}
$

Just as an example, I'll plug in the computed focal length $f$ of my camera.

$
\begin{equation}
    d = 721 \frac{H_R}{H_P}
\end{equation}
$

We can plot the function to see what it looks like

In [7]:
# Sample the two inputs of d = f * H_R / H_P
height_projected = np.linspace(50, 1080, num=200)  # H_P: from 50px to 1080px
real_height = np.linspace(8, 16, num=200)  # H_R: from 8cm to 16cm

X, Y = np.meshgrid(real_height, height_projected)

# Z = distance, using the example focal length f = 721
Z = (721 * X) / Y

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.contour3D(X, Y, Z, 200)
ax.set(
    title="Distance as a function of object's real height(cm) and height projected(px)",
    xlabel="Real Height (cm)",
    ylabel="Height projected (px)",
    zlabel="Distance (cm)",
)
plt.show()

As you can see from the plot, everything makes sense, but from the plot we can also conclude that the algorithm may not be accurate for very far away distances, or very close distances.

Since SpineWare does not need an extreme accuracy, it is ok to use this algorithm.

Actually with the `real_time_distance` you can do some tests and see how the algorithm gives correct values for distances between 30cm and ~80cm, but not for values out of that range

## Posture detection

Detect if you're in a bad posture (not sitting correctly while using the computer, or the computer is in a bad place)

- By checking if your face is within some boundaries
- By checking how much area of the camera size and screen size is your face occupying

### Check if the face is within boundaries

This is very simple, the following image illustrates the user having a **good posture**.

The black rectangle illustrates the camera size, blue lines are the boundaries.

![good posture](resources/posture_good.jpg)

And the following image illustrates the user having a **bad posture**

![bad posture](resources/posture_bad.jpg)

Because the user's face is not within the set boundaries

In [8]:
# Fraction p of the camera frame (per axis) used to place the posture
# boundaries: they are drawn at (1 - p) and p of the frame width/height, and
# the face must stay between them.
# these values should be between 0.55 and 0.99
cam_width_acceptable_percentage = 0.9
cam_height_acceptable_percentage = 0.9

In [9]:
video = cv2.VideoCapture(0)

try:
    cam_width = round(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    cam_height = round(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # get the min and max coords
    min_acceptable_x = round((1 - cam_width_acceptable_percentage) * cam_width)
    max_acceptable_x = round(cam_width_acceptable_percentage * cam_width)
    min_acceptable_y = round((1 - cam_height_acceptable_percentage) * cam_height)
    max_acceptable_y = round(cam_height_acceptable_percentage * cam_height)

    while True:
        grabbed, frame = video.read()
        if not grabbed:
            # Without this check a failed read would hand None to cvtColor.
            raise RuntimeError("Could not read a frame from the camera")

        key_pressed = cv2.waitKey(1) & 0xFF

        # frame.shape is (rows, cols, channels), i.e. the height comes first.
        # Computed for every frame so that all branches below can use it.
        frame_height, frame_width = frame.shape[:2]
        text_position = (0, frame_height // 2 - 50)

        # face detection (done before drawing anything, so the overlay is not
        # part of the image given to the detector)
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces_rects = haar_cascade_face.detectMultiScale(gray_frame, scaleFactor=1.2, minNeighbors=5)

        # draw the boundaries
        cv2.line(frame, (min_acceptable_x, 0), (min_acceptable_x, cam_height), (255, 0, 0), 5)
        cv2.line(frame, (max_acceptable_x, 0), (max_acceptable_x, cam_height), (255, 0, 0), 5)
        cv2.line(frame, (0, min_acceptable_y), (cam_width, min_acceptable_y), (255, 0, 0), 5)
        cv2.line(frame, (0, max_acceptable_y), (cam_width, max_acceptable_y), (255, 0, 0), 5)

        if len(faces_rects) == 1:  # draw the rectangle
            (x, y, w, h) = faces_rects[0]
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

            out_of_range = (
                x < min_acceptable_x
                or x + w > max_acceptable_x
                or y < min_acceptable_y
                or y + h > max_acceptable_y
            )
            if out_of_range:
                cv2.putText(frame, "Face not in range", text_position, cv2.FONT_HERSHEY_PLAIN, 4,
                            (50, 50, 255))
        elif len(faces_rects) > 1:
            cv2.putText(frame, "Multiple faces detected", text_position, cv2.FONT_HERSHEY_PLAIN, 4,
                        (0, 0, 255))
        else:
            cv2.putText(frame, "No face detected", text_position, cv2.FONT_HERSHEY_PLAIN, 4,
                        (0, 0, 255))

        cv2.imshow("original", frame)

        if key_pressed == ord('q') or key_pressed == ord('m'):  # q: quit, m: measure
            break
finally:
    # Free the camera and close the preview window even if something fails
    video.release()
    cv2.destroyAllWindows()

This step is pretty simple and straightforward, isn't it? I believe it requires no explanation

### Check if the face is not receiving too much light from the screen

![camera and screen sizes](resources/camera_screen_sizes.jpg)

$W_p$: Width projected in px

$W_c$: Width of the camera in px

$W_s$: Width of the screen in px

From the illustration above we can calculate number of pixels of the screen occupied by the face along the $x$ axis $W_r$, and the number of pixels of the screen occupied by the face along the $y$ axis $H_r$.

To obtain those values, we can simply scale the projected width and height by the ratio between the screen size and the camera size, $\frac{W_s}{W_c}$ and $\frac{H_s}{H_c}$ respectively (with $H_p$, $H_c$ and $H_s$ being the analogous heights): $W_r = W_p \cdot \frac{W_s}{W_c}$ and $H_r = H_p \cdot \frac{H_s}{H_c}$.

For example, if $\frac{W_s}{W_c}$ is $1.6$ it means each px in the camera corresponds to $1.6$ pixels in the screen. **Note**: This is not a "realistic" correspondence between those measurements and it is useful just for this particular case.

$H_r$ and $W_r$ are inside the boundaries

$0 \leq W_r \leq W_s$

$0 \leq H_r \leq H_s$

With that we can tell that the user is at a safe distance if $\frac{W_r}{W_s} \leq T_w$ and $\frac{H_r}{H_s} \leq T_h$

Where

- $T_h \in (0, 1)$: Height threshold
- $T_w \in (0, 1)$: Width threshold

In [10]:
video = cv2.VideoCapture(0)

try:
    cam_width = round(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    cam_height = round(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # TODO: obtain these values programmatically
    screen_width = 1280
    screen_height = 720

    # Number of screen pixels that correspond to one camera pixel, per axis
    cam_screen_height_ratio = screen_height / cam_height
    cam_screen_width_ratio = screen_width / cam_width

    # Maximum fraction of the screen the face may occupy on each axis
    width_thresh = 0.4
    height_thresh = 0.4

    while True:
        grabbed, frame = video.read()
        if not grabbed:
            # Without this check a failed read would hand None to cvtColor.
            raise RuntimeError("Could not read a frame from the camera")

        key_pressed = cv2.waitKey(1) & 0xFF

        # frame.shape is (rows, cols, channels), i.e. the height comes first.
        # Computed for every frame so that all branches below can use it.
        frame_height, frame_width = frame.shape[:2]
        text_position = (0, frame_height // 2 - 50)

        # face detection
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces_rects = haar_cascade_face.detectMultiScale(gray_frame, scaleFactor=1.2, minNeighbors=5)

        if len(faces_rects) == 1:  # draw the rectangle
            (x, y, w, h) = faces_rects[0]
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

            # Fraction (0..1) of the screen width/height occupied by the face
            face_screen_width_ratio = w * cam_screen_width_ratio / screen_width
            face_screen_height_ratio = h * cam_screen_height_ratio / screen_height

            if face_screen_width_ratio >= width_thresh or face_screen_height_ratio >= height_thresh:
                text_color = (0, 0, 255)
            else:
                text_color = (255, 255, 255)

            # The ratios are fractions, so scale them by 100 before labelling them "%"
            ratio_text = f"{face_screen_width_ratio * 100:.0f}% w, {face_screen_height_ratio * 100:.0f}% h"
            cv2.putText(frame, ratio_text, text_position, cv2.FONT_HERSHEY_PLAIN, 4, text_color)
        elif len(faces_rects) > 1:
            cv2.putText(frame, "Multiple faces detected", text_position, cv2.FONT_HERSHEY_PLAIN, 4,
                        (0, 0, 255))
        else:
            cv2.putText(frame, "No face detected", text_position, cv2.FONT_HERSHEY_PLAIN, 4,
                        (0, 0, 255))

        cv2.imshow("original", frame)

        if key_pressed == ord('q') or key_pressed == ord('m'):  # q: quit, m: measure
            break
finally:
    # Free the camera and close the preview window even if something fails
    video.release()
    cv2.destroyAllWindows()

**LaTeX area**

$H_R$

$H_P$

$d$

$f$

$W_s$

$W_c$

$W_p$