In [1]:
import pandas as pd
import os
from pathlib import Path

## Check whether a selfie is readable and landmarks are detected
There exists a script that produces a csv with all selfie paths and whether they are readable and if so it's landmarks (see src/process/greenlight_selfies.py).

In [None]:


def list_files_with_extension(directory, extension):
    files = []
    for filename in os.listdir(directory):
        if filename.endswith(extension):
            files.append(os.path.join(directory, filename))
    return files


def get_selfie_paths(save_dir: Path | str = "../../data/selfies"):
    selfies_dir = Path(save_dir)
    users_selfie_paths = []
    for user in selfies_dir.glob('*'):
        selfies = list_files_with_extension(user, '.jpg')
        users_selfie_paths.extend(selfies)
    return users_selfie_paths


def validate_selfie(img_path: Path | str):
    try:
        image = cv2.imread(img_path)
        if image is None:
            return img_path, False, "Unable to read the file as an image."
        else:
            return img_path, True, "File valid and can be read using cv2.imread()."
    except Exception as e:
        return img_path, False, f"Error: {str(e)}"
    

def validate_selfies(selfie_paths: list):
    validation_results = []
    with tqdm(desc="Validating selfies...", total=len(selfie_paths)) as pbar:
        with ThreadPoolExecutor(max_workers=40) as executor:
            futures = [
                executor.submit(
                    validate_selfie,
                    path
                )
                for path in selfie_paths
            ]
            for future in concurrent.futures.as_completed(futures):
                validation_results.append(future.result())
                pbar.update(1)
    return validation_results
# 

def get_facial_landmarks(image_path):
    try:
        # Initialize MediaPipe Face Mesh
        mp_face_mesh = mp.solutions.face_mesh
        face_mesh = mp_face_mesh.FaceMesh()

        # Read the image
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Error reading image at path: {image_path}")
        
        
        # Convert the image to RGB (MediaPipe Face Mesh requires RGB input)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Process the image and get face landmarks
    
        results = face_mesh.process(image_rgb)
        if results.multi_face_landmarks:
            landmarks_list = []
            for face_landmarks in results.multi_face_landmarks:
                landmarks = []
                for landmark in face_landmarks.landmark:
                    x, y, z = landmark.x, landmark.y, landmark.z
                    landmarks.append((x, y, z))
                landmarks_list.append(landmarks)
            del image
            return [image_path, landmarks_list[0], len(landmarks_list[0])]
    except:
        return None


# def get_all_landmarks(selfie_paths: list):
#     landmark_results = []
#     with tqdm(desc="Calculating landmarks...", total=len(selfie_paths)) as pbar:
#         with ThreadPoolExecutor(max_workers=40) as executor:
#             futures = [
#                 executor.submit(
#                     get_facial_landmarks,
#                     path
#                 )
#                 for path in selfie_paths
#             ]
#             for future in concurrent.futures.as_completed(futures):
#                 try:
#                     landmark_results.append(future.result())
#                 except:
#                     continue
#                 pbar.update(1)
#     return landmark_results


def process_image(image_path):
    try:
        result = get_facial_landmarks(image_path)
        return result
    except Exception as e:
        return f"Error processing image at path {image_path}: {str(e)}"

def get_all_landmarks(selfie_paths: list):
    landmark_results = []
    with tqdm(desc="Calculating landmarks...", total=len(selfie_paths)) as pbar:
        with ProcessPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(process_image, path) for path in selfie_paths]

            for future in as_completed(futures):
                try:
                    result = future.result()
                    if result is not None:
                        landmark_results.append(result)
                except Exception as e:
                    # Log or store information about the failed task
                    print(f"Error processing image: {str(e)}")

                pbar.update(1)

    return landmark_results

## Results of greenlight_selfies script

In [4]:
df_final = pd.read_csv('../../results/valid_selfies_w_landmark.csv')
df_final.head()

Unnamed: 0.1,Unnamed: 0,selfie_path,valid,error,landmarks,length
0,0,/home/azureuser/cloudfiles/code/Users/Franzisk...,True,File valid and can be read using cv2.imread().,"[(0.4631916880607605, 0.7439125776290894, -0.1...",478.0
1,1,/home/azureuser/cloudfiles/code/Users/Franzisk...,True,File valid and can be read using cv2.imread().,"[(0.5450685620307922, 0.7223218679428101, -0.0...",478.0
2,2,/home/azureuser/cloudfiles/code/Users/Franzisk...,True,File valid and can be read using cv2.imread().,"[(0.5130934715270996, 0.6828215718269348, -0.0...",478.0
3,3,/home/azureuser/cloudfiles/code/Users/Franzisk...,True,File valid and can be read using cv2.imread().,"[(0.5055005550384521, 0.6823481917381287, -0.0...",478.0
4,4,/home/azureuser/cloudfiles/code/Users/Franzisk...,True,File valid and can be read using cv2.imread().,"[(0.494416207075119, 0.6598946452140808, -0.11...",478.0


In [8]:
df_final.shape

(30660, 6)

In [10]:
{'number of readable selfies:': df_final.valid.sum()}

{'number of readable selfies:': 29786}

In [11]:
df_final['selfie'] = df_final['selfie_path'].apply(lambda x: Path(x).name)
df_final['user_id'] = df_final['selfie_path'].apply(lambda x: Path(x).parent.name)

In [None]:
# selfies of a specific user
df_final[df_final.user_id == '16071']

In [14]:
df_final['n_valid_of_user'] = df_final.groupby('user_id').valid.transform(lambda x: x.sum())
df_final['n_valid_landmarks_per_user'] = df_final.groupby('user_id').landmarks.transform(lambda x: x.notnull().sum())

In [15]:
# users 
df_complete_users = df_final[(df_final['n_valid_of_user']==3) & (df_final['n_valid_landmarks_per_user']==3)]
# df_complete_users.to_csv('user_completed.csv')
df_complete_users.user_id.nunique()

9318

In [17]:
# selfies that are readable but landmark detection failed
df_missing_landmarks = df_final[df_final.valid][df_final.landmarks.isnull()]
# df_missing_landmarks.to_csv('valid_selfies_missing_landmarks.csv')

  df_missing_landmarks = df_final[df_final.valid][df_final.landmarks.isnull()]


In [43]:
df_missing_landmarks['user_path'] = df_missing_landmarks.selfie_path.apply(lambda x : Path(x).parent.as_posix())

## Delete damaged selfies

In [42]:
# list of selfie path to be deleted
list_damaged_selfies = df_final[df_final.landmarks.isnull()].selfie_path.values
len(list_damaged_selfies)

919

In [43]:
def delete_files(file_paths):
    for file_path in file_paths:
        # Check for null characters in the file path
        if '\0' in file_path:
            print(f"Error: Null character found in file path: {file_path}")
            continue

        try:
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        except OSError as e:
            print(f"Error deleting {file_path}: {e}")

In [45]:
delete_files(list_damaged_selfies)

Deleted: /home/azureuser/cloudfiles/code/Users/Franziska.Ahrens/git/digital-twins/src/data/selfies/10196/2020-06-16_10196_2020-06-16 22:00:00.jpg
Deleted: /home/azureuser/cloudfiles/code/Users/Franziska.Ahrens/git/digital-twins/src/data/selfies/10014/2023-07-06_10014_00001950-0016-8866-2931-F8F005C1C48C.jpg
Deleted: /home/azureuser/cloudfiles/code/Users/Franziska.Ahrens/git/digital-twins/src/data/selfies/10156/2020-06-16_10156_2020-06-16 09:00:00.jpg
Deleted: /home/azureuser/cloudfiles/code/Users/Franziska.Ahrens/git/digital-twins/src/data/selfies/10031/2021-03-19_10031_00000093-0016-1618-0288-F8F005C20C6C.jpg
Deleted: /home/azureuser/cloudfiles/code/Users/Franziska.Ahrens/git/digital-twins/src/data/selfies/10152/2020-06-16_10152_2020-06-16 22:00:00.jpg
Deleted: /home/azureuser/cloudfiles/code/Users/Franziska.Ahrens/git/digital-twins/src/data/selfies/10117/2023-05-22_10117_00000768-0016-8454-3402-F8F005C20830.jpg
Deleted: /home/azureuser/cloudfiles/code/Users/Franziska.Ahrens/git/digit