In [1]:
import os

import numpy as np
import pandas as pd
import PIL
import PIL.Image
import scipy
import scipy.io
import torchvision.transforms as tt
from tqdm import tqdm

In [None]:
# Load the metadata file and unwrap the 'imdb' entry; indexing with [0, 0]
# picks a single record out of a 2-D container (presumably a 1x1 MATLAB
# struct array - confirm against the dataset), whose named fields are printed.
imdb_mat_file = scipy.io.loadmat(file_name="C:/face_dataset/imdb_crop/imdb.mat")
imdb_struct = imdb_mat_file['imdb']
imdb_data = imdb_struct[0, 0]
print(imdb_data.dtype.names)

In [3]:
def _mat_cell_to_str(item):
    """Return the text held by one cell of an object/string field.

    Array-valued cells yield their first element in flat order (so nested
    shapes such as (1,), (1, 1) or 0-d all work); empty arrays yield "".
    Anything that is not an ndarray is passed through ``str``.
    """
    if isinstance(item, np.ndarray):
        return str(item.flat[0]) if item.size > 0 else ""
    return str(item)


def create_df_from_mat(data, fields):
    """Build a DataFrame with one column per requested field of ``data``.

    Parameters
    ----------
    data : mapping-like
        Object indexable by field name (e.g. a NumPy structured record or a
        dict) whose values are NumPy arrays.
    fields : iterable of str
        Names of the fields to extract; each becomes a column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per record. Fields with an object/unicode/bytes dtype are
        converted to plain strings; other fields are used as-is after
        singleton dimensions are removed.
    """
    df_dict = {}

    for field in fields:
        # squeeze() collapses a single-record field to a 0-d array, which can
        # be neither iterated nor used as a DataFrame column; atleast_1d
        # restores a length-1 vector in that case and is a no-op otherwise.
        field_data = np.atleast_1d(data[field].squeeze())

        if field_data.dtype.kind in {'O', 'U', 'S'}:
            field_data = np.array([_mat_cell_to_str(item) for item in field_data])

        df_dict[field] = field_data

    return pd.DataFrame(df_dict)

In [None]:
# Columns pulled out of the metadata record for the cleaning steps below.
fields = ['full_path', 'name', 'face_score', 'second_face_score', 'celeb_id']
imdb_df = create_df_from_mat(data=imdb_data, fields=fields)
imdb_df

In [None]:
# Keep rows with a positive face_score, treat a missing second_face_score as 0,
# then keep only rows whose second_face_score is 0 (presumably: exactly one
# detected face per image - confirm against the dataset description).
clean_imdb = (
    imdb_df.loc[imdb_df['face_score'] > 0]
    .assign(second_face_score=lambda frame: frame['second_face_score'].fillna(0))
    .loc[lambda frame: frame['second_face_score'] == 0]
)
clean_imdb

In [None]:
# Count images per celebrity and keep only identities with at least 5 images,
# ordered by celeb_id.
number_of_each_celebrity = clean_imdb['celeb_id'].value_counts()
non_unique_celebs = number_of_each_celebrity.loc[lambda counts: counts >= 5].index
clean_imdb = (
    clean_imdb.loc[clean_imdb['celeb_id'].isin(non_unique_celebs)]
    .sort_values(by='celeb_id')
)
clean_imdb

In [None]:
# One summary row per celebrity: the name from the first row in which the id
# occurs, the id itself and its image count. drop_duplicates(keep='first')
# finds those first rows in a single pass and preserves order of appearance,
# replacing a full-column scan per celebrity.
first_row_per_celeb = clean_imdb.drop_duplicates(subset='celeb_id', keep='first')
dict_for_celeb_df = {
    "name": first_row_per_celeb['name'].tolist(),
    "id": first_row_per_celeb['celeb_id'].tolist(),
    "number_of_images": first_row_per_celeb['celeb_id'].map(number_of_each_celebrity).tolist(),
}
celeb_names_df = pd.DataFrame(dict_for_celeb_df)
celeb_names_df

In [11]:
# Persist the per-celebrity summary; the row index is written as the first column.
celeb_names_df.to_csv(path_or_buf='data/celeb_names.csv', index=True)

In [None]:
# Plain Python list of celebrity names, in the same order as celeb_names_df.
celeb_names_list = celeb_names_df.loc[:, 'name'].to_list()
celeb_names_list

In [15]:
# Write one celebrity name per line as UTF-8 text.
with open('../imdb_dataset/celeb_names.txt', 'w', encoding="utf-8") as names_file:
    for celeb_name in celeb_names_list:
        names_file.write(f"{celeb_name}\n")

In [16]:
def transform_image(input_image, new_image_size):
    """Resize ``input_image`` and then center-crop it, both with ``new_image_size``.

    The two torchvision transforms are applied in that order through a
    ``Compose`` pipeline and the transformed image is returned.
    """
    resize_step = tt.Resize(new_image_size)
    crop_step = tt.CenterCrop(new_image_size)
    pipeline = tt.Compose([resize_step, crop_step])
    return pipeline(input_image)

def save_image(file_path_end, sourse_dir, person_dir, image_index=1, new_image_size=256):
    """Load one source image, transform it and save it as ``<image_index>.jpg``.

    Parameters
    ----------
    file_path_end : str
        Path of the image relative to ``sourse_dir``.
    sourse_dir : str
        Root directory of the source images (with or without a trailing
        separator).
    person_dir : str
        Existing directory that receives the output file.
    image_index : int, default 1
        Used as the output file name (``f"{image_index}.jpg"``).
    new_image_size : int, default 256
        Size passed to ``transform_image`` (resize followed by center crop).
    """
    # os.path.join instead of raw concatenation, so a source directory without
    # a trailing separator still yields a valid path.
    full_file_path = os.path.join(sourse_dir, file_path_end)

    # The context manager closes the source file deterministically; convert()
    # returns an independent RGB image that stays usable afterwards.
    with PIL.Image.open(full_file_path) as source_image:
        image = source_image.convert("RGB")

    resized_image = transform_image(image, new_image_size)
    resized_image.save(os.path.join(person_dir, f"{image_index}.jpg"))


def make_person_directories(describe_df, sourse_dir, train_dir, test_dir,
                            max_train_images=100, new_image_size=256):
    """Export one train and one test directory of images per celebrity.

    Celebrities are renumbered 0, 1, 2, ... in order of first appearance of
    their ``celeb_id`` in ``describe_df``. For each one, the image with the
    highest ``face_score`` is saved as ``1.jpg`` in the test directory; the
    remaining images, in descending ``face_score`` order, are saved as
    ``1.jpg``, ``2.jpg``, ... in the train directory.

    Parameters
    ----------
    describe_df : pandas.DataFrame
        Must contain the columns ``celeb_id``, ``full_path`` and ``face_score``.
    sourse_dir : str
        Root directory that the ``full_path`` values are relative to.
    train_dir, test_dir : str
        Output roots; a sub-directory named after the new id is created in each.
    max_train_images : int or None, default 100
        Upper bound on train images saved per celebrity; ``None`` disables it.
    new_image_size : int, default 256
        Size forwarded to ``save_image`` for every exported image.

    Raises
    ------
    ValueError
        If ``max_train_images`` is negative.
    """
    if max_train_images is not None and max_train_images < 0:
        raise ValueError("max_train_images must be non-negative or None")

    # A single groupby replaces one full-frame boolean scan per celebrity.
    # sort=False keeps the groups in order of first appearance, i.e. the same
    # order as describe_df['celeb_id'].unique().
    people = describe_df.groupby('celeb_id', sort=False)

    # tqdm wraps the groups themselves (with a known total) so it can render a
    # real progress bar instead of a bare iteration counter.
    progress = tqdm(people, total=people.ngroups)
    for new_celeb_id, (_, person_rows) in enumerate(progress):
        person_dir_train = os.path.join(train_dir, str(new_celeb_id))
        person_dir_test = os.path.join(test_dir, str(new_celeb_id))
        os.makedirs(person_dir_train, exist_ok=True)
        os.makedirs(person_dir_test, exist_ok=True)

        ranked_paths = (
            person_rows.sort_values(by='face_score', ascending=False)['full_path'].tolist()
        )

        # Best-scored image is held out for testing.
        save_image(ranked_paths[0], sourse_dir, person_dir_test,
                   new_image_size=new_image_size)

        if max_train_images is None:
            train_paths = ranked_paths[1:]
        else:
            train_paths = ranked_paths[1:1 + max_train_images]

        for image_index, file_path_end in enumerate(train_paths, start=1):
            save_image(file_path_end, sourse_dir, person_dir_train, image_index,
                       new_image_size=new_image_size)

In [27]:
# Source and destination roots for the export.
sourse_directory = "C:/face_dataset/imdb_crop/"
train_directory = "C:/face_dataset/imdb_train_newindex/"
test_directory = "C:/face_dataset/imdb_test_newindex/"

# Write the per-celebrity train/test image folders for the cleaned metadata.
make_person_directories(
    clean_imdb,
    sourse_directory,
    train_dir=train_directory,
    test_dir=test_directory,
)

6380it [14:13,  7.48it/s]
