In [102]:
import scipy.io
import pandas as pd
import numpy as np
import os
import torchvision.transforms as tt
import PIL

In [2]:
# Load the IMDB-crop metadata file; the result is a mapping keyed by variable name.
imdb_mat_file = scipy.io.loadmat("C:/face_dataset/imdb_crop/imdb.mat")
# List the top-level keys to find the entry that holds the actual data.
print(imdb_mat_file.keys())

dict_keys(['__header__', '__version__', '__globals__', 'imdb'])


In [3]:
# Take element [0, 0] of the 'imdb' entry — presumably a 1x1 struct array whose
# single record carries every metadata field (TODO confirm the shape).
imdb_data = imdb_mat_file['imdb'][0, 0]
# Show the field names available on that record.
print(imdb_data.dtype.names)

('dob', 'photo_taken', 'full_path', 'gender', 'name', 'face_location', 'face_score', 'second_face_score', 'celeb_names', 'celeb_id')


In [4]:
def create_df_from_mat(data, fields):
    """Build a DataFrame with one column per requested field of ``data``.

    Args:
        data: Object indexable by field name (``data[field]``) whose values are
            NumPy arrays, e.g. a record taken from a ``scipy.io.loadmat`` result.
        fields: Iterable of field names; each becomes a column, in this order.

    Returns:
        pandas.DataFrame with one row per entry of the field arrays.

    Notes:
        Text-like fields (dtype kind ``O``, ``U`` or ``S``) are converted to
        plain strings. An entry that is itself an array contributes the string
        form of its first element, or ``""`` when that array is empty.
    """
    df_dict = {}

    for field in fields:
        # squeeze() drops singleton axes; atleast_1d() restores one axis when the
        # data holds a single record, so the column stays iterable and
        # DataFrame construction does not fail on all-scalar input.
        field_data = np.atleast_1d(data[field].squeeze())

        if field_data.dtype.kind in {'O', 'U', 'S'}:
            field_data = np.array([
                (str(item[0]) if item.size > 0 else "")
                if isinstance(item, np.ndarray)
                else str(item)
                for item in field_data
            ])

        df_dict[field] = field_data

    return pd.DataFrame(df_dict)

In [103]:
# Subset of the metadata fields to pull into a DataFrame.
fields = ['full_path', 'name', 'face_score', 'second_face_score', 'celeb_id']
imdb_df = create_df_from_mat(imdb_data, fields)
# Bare last expression so the notebook renders the frame.
imdb_df

Unnamed: 0,full_path,name,face_score,second_face_score,celeb_id
0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,1.459693,1.118973,6488
1,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,2.543198,1.852008,6488
2,01/nm0000001_rm577153792_1899-5-10_1968.jpg,Fred Astaire,3.455579,2.985660,6488
3,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,1.872117,,6488
4,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,1.158766,,6488
...,...,...,...,...,...
460718,08/nm3994408_rm761245696_1989-12-29_2011.jpg,Jane Levy,3.845884,,8410
460719,08/nm3994408_rm784182528_1989-12-29_2011.jpg,Jane Levy,-inf,,8410
460720,08/nm3994408_rm926592512_1989-12-29_2011.jpg,Jane Levy,-inf,,8410
460721,08/nm3994408_rm943369728_1989-12-29_2011.jpg,Jane Levy,4.450725,,8410


In [107]:
# Keep rows with a positive face score and no second face: a missing
# second_face_score is treated as 0, and only rows where it equals 0 survive.
clean_imdb = (
    imdb_df.loc[imdb_df['face_score'] > 0]
    .fillna({'second_face_score': 0})
    .loc[lambda frame: frame['second_face_score'] == 0]
)
clean_imdb

Unnamed: 0,full_path,name,face_score,second_face_score,celeb_id
3,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,1.872117,0.0,6488
4,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,1.158766,0.0,6488
6,02/nm0000002_rm1346607872_1924-9-16_2004.jpg,Lauren Bacall,3.479189,0.0,11516
7,02/nm0000002_rm1363385088_1924-9-16_2004.jpg,Lauren Bacall,3.870171,0.0,11516
12,02/nm0000002_rm221957120_1924-9-16_1974.jpg,Lauren Bacall,4.096431,0.0,11516
...,...,...,...,...,...
460715,08/nm3994408_rm73386752_1989-12-29_2011.jpg,Jane Levy,4.801987,0.0,8410
460716,08/nm3994408_rm744468480_1989-12-29_2011.jpg,Jane Levy,2.370091,0.0,8410
460718,08/nm3994408_rm761245696_1989-12-29_2011.jpg,Jane Levy,3.845884,0.0,8410
460721,08/nm3994408_rm943369728_1989-12-29_2011.jpg,Jane Levy,4.450725,0.0,8410


In [None]:
def transform_image(input_image, new_image_size):
    """Run ``input_image`` through ``tt.Resize`` and then ``tt.CenterCrop``.

    Both steps receive ``new_image_size``. Per the torchvision documentation,
    an integer size makes Resize scale the shorter side to that length and
    CenterCrop cut a square of that side.

    Returns:
        The transformed image produced by the composed pipeline.
    """
    steps = [tt.Resize(new_image_size), tt.CenterCrop(new_image_size)]
    pipeline = tt.Compose(steps)
    return pipeline(input_image)

def save_image(file_path_end, sourse_dir, target_dir, celeb_id, image_index, new_image_size):
    """Load one source image, resize/crop it, and save it in the person's folder.

    Args:
        file_path_end: Image path relative to ``sourse_dir``.
        sourse_dir: Root directory of the source images.
        target_dir: Root directory of the output; the image is written to
            ``<target_dir>/<celeb_id>/<image_index>.jpg``.
        celeb_id: Identifier used as the sub-directory name.
        image_index: Number used as the output file name (without extension).
        new_image_size: Size forwarded to ``transform_image``.

    The ``<target_dir>/<celeb_id>`` directory must already exist.
    """
    # os.path.join works with or without a trailing separator on the directory
    # arguments and matches how the per-person directories are created.
    full_file_path = os.path.join(sourse_dir, file_path_end)
    output_path = os.path.join(target_dir, str(celeb_id), f"{image_index}.jpg")

    # convert() returns a new in-memory image, so the source file can be closed
    # as soon as the conversion is done.
    with PIL.Image.open(full_file_path) as source_image:
        image = source_image.convert("RGB")

    resized_image = transform_image(image, new_image_size)
    resized_image.save(output_path)


def make_person_directories(describe_df, sourse_dir, target_dir, new_image_size=256):
    """Create one directory per ``celeb_id`` and save that person's images into it.

    Args:
        describe_df: DataFrame with at least ``celeb_id`` and ``full_path`` columns.
        sourse_dir: Root directory of the source images.
        target_dir: Root directory under which ``<celeb_id>/`` folders are created.
        new_image_size: Size forwarded to ``save_image`` (default 256).

    Images of each person are numbered from 1 in the order their rows appear in
    ``describe_df``.
    """
    # A single groupby pass replaces re-filtering the whole frame once per person.
    # sort=False keeps people in order of first appearance in the frame.
    for celeb_id, person_data in describe_df.groupby('celeb_id', sort=False):
        person_dir = os.path.join(target_dir, str(celeb_id))
        os.makedirs(person_dir, exist_ok=True)

        for index, file_path_end in enumerate(person_data['full_path'], start=1):
            save_image(file_path_end, sourse_dir, target_dir, celeb_id, index,
                       new_image_size=new_image_size)

In [94]:
# Input and output roots. The trailing slashes are kept so the paths work with
# plain string concatenation as well as with os.path.join.
sourse_directory = "C:/face_dataset/imdb_crop/"
target_directory = "C:/face_dataset/faces_from_imdb3/"

# Writes one folder per celeb_id under target_directory. NOTE(review): the
# recorded run of this cell ended with KeyboardInterrupt, so the output on disk
# may be incomplete.
make_person_directories(clean_imdb, sourse_directory, target_directory)

KeyboardInterrupt: 

In [95]:
# Rows of the cleaned frame for celeb_id 1358. NOTE(review): the recorded output
# lists these rows as "Antonio Banderas", so the `jim_df` name looks misleading.
jim_df = clean_imdb[clean_imdb['celeb_id'] == 1358]

In [96]:
# Show the subset from position 80 onward (a positional slice, not index labels).
jim_df[80:]

Unnamed: 0,full_path,name,face_score,second_face_score,celeb_id
4915,04/nm0000104_rm657102848_1960-8-10_2003.jpg,Antonio Banderas,3.216419,0.0,1358
4917,04/nm0000104_rm67088896_1960-8-10_2011.jpg,Antonio Banderas,4.521055,0.0,1358
4919,04/nm0000104_rm685553920_1960-8-10_2011.jpg,Antonio Banderas,3.317839,0.0,1358
4920,04/nm0000104_rm703961344_1960-8-10_1998.jpg,Antonio Banderas,0.741446,0.0,1358
4921,04/nm0000104_rm713201664_1960-8-10_2004.jpg,Antonio Banderas,2.802747,0.0,1358
4927,04/nm0000104_rm787847424_1960-8-10_1998.jpg,Antonio Banderas,2.298362,0.0,1358
4929,04/nm0000104_rm797880832_1960-8-10_2011.jpg,Antonio Banderas,3.238349,0.0,1358
4930,04/nm0000104_rm851807232_1960-8-10_2007.jpg,Antonio Banderas,1.775926,0.0,1358
4935,04/nm0000104_rm886880512_1960-8-10_2011.jpg,Antonio Banderas,3.677286,0.0,1358
4936,04/nm0000104_rm894801920_1960-8-10_1995.jpg,Antonio Banderas,1.794547,0.0,1358
