In [1]:
import scipy
import pandas as pd
import numpy as np
import os
import torchvision.transforms as tt
import PIL
from tqdm import tqdm

In [2]:
imdb_mat_file = scipy.io.loadmat("C:/face_dataset/imdb_crop/imdb.mat")
imdb_data = imdb_mat_file['imdb'][0, 0]
print(imdb_data.dtype.names)

('dob', 'photo_taken', 'full_path', 'gender', 'name', 'face_location', 'face_score', 'second_face_score', 'celeb_names', 'celeb_id')


In [3]:
def create_df_from_mat(data, fields):

    df_dict = {}

    for field in fields:
        field_data = data[field].squeeze()
        
        if field_data.dtype.kind in {'O', 'U', 'S'}:
            processed = []
            for item in field_data:
                if isinstance(item, np.ndarray):
                    processed.append(str(item[0]) if item.size > 0 else "")
                else:
                    processed.append(str(item))
            field_data = np.array(processed)

        df_dict[field] = field_data

    df_from_mat = pd.DataFrame(df_dict)
    return df_from_mat

In [4]:
fields = ['full_path', 'name', 'face_score', 'second_face_score', 'celeb_id']
imdb_df = create_df_from_mat(imdb_data, fields)
imdb_df

Unnamed: 0,full_path,name,face_score,second_face_score,celeb_id
0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,Fred Astaire,1.459693,1.118973,6488
1,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,Fred Astaire,2.543198,1.852008,6488
2,01/nm0000001_rm577153792_1899-5-10_1968.jpg,Fred Astaire,3.455579,2.985660,6488
3,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,1.872117,,6488
4,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,1.158766,,6488
...,...,...,...,...,...
460718,08/nm3994408_rm761245696_1989-12-29_2011.jpg,Jane Levy,3.845884,,8410
460719,08/nm3994408_rm784182528_1989-12-29_2011.jpg,Jane Levy,-inf,,8410
460720,08/nm3994408_rm926592512_1989-12-29_2011.jpg,Jane Levy,-inf,,8410
460721,08/nm3994408_rm943369728_1989-12-29_2011.jpg,Jane Levy,4.450725,,8410


In [8]:
clean_imdb = imdb_df[imdb_df['face_score'] > 0].copy()
clean_imdb.fillna({'second_face_score': 0}, inplace=True)
clean_imdb = clean_imdb[clean_imdb['second_face_score'] == 0]
clean_imdb

Unnamed: 0,full_path,name,face_score,second_face_score,celeb_id
3,01/nm0000001_rm946909184_1899-5-10_1968.jpg,Fred Astaire,1.872117,0.0,6488
4,01/nm0000001_rm980463616_1899-5-10_1968.jpg,Fred Astaire,1.158766,0.0,6488
6,02/nm0000002_rm1346607872_1924-9-16_2004.jpg,Lauren Bacall,3.479189,0.0,11516
7,02/nm0000002_rm1363385088_1924-9-16_2004.jpg,Lauren Bacall,3.870171,0.0,11516
12,02/nm0000002_rm221957120_1924-9-16_1974.jpg,Lauren Bacall,4.096431,0.0,11516
...,...,...,...,...,...
460715,08/nm3994408_rm73386752_1989-12-29_2011.jpg,Jane Levy,4.801987,0.0,8410
460716,08/nm3994408_rm744468480_1989-12-29_2011.jpg,Jane Levy,2.370091,0.0,8410
460718,08/nm3994408_rm761245696_1989-12-29_2011.jpg,Jane Levy,3.845884,0.0,8410
460721,08/nm3994408_rm943369728_1989-12-29_2011.jpg,Jane Levy,4.450725,0.0,8410


In [17]:
number_of_each_celebrity  = clean_imdb['celeb_id'].value_counts()
non_unique_celebs = number_of_each_celebrity[number_of_each_celebrity >= 1].index
clean_imdb = clean_imdb[clean_imdb['celeb_id'].isin(non_unique_celebs)]
clean_imdb = clean_imdb.sort_values('celeb_id')
clean_imdb

Unnamed: 0,full_path,name,face_score,second_face_score,celeb_id
327194,48/nm0946148_rm922142208_1959-10-23_2014.jpg,'Weird Al' Yankovic,2.533124,0.0,2
327171,48/nm0946148_rm1093319424_1959-10-23_1989.jpg,'Weird Al' Yankovic,2.156494,0.0,2
327172,48/nm0946148_rm1110096640_1959-10-23_1989.jpg,'Weird Al' Yankovic,1.455022,0.0,2
327173,48/nm0946148_rm1126873856_1959-10-23_1989.jpg,'Weird Al' Yankovic,2.047720,0.0,2
327174,48/nm0946148_rm1179500032_1959-10-23_2011.jpg,'Weird Al' Yankovic,1.685582,0.0,2
...,...,...,...,...,...
302868,82/nm1049982_rm2017378560_1975-5-4_2014.jpg,Óscar Jaenada,4.254282,0.0,20284
302873,82/nm1049982_rm376096256_1975-5-4_2014.jpg,Óscar Jaenada,3.780473,0.0,20284
302879,82/nm1049982_rm476759552_1975-5-4_2014.jpg,Óscar Jaenada,4.129909,0.0,20284
302883,82/nm1049982_rm865242368_1975-5-4_2010.jpg,Óscar Jaenada,4.616226,0.0,20284


In [18]:
dict_for_celeb_df = {"name" : [], "id" : [], "number_of_images" : []}
for celeb_id in clean_imdb['celeb_id'].unique():
    celeb_name = clean_imdb.iloc[(clean_imdb['celeb_id'] == celeb_id).argmax()].loc['name']
    dict_for_celeb_df['name'].append(celeb_name)
    dict_for_celeb_df['id'].append(celeb_id)
    dict_for_celeb_df['number_of_images'].append(number_of_each_celebrity[celeb_id])
celeb_names_df = pd.DataFrame(dict_for_celeb_df)
celeb_names_df

Unnamed: 0,name,id,number_of_images
0,'Weird Al' Yankovic,2,9
1,2 Chainz,3,2
2,50 Cent,4,62
3,A Martinez,5,1
4,A.J. Buckley,7,13
...,...,...,...
15490,Élodie Bouchez,20280,8
15491,Émilie Dequenne,20281,7
15492,Éric Caravaca,20282,4
15493,Ólafur Darri Ólafsson,20283,1


In [19]:
celeb_names_df.to_csv('data/large_celeb_names.csv', index=True)

In [20]:
celeb_names_list = celeb_names_df['name'].tolist()
celeb_names_list

["'Weird Al' Yankovic",
 '2 Chainz',
 '50 Cent',
 'A Martinez',
 'A.J. Buckley',
 'A.J. Cook',
 'A.J. Johnson',
 'A.J. Langer',
 'A.J. McLean',
 'A.R. Rahman',
 'AJ Bowen',
 'AJ Michalka',
 'Aaliyah',
 'Aamir Khan',
 'Aaron Abrams',
 'Aaron Ashmore',
 'Aaron Carter',
 'Aaron Christian Howles',
 'Aaron D. Spears',
 'Aaron Douglas',
 'Aaron Eckhart',
 'Aaron Farb',
 'Aaron Himelstein',
 'Aaron Jackson',
 'Aaron Jay Rome',
 'Aaron Katz',
 'Aaron Kwok',
 'Aaron Lohr',
 'Aaron Michael Metchik',
 'Aaron Paul',
 'Aaron Pedersen',
 'Aaron Ruell',
 'Aaron Russo',
 'Aaron Schwartz',
 'Aaron Seltzer',
 'Aaron Sorkin',
 'Aaron Spelling',
 'Aaron Stanford',
 'Aaron Taylor-Johnson',
 'Aaron Tveit',
 'Aaron Wolff',
 'Aaron Yoo',
 'Aasif Mandvi',
 'Abbas Kiarostami',
 'Abbey Lee',
 'Abbie Cornish',
 'Abby Brammell',
 'Abby Elliott',
 'Abdallah El Akal',
 'Abdellatif Kechiche',
 'Abderrahmane Sissako',
 'Abe Vigoda',
 'Abel Ferrara',
 'Abhay Deol',
 'Abhishek Bachchan',
 'Abi Morgan',
 'Abigail Breslin

In [21]:
with open ('../imdb_dataset/large_celeb_names.txt', 'w', encoding="utf-8") as file:
    file.writelines(f"{item}\n" for item in celeb_names_list)

In [25]:
def transform_image(input_image, new_image_size):
    transform =tt.Compose([
    tt.Resize(new_image_size),
    tt.CenterCrop(new_image_size)])
    return transform(input_image)

def save_image(file_path_end, sourse_dir, person_dir, image_index=1, new_image_size=256):
    full_file_path = sourse_dir + file_path_end
    image = PIL.Image.open(full_file_path).convert("RGB")
    resized_image = transform_image(image, new_image_size)
    resized_image.save(f"{person_dir}/{image_index}.jpg")


def make_person_directories(describe_df, sourse_dir, train_dir, test_dir):
    for new_celeb_id, celeb_id in tqdm(enumerate(describe_df['celeb_id'].unique())):
        person_dir_train = os.path.join(train_dir, str(new_celeb_id))
        os.makedirs(person_dir_train, exist_ok=True)
        person_dir_test = os.path.join(test_dir, str(new_celeb_id))
        os.makedirs(person_dir_test, exist_ok=True)

        person_data = describe_df[describe_df['celeb_id'] == celeb_id].sort_values(by='face_score', ascending=False)
        test_file_path = person_data['full_path'].iloc[0]
        save_image(test_file_path, sourse_dir, person_dir_test)

        for index in range(person_data.shape[0]):
            file_path_end = person_data['full_path'].iloc[index]
            save_image(file_path_end, sourse_dir, person_dir_train, index, new_image_size=256)

            if index == 100:
                break

In [26]:
sourse_directory = "C:/face_dataset/imdb_crop/"
train_directory = "C:/face_dataset/imdb_train_large/"
test_directory = "C:/face_dataset/imdb_test_large/"

make_person_directories(clean_imdb, sourse_directory, train_dir=train_directory, test_dir=test_directory)

15495it [19:21, 13.34it/s]
