In [2]:
!pip install -qU scipy tables pandas seaborn matplotlib torch torchvision basicsr realesrgan 

In [7]:
import scipy.io
from tables import *
import pandas as pd
import os
import shutil
from pathlib import Path
import tqdm
from tqdm.notebook import tqdm
from directify import directify
from random import sample
from PIL import Image

In [4]:
tqdm.pandas()

In [8]:
data = scipy.io.loadmat('imdb.mat')
lean = data['imdb'][0][0]

def recruit_images(df,size):
    """ lean is an array of arrays from the imdb_crop.tar file from the IMDB faces dataset. 
         Size is the minimum number of images from each class (name).
         Writes images to a "./selected/" directory."""
    SIZE = size
    names = df["Name"].tolist()
    unique = df["Name"].unique()
    paths = df["Path"].tolist()
    joined = list(zip(names,paths))
    
    for name in tqdm(unique):
        #print(name)
        valid = [item[1] for item in joined if item[0] == name]
        # print("valid",valid)
        #print(valid[:10])
        sampled_valid_data = [item for item in sample(valid,SIZE)]
        # print(sampled_valid_data[:10])
        directify(sampled_valid_data)
    print("Recruited!")

def add_image_dimensions(df, image_path_column):
    """
    Takes a DataFrame and a column name containing image paths, adds two new columns
    with the width and height of each image.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame containing image paths
    image_path_column : str
        The name of the column containing image file paths
        
    Returns:
    --------
    pandas.DataFrame
        DataFrame with two new columns:
        - '{image_path_column}_width': Width of each image in pixels
        - '{image_path_column}_height': Height of each image in pixels
        
    Example:
    --------
    >>> df = pd.DataFrame({'image_path': ['path/to/image1.jpg', 'path/to/image2.jpg']})
    >>> df = add_image_dimensions(df, 'image_path')
    >>> print(df['image_path_width'], df['image_path_height'])
    0    800    600
    1    1024   768
    """
    
    def get_dimensions(row):
        path=row["Path"]
        try:
            with Image.open(path) as img:
                return img.size  # Returns (width, height)
        except Exception as e:
            print(f"Error processing image at {path}: {str(e)}")
            return (None, None)
    
    # Create new column names
    width_column = f"{image_path_column}_width"
    height_column = f"{image_path_column}_height"
    
    # Get dimensions as tuples
    dimensions = df.progress_apply(func=get_dimensions,axis=1)
    print(dimensions)
    # Split dimensions into separate columns
    df[width_column] = [item[0] for item in dimensions]
    df[height_column] = [item[1] for item in dimensions]
    return df

In [5]:
names = [item[0] for item in lean[4][0]]
paths = [item[0] for item in lean[2][0]]
joined = list(zip(names,paths))
counts = pd.Series([item[0] for item in joined]).value_counts()
freq = counts[counts >= 50].index.tolist()
frequent_fliers = [item for item in joined if item[0] in freq]

df = pd.DataFrame({
    "Name":[item[0] for item in lean[4][0]],
    "Path":["./imdb_crop/"+item[0] for item in lean[2][0]],
    "Gender":lean[3][0],
    "Confidence_1":lean[6][0],
    "Confidence_2":lean[7][0],
                  })

In [6]:
confident_df = df[(df['Confidence_1']>=0.0)]
conf_count = confident_df["Name"].value_counts()
frequent_confident = conf_count[conf_count >= 50].index.tolist()
len(df[df["Name"].isin(frequent_confident)]),len(frequent_confident)
#While the overall corpus seems sufficiently large, I am concerned with a sub-100,000 images dataset. 
#I would rather use a larger but less precise dataset.

(298777, 1975)

In [7]:
df_dim = add_image_dimensions(df,"Path")
df_dim.to_csv("imdb_crop_dimensions.csv",index=False)

  0%|          | 0/460723 [00:00<?, ?it/s]

0         (257, 257)
1         (263, 263)
2         (500, 500)
3         (400, 401)
4         (340, 340)
             ...    
460718    (157, 157)
460719    (333, 500)
460720    (354, 500)
460721    (292, 292)
460722    (111, 111)
Length: 460723, dtype: object


In [8]:
df_dim = pd.read_csv("imdb_crop_dimensions.csv")
big_images = df_dim[(df_dim["Path_height"]>100) & (df_dim["Path_width"]>100)]
big_counts = big_images["Name"].value_counts()
freq = big_counts[big_counts >= 50].index.tolist()
freq_big_images = big_images[big_images["Name"].isin(freq)]
len(freq_big_images),len(freq)

(272378, 2055)

In [9]:
recruit_images(freq_big_images,50)

  0%|          | 0/2055 [00:00<?, ?it/s]


Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to selected

Copied 50 files successfully to s

In [16]:
len(freq_big_images[(freq_big_images["Path_height"]>300) & (freq_big_images["Path_width"]>300)]) / len(freq_big_images)

0.5034180440417361

In [39]:
from pathfxn import get_file_paths

scaled = get_file_paths("./scaled/")
scaled[0][9:]
base = []
training = []
for elem in scaled:
    base += [elem[9:]]
df_dim['Clean_Path'] = df_dim.apply(lambda x: x["Path"][15:],axis=1)
base_df = df_dim[df_dim["Clean_Path"].isin(base)].copy()
base_df.apply(lambda x: training.append((x["Name"],x["Clean_Path"])),axis=1)
names = base_df["Name"].unique()
other_imgs = df_dim[(df_dim["Name"].isin(names))&(~df_dim["Clean_Path"].isin(base))]
testing = []
for name in names:
    person = df_dim[df_dim["Name"]==name]
    if len(person) < 10:
        sampled = person.sample(len(person))
    else:
        sampled = person.sample(10)
    sampled.apply(lambda x: testing.append((x["Name"],x["Path"])),axis=1)
testing[:10],training[:10]

([('Marlon Brando', './imdb_crop/08/nm0000008_rm2140966400_1924-4-3_1963.jpg'),
  ('Marlon Brando', './imdb_crop/08/nm0000008_rm2349567488_1924-4-3_1990.jpg'),
  ('Marlon Brando', './imdb_crop/08/nm0000008_rm1439728896_1924-4-3_1972.jpg'),
  ('Marlon Brando', './imdb_crop/08/nm0000008_rm3772758528_1924-4-3_1994.jpg'),
  ('Marlon Brando', './imdb_crop/08/nm0000008_rm2514467328_1924-4-3_1994.jpg'),
  ('Marlon Brando', './imdb_crop/08/nm0000008_rm4242453248_1924-4-3_1990.jpg'),
  ('Marlon Brando', './imdb_crop/08/nm0000008_rm1906085376_1924-4-3_1963.jpg'),
  ('Marlon Brando', './imdb_crop/08/nm0000008_rm1956417024_1924-4-3_1963.jpg'),
  ('Marlon Brando', './imdb_crop/08/nm0000008_rm3626613248_1924-4-3_1976.jpg'),
  ('Marlon Brando',
   './imdb_crop/08/nm0000008_rm2191825152_1924-4-3_1996.jpg')],
 [('Marlon Brando', 'nm0000008_rm1221625088_1924-4-3_1972.jpg'),
  ('Marlon Brando', 'nm0000008_rm1238402304_1924-4-3_1972.jpg'),
  ('Marlon Brando', 'nm0000008_rm1271956736_1924-4-3_1972.jpg'),
 

In [40]:
len(testing),len(training)

(20550, 102750)

In [41]:
import json
json_test = dict()
for i in testing:
    json_test[i[1]] = i[0]
json.dump(json_test,open("Test.json","w"))
json_train = dict()
for i in training:
    json_train[i[1]] = i[0]
json.dump(json_train,open("Train.json","w"))
    