Task 1:



In [10]:
import os
import numpy as np
from PIL import Image
import time

In [11]:
def process_file(file_path, processed_file_path, target_size = 256, aspect_ratio_thres = 1.3):
    """Center-crop an image to a square, resize it, and save it as an RGB array.

    Args:
        file_path: Path of the image, opened with ``PIL.Image.open``.
        processed_file_path: Destination handed to ``np.save``. NumPy appends
            ``.npy`` when the path does not already end with that suffix.
        target_size: Edge length, in pixels, of the saved square image.
        aspect_ratio_thres: Largest accepted ratio of the longer side to the
            shorter side. Images more elongated than this are skipped rather
            than losing a large part of their content to the crop.

    Returns:
        0 when the array was written, 1 when the image was skipped (aspect
        ratio above the threshold, or an unexpected array shape).
    """
    with Image.open(file_path) as image:
        width, height = image.size
        new_size = min(width, height)
        if new_size * aspect_ratio_thres < max(width, height):
            # print(f'skipped due to aspect ratio too high')
            return 1

        # Build the crop box from integers. With float coordinates Pillow
        # rounds each edge independently, which can yield a crop that is one
        # pixel wider than tall; deriving right/bottom from left/top keeps the
        # crop exactly new_size x new_size.
        left = (width - new_size) // 2
        top = (height - new_size) // 2
        square = image.crop((left, top, left + new_size, top + new_size))

        # Convert to RGB *before* resizing: Pillow always uses nearest-neighbour
        # resampling for palette ("P") and 1-bit images, so resizing first
        # would degrade those files.
        square = square.convert('RGB')
        square = square.resize((target_size, target_size))

        # Materialise the pixels while the file is still open.
        img_array = np.array(square)

    # print(f'processed array shape: {img_array.shape}')
    if img_array.shape != (target_size, target_size, 3):
        # Never persist a malformed sample or report it as processed.
        print(f'error! wrong shape {img_array.shape}')
        return 1

    # save to file
    np.save(processed_file_path, img_array)
    return 0

In [12]:
def extract_5_features(data):
    """Placeholder feature extractor that returns five zeros.

    Args:
        data: The loaded sample. It is currently ignored.

    Returns:
        A float NumPy array of shape ``(5,)`` filled with zeros.
    """
    feature_count = 5
    return np.zeros(feature_count)

In [13]:
def preprocess(raw_data_dir, processed_data_dir):
    """Run ``process_file`` on every file of every class folder.

    The folder layout of ``raw_data_dir`` (one sub-directory per class) is
    mirrored under ``processed_data_dir``.

    Args:
        raw_data_dir: Directory whose sub-directories hold the raw images.
        processed_data_dir: Directory that receives the processed arrays. It
            and the per-class sub-directories are created when missing.

    Returns:
        None. Progress is reported with ``print``.
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(processed_data_dir, exist_ok=True)

    for subdir in os.listdir(raw_data_dir):
        raw_subdir_path = os.path.join(raw_data_dir, subdir)

        # Only class folders are processed; loose files are ignored silently.
        if not os.path.isdir(raw_subdir_path):
            continue
        print(f'reading directory {subdir}')

        processed_subdir_path = os.path.join(processed_data_dir, subdir)
        os.makedirs(processed_subdir_path, exist_ok=True)

        # List once, and keep regular files only, so nested folders are never
        # handed to the image loader and `total` matches what is traversed.
        filenames = [
            name for name in os.listdir(raw_subdir_path)
            if os.path.isfile(os.path.join(raw_subdir_path, name))
        ]
        total = len(filenames)
        processed_count = 0

        for count, filename in enumerate(filenames, start=1):
            file_path = os.path.join(raw_subdir_path, filename)
            processed_file_path = os.path.join(processed_subdir_path, filename)

            try:
                return_code = process_file(file_path, processed_file_path)
            except OSError as error:
                # Unreadable or non-image files (Pillow reports them as
                # OSError subclasses) are reported and counted as skipped
                # instead of aborting the whole run.
                print(f'skipped {file_path}: {error}')
                return_code = 1

            if return_code == 0:
                processed_count += 1
            if count % 100 == 0 or count == total:
                print(f'traversed {count} / {total}, processed {processed_count}')

    print('finished preprocessing')

In [16]:
def import_features(processed_data_dir, features, features_extraction_function, folders=None):
    """Load every saved array, extract its features, and build X / Y matrices.

    Args:
        processed_data_dir: Directory with one sub-directory per category,
            each holding files readable by ``np.load``.
        features: Number of values returned by ``features_extraction_function``
            (the number of columns of ``X``).
        features_extraction_function: Callable that receives one loaded array
            and returns its feature vector.
        folders: Optional collection of sub-directory names to include. When
            ``None`` every sub-directory is used.

    Returns:
        A tuple ``(X, Y, category_map)`` where ``X`` has shape
        ``(n_items, features)``, ``Y`` has shape ``(n_items, 1)`` and holds the
        category id of each row, and ``category_map`` maps both id -> folder
        name and folder name -> id.
    """
    # Walk the tree once. Sorting matters: os.listdir returns entries in
    # arbitrary order, so without it the category ids (and row order) could
    # differ between machines or runs.
    categories = []
    for subdir in sorted(os.listdir(processed_data_dir)):
        if folders is not None and subdir not in folders:
            continue
        subdir_path = os.path.join(processed_data_dir, subdir)

        # Check if it's a directory
        if not os.path.isdir(subdir_path):
            continue
        categories.append((subdir, subdir_path, sorted(os.listdir(subdir_path))))

    total_items = sum(len(filenames) for _, _, filenames in categories)
    print(f'found {total_items} items!')

    X = np.zeros((total_items, features))
    Y = np.zeros((total_items, 1))
    category_map = dict()

    item_counter = 0
    for category_id, (subdir, subdir_path, filenames) in enumerate(categories):
        print(f'reading directory {subdir}')

        for filename in filenames:
            data = np.load(os.path.join(subdir_path, filename))
            # Kept in its own name so the `features` argument (the column
            # count) is not overwritten.
            feature_vector = features_extraction_function(data)
            X[item_counter] = feature_vector
            Y[item_counter, 0] = category_id
            item_counter += 1

        # Two-way lookup: id -> name and name -> id.
        category_map[category_id] = subdir
        category_map[subdir] = category_id

    print('finished feature import')

    return X, Y, category_map

In [17]:
# Locations of the raw images and of the preprocessed arrays.
raw_data_dir = '../../Data/Art'
processed_data_dir = '../../Data/Art_Processed'

# Art styles (sub-folder names) to load.
folders = [
    'Baroque',
    'Expressionism',
    'Japanese',
    'Realism',
    'Symbolism',
]

# Preprocessing is left disabled here; uncomment to (re)build the processed arrays.
#preprocess(raw_data_dir, processed_data_dir)
X, Y, category_map = import_features(
    processed_data_dir,
    features=5,
    features_extraction_function=extract_5_features,
    folders=folders,
)
print(f'X shape: {X.shape}\nY Shape: {Y.shape}\nMap: \n{category_map}\n')

found 6371 items!
reading directory Baroque
reading directory Expressionism
reading directory Japanese
reading directory Realism
reading directory Symbolism
finished feature import
X shape: (6371, 5)
Y Shape: (6371, 1)
Map: 
{0: 'Baroque', 'Baroque': 0, 1: 'Expressionism', 'Expressionism': 1, 2: 'Japanese', 'Japanese': 2, 3: 'Realism', 'Realism': 3, 4: 'Symbolism', 'Symbolism': 4}

