# Import

In [1]:
import sys
from pathlib import Path
import os 
sys.path.append('/home/gridsan/vyuan/.local/lib/python3.9/site-packages/')

# Directory the notebook process was started from.
current_path = Path.cwd()

# Go to top of the root and append
# NOTE(review): parents[4] is the fifth ancestor of the working directory, so
# this assumes the notebook lives exactly five levels below the project root
# (an IndexError is raised from a shallower directory) — confirm the layout.
root = current_path.parents[4]
sys.path.append(str(root))

# Switch the working directory to the root; the relative 'data/datasets/...'
# paths used further down are resolved against it.
os.chdir(root)

In [2]:
import os
import random
from pathlib import Path
import json
import shutil


## Code function

In [34]:
def _agent_gender(agent):
    """Classify an agent label as 'female', 'male', or None when ungendered.

    Matching is a containment test (``name in agent``). The female markers
    are tried first because, when the label is a string, 'man' and 'male'
    are substrings of 'woman' and 'female'.
    """
    if any(name in agent for name in ('female', 'woman')):
        return 'female'
    if any(name in agent for name in ('male', 'man')):
        return 'male'
    return None


def create_original_data_and_metadata(verbs, path_imsitu, path_dataset):
    """Copy the gendered images of the selected verbs and write their metadata.

    Reads ``path_imsitu / 'metadata' / 'full.json'`` (verb -> image id ->
    record holding at least 'agent' and 'concepts'). Every image whose agent
    is gendered is copied from ``path_imsitu / 'original_data'`` to
    ``path_dataset / 'original' / <verb> / <gender>``. Three JSON files are
    written into ``path_dataset``:

    * ``metadata.json``: verb -> image id -> record
    * ``metadata_by_gender.json``: verb -> gender -> image id -> record
    * ``target_concepts_count.json``: verb -> concept -> number of
      occurrences among the kept images

    Images with an ungendered agent are skipped entirely; they contribute
    neither metadata nor concept counts.

    Args:
        verbs: iterable of verb names to keep (keys of ``full.json``).
        path_imsitu: ``Path`` to the imSitu root folder.
        path_dataset: ``Path`` to the output dataset folder.

    Raises:
        KeyError: if a verb is not present in ``full.json``.
    """
    with open(path_imsitu / 'metadata' / 'full.json') as f:
        full_metadata = json.load(f)
    path_original_data = path_imsitu / 'original_data'

    selected_targets = {verb: full_metadata[verb] for verb in verbs}

    target_concepts_count = {}
    metadata = {}
    metadata_by_gender = {}

    # The JSON files below are written directly into this folder, so it has
    # to exist even when no verb was selected.
    os.makedirs(path_dataset, exist_ok=True)

    for verb, images in selected_targets.items():
        concept_counts = target_concepts_count[verb] = {}
        metadata[verb] = {}
        metadata_by_gender[verb] = {'male': {}, 'female': {}}

        path_dest = path_dataset / 'original' / verb
        for gender in ('male', 'female'):
            # exist_ok lets the step be re-run on an existing dataset folder.
            os.makedirs(path_dest / gender, exist_ok=True)

        for image_id, image_metadata in images.items():
            gender = _agent_gender(image_metadata['agent'])
            if gender is None:
                # Ungendered agent: skip, so its concepts are never counted.
                continue

            metadata[verb][image_id] = image_metadata
            for concept in image_metadata['concepts']:
                concept_counts[concept] = concept_counts.get(concept, 0) + 1

            metadata_by_gender[verb][gender][image_id] = image_metadata
            shutil.copy(path_original_data / image_id, path_dest / gender)

    with open(path_dataset / 'metadata.json', 'w') as f:
        json.dump(metadata, f)

    with open(path_dataset / 'metadata_by_gender.json', 'w') as f:
        json.dump(metadata_by_gender, f)

    with open(path_dataset / 'target_concepts_count.json', 'w') as f:
        json.dump(target_concepts_count, f)
        
def filter_concepts(path_dataset, threshold_target_concepts_retained):
    """Keep, per verb, the concepts counted at least the threshold number of times.

    Reads ``path_dataset / 'target_concepts_count.json'`` (verb -> concept ->
    count) and writes ``path_dataset / 'target_concepts_retained.json'``
    (verb -> list of retained concepts, in the order of the input file).

    Args:
        path_dataset: ``Path`` to the dataset folder.
        threshold_target_concepts_retained: minimum count (inclusive) a
            concept needs to be retained.
    """
    with open(path_dataset / 'target_concepts_count.json') as f:
        target_concepts_count = json.load(f)

    target_concepts_retained = {
        verb: [
            concept
            for concept, count in concept_counts.items()
            if count >= threshold_target_concepts_retained
        ]
        for verb, concept_counts in target_concepts_count.items()
    }

    with open(path_dataset / 'target_concepts_retained.json', 'w') as f:
        json.dump(target_concepts_retained, f)

def balance_dataset(path_dataset):
    """Build ``full_balanced`` with the same number of images per gender and verb.

    For every verb in ``path_dataset / 'metadata_by_gender.json'`` the
    smaller of the male and female metadata counts is used as the sample
    size. That many files (first ones in lexicographic order) are copied
    from ``original/<verb>/<gender>`` to ``full_balanced/<verb>/<gender>``.
    The metadata of the copied images is written to
    ``metadata_full_balanced.json`` (verb -> gender -> list of records).

    Args:
        path_dataset: ``Path`` to the dataset folder.

    Raises:
        IndexError: if a gender folder holds fewer files than the sample size.
        KeyError: if a listed file has no entry in the metadata.
    """
    with open(path_dataset / 'metadata_by_gender.json') as f:
        metadata_with_gender = json.load(f)

    path_dataset_original = path_dataset / 'original'
    path_dataset_balanced = path_dataset / 'full_balanced'

    metadata_full_balanced = {}

    for target, by_gender in metadata_with_gender.items():
        min_samples = min(len(by_gender['male']), len(by_gender['female']))
        metadata_full_balanced[target] = {'male': [], 'female': []}

        for gender in ('male', 'female'):
            path_src = path_dataset_original / target / gender
            # Lexicographic order: note that '101' sorts before '99'.
            image_list = sorted(os.listdir(path_src))
            if len(image_list) < min_samples:
                # Fail before copying anything rather than part-way through.
                raise IndexError(
                    f'{path_src} holds {len(image_list)} files, '
                    f'{min_samples} are required'
                )

            path_dest = path_dataset_balanced / target / gender
            os.makedirs(path_dest, exist_ok=True)

            for image_name in image_list[:min_samples]:
                shutil.copy(path_src / image_name, path_dest / image_name)
                metadata_full_balanced[target][gender].append(by_gender[gender][image_name])

    with open(path_dataset / 'metadata_full_balanced.json', 'w') as f:
        json.dump(metadata_full_balanced, f)
        
def train_test_split_full_dataset(path_dataset, test_size=0.25, random_seed=0):
    """Split ``full_balanced`` into train and test folders, per verb and gender.

    For each verb and gender, ``int(n * (1 - test_size))`` images are sampled
    for training; the rest are test images. Copies get a ``_<gender>``
    suffix before the extension and go to:

    * ``train/train_full/<verb>/<gender>``        (train images)
    * ``test/<verb>``                             (test images, genders mixed)
    * ``test_with_gender/<verb>/<gender>``        (test images)

    Args:
        path_dataset: ``Path`` to the dataset folder.
        test_size: fraction of each gender folder held out for testing.
        random_seed: seed passed to ``random.seed``.
    """
    random.seed(random_seed)

    path_balanced = path_dataset / 'full_balanced'

    # os.listdir returns entries in arbitrary order; sorting makes the split
    # depend only on the seed and the file names.
    for target in sorted(os.listdir(path_balanced)):
        path_target = path_balanced / target
        path_train = path_dataset / 'train' / 'train_full' / target
        path_test = path_dataset / 'test' / target
        path_test_with_gender = path_dataset / 'test_with_gender' / target

        os.makedirs(path_test, exist_ok=True)

        for gender in ('male', 'female'):
            path_src = path_target / gender
            images = sorted(os.listdir(path_src))
            train_images = random.sample(images, int(len(images) * (1 - test_size)))
            in_train = set(train_images)
            test_images = [image for image in images if image not in in_train]

            # Create each leaf folder on its own, so a partially created
            # tree from an earlier run does not break the copies.
            path_train_gender = path_train / gender
            path_test_gender = path_test_with_gender / gender
            os.makedirs(path_train_gender, exist_ok=True)
            os.makedirs(path_test_gender, exist_ok=True)

            for image in train_images:
                base, ext = os.path.splitext(image)
                shutil.copy(path_src / image, path_train_gender / f'{base}_{gender}{ext}')

            for image in test_images:
                base, ext = os.path.splitext(image)
                filename = f'{base}_{gender}{ext}'
                shutil.copy(path_src / image, path_test / filename)
                shutil.copy(path_src / image, path_test_gender / filename)
        
def take_half_concepts(path_dataset):
    """Split the verbs of ``metadata.json`` into two alphabetical halves.

    The sorted verb names are cut at ``len // 2``. The first part is written
    one per line to ``path_dataset / 'verb_group_1.txt'``, the remainder to
    ``path_dataset / 'verb_group_2.txt'``. With an odd number of verbs the
    second group gets the extra one.

    Args:
        path_dataset: ``Path`` to the dataset folder.
    """
    with open(path_dataset / 'metadata.json') as f:
        metadata = json.load(f)

    list_target = sorted(metadata)
    cutoff = len(list_target) // 2

    groups = {
        'verb_group_1.txt': list_target[:cutoff],
        'verb_group_2.txt': list_target[cutoff:],
    }
    for filename, verbs in groups.items():
        with open(path_dataset / filename, 'w') as f:
            f.writelines(f'{verb}\n' for verb in verbs)

def sample_half_train(path_dataset, random_seed=0):
    """Build ``train/train_half`` from half of each gender's training images.

    For every verb folder found in ``path_dataset / 'test'``, ``n // 2``
    files are sampled from each of ``train/train_full/<verb>/male`` and
    ``.../female``. They are copied, genders mixed, into
    ``train/train_half/<verb>``.

    Args:
        path_dataset: ``Path`` to the dataset folder.
        random_seed: seed passed to ``random.seed``.
    """
    random.seed(random_seed)

    path_train_full = path_dataset / 'train' / 'train_full'
    path_train_half = path_dataset / 'train' / 'train_half'

    # Sorted listings: os.listdir order is arbitrary, which would make the
    # sample differ between machines even with the same seed.
    for target in sorted(os.listdir(path_dataset / 'test')):
        path_half_target = path_train_half / target
        # exist_ok lets the step be re-run on an existing dataset folder.
        os.makedirs(path_half_target, exist_ok=True)

        for gender in ('male', 'female'):
            path_gender = path_train_full / target / gender
            files = sorted(os.listdir(path_gender))

            for file in random.sample(files, len(files) // 2):
                shutil.copy(path_gender / file, path_half_target / file)
                
                
def train_test_split_functioning_dataset(path_dataset):
    """Assemble the three train/test variants under ``train_test_split``.

    Every verb folder of ``test`` is copied as the test set of
    ``train_balanced``, ``train_imbalanced_1`` and ``train_imbalanced_2``.
    Training data comes from ``train/train_half`` for the balanced variant.
    For verbs listed in ``verb_group_1.txt`` the male images of
    ``train/train_full`` go to variant 1 and the female images to variant 2;
    for all other verbs the genders are swapped.

    Args:
        path_dataset: ``Path`` to the dataset folder.
    """
    path_test = path_dataset / 'test'
    train_root = path_dataset / 'train'
    split_root = path_dataset / 'train_test_split'

    balanced = split_root / 'train_balanced'
    imbalanced_1 = split_root / 'train_imbalanced_1'
    imbalanced_2 = split_root / 'train_imbalanced_2'

    with open(path_dataset / 'verb_group_1.txt', 'r') as f:
        group_1 = [line.strip() for line in f]

    for target in os.listdir(path_test):
        # The same test set is shared by the three variants.
        for variant in (balanced, imbalanced_1, imbalanced_2):
            shutil.copytree(path_test / target, variant / 'test' / target)

        shutil.copytree(train_root / 'train_half' / target, balanced / 'train' / target)

        if target in group_1:
            gender_1, gender_2 = 'male', 'female'
        else:
            gender_1, gender_2 = 'female', 'male'

        full_target = train_root / 'train_full' / target
        shutil.copytree(full_target / gender_1, imbalanced_1 / 'train' / target)
        shutil.copytree(full_target / gender_2, imbalanced_2 / 'train' / target)
            

## Call functions

In [4]:
string_verbs = """
phoning
hugging
eating
admiring
leaning
putting
carrying
reading
communicating
hitchhiking
resting
vacuuming
talking
cleaning
sitting
patting
hunching
smelling
rehabilitating
perspiring
shushing
helping
practicing
pinning
embracing
drinking
pushing
crying
rubbing
feeding
shouting
tilting
spying
grinning
picking
caressing
licking
interviewing
driving
shivering
milking
grimacing
shelving
checking
coughing
manicuring
lifting
crouching
kissing
gasping
instructing
ignoring
covering
sleeping
smiling
measuring
riding
tasting
stooping
calling
biting
stroking
jogging
distributing
washing
giving
asking
stuffing
tickling
stripping
heaving
swinging
winking
browsing
encouraging
arranging
hanging
squeezing
mopping
photographing
complaining
baking
brushing
walking
wiping
standing
chewing
scrubbing
scratching
wheeling
kneeling
stretching
snuggling
shrugging
telephoning
staring
working
pinching
buying
dialing
kicking
sniffing
opening
speaking
cooking
waving
studying
slapping
slouching
frowning
bothering
praying
adjusting
buttoning
sweeping
applying
yanking
climbing
signaling
displaying
pouting
sneezing
twirling
recovering
stirring
scooping
making
whistling
paying
recuperating
typing
operating
providing
weeping
shopping
glaring
emptying
wrinkling
exercising
strapping
crowning
giggling
running
falling
squinting
gardening
raking
weighing
smashing
tripping
serving
yawning
autographing
rocking
releasing
immersing
writing
smearing
shooting
vaulting
laughing
crafting
packaging
counting
placing
tying
cramming
dragging
saying
pumping
hoeing
twisting
buckling
pouring
painting
lathering
packing
wringing
combing
taping
tearing
pasting
frying
confronting
unpacking
turning
splashing
saluting
waiting
offering
misbehaving
potting
pedaling
pressing
jumping
reassuring
diving
skating
scolding
begging
"""
# One verb per line of the literal above; `list_verbs` and `verbs_list` are
# two names bound to the same list object.
list_verbs = verbs_list = string_verbs.strip().split("\n")

In [15]:
# Root of the imSitu data, relative to the current working directory.
path_imsitu = Path('data/datasets/imSitu')

# Folder holding the source images.
path_original_data = path_imsitu / 'original_data'

# Generated datasets are written to data/datasets/imSitu/data/<dataset_name>.
dataset_name = '200_verbs'
path_dataset = Path('data/datasets/imSitu/data') / dataset_name

# Minimum count (inclusive) a concept needs, per verb, to be retained by the
# concept-filtering step.
threshold_target_concepts_retained = 15

In [6]:
def pipeline(list_verbs, path_imsitu, path_dataset, threshold_target_concepts_retained=15):
    """Run every dataset-construction step, in order, for the given verbs.

    Steps: copy gendered images and write metadata, filter concepts by
    count, balance genders, split train/test, write the two verb groups,
    sample the half-size training set, and assemble the balanced and
    imbalanced train/test variants.

    Args:
        list_verbs: verb names to include.
        path_imsitu: ``Path`` to the imSitu root folder.
        path_dataset: ``Path`` to the output dataset folder.
        threshold_target_concepts_retained: minimum count for a concept to
            be retained by ``filter_concepts``.
    """
    create_original_data_and_metadata(list_verbs, path_imsitu, path_dataset)
    filter_concepts(path_dataset, threshold_target_concepts_retained)
    balance_dataset(path_dataset)
    train_test_split_full_dataset(path_dataset)
    # Previously spelled `take_half_concept`, an undefined name (NameError).
    take_half_concepts(path_dataset)
    sample_half_train(path_dataset)
    train_test_split_functioning_dataset(path_dataset)

In [None]:
# Configuration for the full run; paths are relative to the working directory.
dataset_name = '200_verbs'
path_imsitu = Path('data/datasets/imSitu')
path_dataset = Path('data/datasets/imSitu/data') / dataset_name
threshold_target_concepts_retained = 15

# Build the whole dataset for the verbs parsed into `list_verbs`.
pipeline(list_verbs, path_imsitu, path_dataset, threshold_target_concepts_retained)


## Test

In [36]:
# Run the verb-grouping step on its own; it reads metadata.json from
# path_dataset and writes verb_group_1.txt / verb_group_2.txt there.
take_half_concepts(path_dataset)

In [6]:
# Scratch, notebook-level version of the metadata/copy step (same logic as
# create_original_data_and_metadata). Relies on `list_verbs`, `path_dataset`
# and `path_original_data` defined in earlier cells.
verbs = list_verbs
path_imsitu = Path('data/datasets/imSitu')
with open(path_imsitu / 'metadata' / 'full.json') as f:
    full_targets = json.load(f)

selected_targets = {}

for verb in verbs:
    selected_targets[verb] = full_targets[verb]

target_concepts_count = {}
metadata = {}
metadata_by_gender = {}

list_male = ['male', 'man']
list_female = ['female', 'woman']
list_gender = ['male', 'man', 'female', 'woman']

# Loop through verbs
for verb in selected_targets:

    target_concepts_count[verb] = {}
    metadata[verb] = {}
    metadata_by_gender[verb] = {'male': {}, 'female': {}}

    path_dest = path_dataset / 'original' / verb
    # exist_ok lets the cell be re-run on an existing dataset folder.
    os.makedirs(path_dest / 'male', exist_ok=True)
    os.makedirs(path_dest / 'female', exist_ok=True)

    for image_id, original_metadata in selected_targets[verb].items():

        # Only images with a gendered agent are kept. Skipping the others
        # here also stops the concept count below from re-using the
        # concepts of the previously processed image.
        if not any(name in original_metadata['agent'] for name in list_gender):
            continue

        image_agent = original_metadata['agent']
        image_concepts = original_metadata['concepts']
        metadata[verb][image_id] = original_metadata

        for concept in image_concepts:
            if concept not in target_concepts_count[verb]:
                target_concepts_count[verb][concept] = 1
            else:
                target_concepts_count[verb][concept] += 1

        # Depending on agent gender, copy metadata and file. Female is
        # tested first: the check is a containment test and 'man' / 'male'
        # are contained in 'woman' / 'female'.
        if any(name in original_metadata['agent'] for name in list_female):
            metadata_by_gender[verb]['female'][image_id] = original_metadata
            shutil.copy(path_original_data / image_id, path_dest / 'female')

        elif any(name in original_metadata['agent'] for name in list_male):
            metadata_by_gender[verb]['male'][image_id] = original_metadata
            shutil.copy(path_original_data / image_id, path_dest / 'male')


with open(path_dataset / 'metadata.json', 'w') as f:
    json.dump(metadata, f)

with open(path_dataset / 'metadata_by_gender.json', 'w') as f:
    json.dump(metadata_by_gender, f)

with open(path_dataset / 'target_concepts_count.json', 'w') as f:
    json.dump(target_concepts_count, f)

In [None]:
# Scratch version of the concept filter: for each verb keep the concepts
# whose count reaches `threshold_target_concepts_retained` (inclusive).
# Uses `target_concepts_count` and `path_dataset` from earlier cells.
target_concepts_retained = {}

for verb in target_concepts_count:
    target_concepts_retained[verb] = []
    for concept in target_concepts_count[verb]:
        if target_concepts_count[verb][concept] >= threshold_target_concepts_retained:
            target_concepts_retained[verb].append(concept)
            
# Persist as verb -> list of retained concepts.
with open(path_dataset / 'target_concepts_retained.json', 'w') as f:
    json.dump(target_concepts_retained, f)

In [24]:
# Take half of the verbs: sort them alphabetically and cut at len // 2
# (with an odd count, the second half gets the extra verb).

with open(path_dataset / 'metadata.json') as f:
    metadata = json.load(f)

list_target = sorted(metadata.keys())

cutoff = len(list_target) // 2

first_half = list_target[:cutoff]
second_half = list_target[cutoff:]

In [27]:
# Write the two verb groups next to the dataset. The group files are read
# back from `path_dataset / 'verb_group_1.txt'` when the imbalanced training
# sets are assembled, so they must not land in the working directory.
with open(path_dataset / 'verb_group_1.txt', 'w') as f:
    for verb in first_half:
        f.write(str(verb) + "\n")

with open(path_dataset / 'verb_group_2.txt', 'w') as f:
    for verb in second_half:
        f.write(str(verb) + "\n")

In [16]:
# Scratch version of the gender-balancing step: per verb, copy the same
# number of male and female images into `full_balanced` and collect their
# metadata. Uses `path_dataset` from an earlier cell.
with open(path_dataset / 'metadata_by_gender.json') as f:
    metadata_with_gender = json.load(f)

path_dataset_original = path_dataset / 'original'
path_dataset_balanced = path_dataset / 'full_balanced'

metadata_full_balanced = {}

for target, value in metadata_with_gender.items():
    # Sample size: the smaller of the two gender counts for this verb.
    min_samples = min(len(value['male']), len(value['female']))
    metadata_full_balanced[target] = {'male': [], 'female': []}

    # Note that sort will take '101' before '99'
    for gender in ['male', 'female']:
        path_src = path_dataset_original / target / gender
        image_list = os.listdir(path_src)
        image_list.sort()

        path_dest = path_dataset_balanced / target / gender

        os.makedirs(path_dest, exist_ok=True)

        for i in range(min_samples):
            shutil.copy(path_src / image_list[i], path_dest / image_list[i])
            metadata_full_balanced[target][gender].append(metadata_with_gender[target][gender][image_list[i]])

with open(path_dataset / 'metadata_full_balanced.json', 'w') as f:
    json.dump(metadata_full_balanced, f)

In [22]:
def train_test_split(path_dataset, test_size=0.25, random_seed=0):
    """Split ``full_balanced`` into train and test folders, per verb and gender.

    For each verb and gender, ``int(n * (1 - test_size))`` images are sampled
    for training; the rest are test images. Copies get a ``_<gender>``
    suffix before the extension and go to:

    * ``train/train_full/<verb>/<gender>``        (train images)
    * ``test/<verb>``                             (test images, genders mixed)
    * ``test_with_gender/<verb>/<gender>``        (test images)

    Args:
        path_dataset: ``Path`` to the dataset folder.
        test_size: fraction of each gender folder held out for testing.
        random_seed: seed passed to ``random.seed``.
    """
    random.seed(random_seed)

    path_balanced = path_dataset / 'full_balanced'

    # os.listdir returns entries in arbitrary order; sorting makes the split
    # depend only on the seed and the file names.
    for target in sorted(os.listdir(path_balanced)):
        path_target = path_balanced / target
        path_train = path_dataset / 'train' / 'train_full' / target
        path_test = path_dataset / 'test' / target
        path_test_with_gender = path_dataset / 'test_with_gender' / target

        os.makedirs(path_test, exist_ok=True)

        for gender in ('male', 'female'):
            path_src = path_target / gender
            images = sorted(os.listdir(path_src))
            train_images = random.sample(images, int(len(images) * (1 - test_size)))
            in_train = set(train_images)
            test_images = [image for image in images if image not in in_train]

            # The gender folders belong under test_with_gender (not under
            # test, which stays flat) and are created one by one.
            path_train_gender = path_train / gender
            path_test_gender = path_test_with_gender / gender
            os.makedirs(path_train_gender, exist_ok=True)
            os.makedirs(path_test_gender, exist_ok=True)

            for image in train_images:
                base, ext = os.path.splitext(image)
                shutil.copy(path_src / image, path_train_gender / f'{base}_{gender}{ext}')

            for image in test_images:
                base, ext = os.path.splitext(image)
                filename = f'{base}_{gender}{ext}'
                shutil.copy(path_src / image, path_test / filename)
                shutil.copy(path_src / image, path_test_gender / filename)

# Run the split with its defaults (test_size=0.25, random_seed=0).
train_test_split(path_dataset)

In [47]:
def train_test_split_functioning_dataset(path_dataset):
    """Assemble the three train/test variants under ``train_test_split``.

    Every verb folder of ``test`` is copied as the test set of
    ``train_balanced``, ``train_imbalanced_1`` and ``train_imbalanced_2``.
    Training data comes from ``train/train_half`` for the balanced variant.
    For verbs listed in ``verb_group_1.txt`` the male images of
    ``train/train_full`` go to variant 1 and the female images to variant 2;
    for all other verbs the genders are swapped.

    Args:
        path_dataset: ``Path`` to the dataset folder.
    """
    path_test = path_dataset / 'test'
    train_root = path_dataset / 'train'
    split_root = path_dataset / 'train_test_split'

    balanced = split_root / 'train_balanced'
    imbalanced_1 = split_root / 'train_imbalanced_1'
    imbalanced_2 = split_root / 'train_imbalanced_2'

    with open(path_dataset / 'verb_group_1.txt', 'r') as f:
        group_1 = [line.strip() for line in f]

    for target in os.listdir(path_test):
        # The same test set is shared by the three variants.
        for variant in (balanced, imbalanced_1, imbalanced_2):
            shutil.copytree(path_test / target, variant / 'test' / target)

        shutil.copytree(train_root / 'train_half' / target, balanced / 'train' / target)

        if target in group_1:
            gender_1, gender_2 = 'male', 'female'
        else:
            gender_1, gender_2 = 'female', 'male'

        full_target = train_root / 'train_full' / target
        shutil.copytree(full_target / gender_1, imbalanced_1 / 'train' / target)
        shutil.copytree(full_target / gender_2, imbalanced_2 / 'train' / target)
                        
# Build the balanced / imbalanced variants; needs verb_group_1.txt inside
# path_dataset and fails if the train_test_split folders already exist
# (shutil.copytree refuses existing destinations by default).
train_test_split_functioning_dataset(path_dataset)

In [None]:
# Scratch arithmetic.
# NOTE(review): presumably the expected image counts for one verb with 63
# balanced images per gender and test_size=0.25 (int(63 * 0.75) = 47 train
# and 16 test per gender, 47 // 2 = 23 per gender in train_half) — confirm.
full_balanced = 63 * 2
test = 32 
train_full = 47*2
train_half = 46

In [41]:
def sample_half_train(path_dataset, random_seed=0):
    """Build ``train/train_half`` from half of each gender's training images.

    For every verb folder found in ``path_dataset / 'test'``, ``n // 2``
    files are sampled from each of ``train/train_full/<verb>/male`` and
    ``.../female``. They are copied, genders mixed, into
    ``train/train_half/<verb>``.

    Args:
        path_dataset: ``Path`` to the dataset folder.
        random_seed: seed passed to ``random.seed``.
    """
    random.seed(random_seed)

    path_train_full = path_dataset / 'train' / 'train_full'
    path_train_half = path_dataset / 'train' / 'train_half'

    # Sorted listings: os.listdir order is arbitrary, which would make the
    # sample differ between machines even with the same seed.
    for target in sorted(os.listdir(path_dataset / 'test')):
        path_half_target = path_train_half / target
        # exist_ok lets the step be re-run on an existing dataset folder.
        os.makedirs(path_half_target, exist_ok=True)

        for gender in ('male', 'female'):
            path_gender = path_train_full / target / gender
            files = sorted(os.listdir(path_gender))

            for file in random.sample(files, len(files) // 2):
                shutil.copy(path_gender / file, path_half_target / file)
                
# Sample the half-size training set with the default seed (random_seed=0).
sample_half_train(path_dataset)

In [None]:
# Works only for binary dataset
def create_train_dataset(path_dataset, random_seed=0):
    """Create the balanced and the two gender-imbalanced training sets.

    Categories are the entries of
    ``human_images/datasets/original_with_gender_balanced``; images are read
    from ``human_images/datasets/train/<category>/<gender>``.

    * ``train_balanced/<category>`` receives ``n // 2`` randomly sampled
      files from each gender folder.
    * For the first listed category, the male folder is copied to
      ``train_imbalanced_1`` and the female folder to ``train_imbalanced_2``.
      For every other category the genders are swapped.
    * The name of the first category is written to
      ``human_images/class_male_1.txt``.

    Args:
        path_dataset: ``Path`` to the dataset folder.
        random_seed: seed passed to ``random.seed``.
    """
    random.seed(random_seed)

    datasets_root = path_dataset / 'human_images' / 'datasets'
    categories = os.listdir(datasets_root / 'original_with_gender_balanced')

    balanced_root = datasets_root / 'train_balanced'
    imbalanced_root_1 = datasets_root / 'train_imbalanced_1'
    imbalanced_root_2 = datasets_root / 'train_imbalanced_2'

    for index, category in enumerate(categories):
        is_first_category = index == 0
        balanced_dir = balanced_root / category

        for gender in ['male', 'female']:
            gender_dir = datasets_root / 'train' / category / gender

            if not os.path.exists(balanced_dir):
                os.makedirs(balanced_dir)

            candidates = os.listdir(gender_dir)
            for name in random.sample(candidates, int(len(candidates) // 2)):
                shutil.copy(gender_dir / name, balanced_dir / name)

            # Male goes to set 1 only for the first category; for all other
            # categories it is the female folder that goes to set 1.
            to_first_set = (gender == 'male') == is_first_category
            imbalanced_root = imbalanced_root_1 if to_first_set else imbalanced_root_2
            shutil.copytree(gender_dir, imbalanced_root / category)

            if is_first_category and gender == 'male':
                with open(path_dataset / 'human_images' / 'class_male_1.txt', 'w') as f:
                    f.write(f'{category}\n')




# Code for binary classification

In [None]:
import os
import random
from pathlib import Path
import json
import shutil


def create_dataset(verbs, path_dataset, image_dir='data/datasets/imSitu/original_data'):
    """Copy every image whose file name starts with a verb into that verb's folder.

    Images are copied from ``image_dir`` to
    ``path_dataset / 'original' / <verb>``.

    Args:
        verbs: iterable of verb names.
        path_dataset: ``Path`` to the dataset folder.
        image_dir: folder containing the source images; defaults to the
            imSitu image folder relative to the working directory.
    """
    # Path to the directory where the new folders will be created
    output_dir = path_dataset / 'original'

    # List the image directory once instead of once per verb.
    filenames = os.listdir(image_dir)

    for verb in verbs:
        # Create a new directory for the verb if it doesn't already exist
        verb_dir = os.path.join(output_dir, verb)
        os.makedirs(verb_dir, exist_ok=True)

        for filename in filenames:
            # Plain prefix match: a verb that is a prefix of another verb
            # would also collect the other verb's images.
            if filename.startswith(verb):
                shutil.copy(
                    os.path.join(image_dir, filename),
                    os.path.join(verb_dir, filename),
                )

def create_targets(verbs, path_dataset, threshold_target_concepts_retained=10, path_imsitu=None):
    """Create the agent-count, concept-count and retained-concept dictionaries.

    Reads ``<path_imsitu>/metadata/full.json`` (verb -> image id -> record
    holding 'agent' and 'concepts') and writes four JSON files into
    ``path_dataset``:

    * ``target_agent_count.json``: verb -> agent -> number of images
    * ``target_concept_count.json``: verb -> concept -> number of occurrences
    * ``target_concepts_retained.json``: verb -> concepts whose count
      reaches the threshold (inclusive)
    * ``target_original_metadata.json``: the ``full.json`` entries of the
      selected verbs

    Args:
        verbs: iterable of verb names to keep (keys of ``full.json``).
        path_dataset: ``Path`` to an existing dataset folder.
        threshold_target_concepts_retained: minimum count for a concept to
            be retained.
        path_imsitu: imSitu root folder; defaults to
            ``Path('data/datasets/imSitu')``.

    Raises:
        KeyError: if a verb is not present in ``full.json``.
    """
    if path_imsitu is None:
        path_imsitu = Path('data/datasets/imSitu')
    with open(Path(path_imsitu) / 'metadata' / 'full.json') as f:
        full_targets = json.load(f)

    selected_targets = {verb: full_targets[verb] for verb in verbs}

    target_concept_count = {}
    target_agent_count = {}
    for verb, images in selected_targets.items():
        concept_count = target_concept_count[verb] = {}
        agent_count = target_agent_count[verb] = {}

        for image in images.values():
            image_agent = image['agent']
            agent_count[image_agent] = agent_count.get(image_agent, 0) + 1

            for concept in image['concepts']:
                concept_count[concept] = concept_count.get(concept, 0) + 1

    target_concepts_retained = {
        verb: [
            concept
            for concept, count in concept_count.items()
            if count >= threshold_target_concepts_retained
        ]
        for verb, concept_count in target_concept_count.items()
    }

    # Save target_agent_count dictionary to JSON
    with open(path_dataset / 'target_agent_count.json', 'w') as f:
        json.dump(target_agent_count, f)

    # Save target_concept_count dictionary to JSON
    with open(path_dataset / 'target_concept_count.json', 'w') as f:
        json.dump(target_concept_count, f)

    # Save target_concepts_retained dictionary to JSON
    with open(path_dataset / 'target_concepts_retained.json', 'w') as f:
        json.dump(target_concepts_retained, f)

    with open(path_dataset / 'target_original_metadata.json', 'w') as f:
        json.dump(selected_targets, f)

def select_images_with_gender(path_dataset):
    """Copy the images whose agent is gendered and save their metadata.

    Reads ``path_dataset / 'target_original_metadata.json'`` (target ->
    image name -> record). Every image whose 'agent' contains one of 'man',
    'male', 'woman', 'female' is copied from ``original/<target>`` to
    ``human_images/datasets/original/<target>``. The kept records are
    written to ``human_images/metadata.json`` in the same
    target -> image -> record layout.

    Args:
        path_dataset: ``Path`` to the dataset folder.
    """
    list_gender = ['man', 'male', 'woman', 'female']
    with open(path_dataset / 'target_original_metadata.json') as f:
        target_original_metadata = json.load(f)

    path_human_images = path_dataset / 'human_images'
    # metadata.json is written here even when there is no target at all.
    os.makedirs(path_human_images, exist_ok=True)

    entries_with_gender = {}

    for target, images in target_original_metadata.items():
        current_folder = path_dataset / 'original' / target
        path_dest = path_human_images / 'datasets' / 'original' / target
        kept = entries_with_gender[target] = {}

        os.makedirs(path_dest, exist_ok=True)

        for image_name, metadata in images.items():
            if any(name in metadata['agent'] for name in list_gender):
                shutil.copy(current_folder / image_name, path_dest / image_name)
                kept[image_name] = metadata

    with open(path_human_images / 'metadata.json', 'w') as f:
        json.dump(entries_with_gender, f)

def select_images_with_gender_v2(path_dataset):
    """Put images with their target and respective gender.

    Reads ``path_dataset / 'target_original_metadata.json'`` (target ->
    image name -> record). Each image whose 'agent' is female or male is
    copied from ``original/<target>`` to
    ``human_images/datasets/original_with_gender/<target>/<gender>``. The
    kept records are written to ``human_images/metadata_v2.json``.

    Args:
        path_dataset: ``Path`` to the dataset folder.
    """
    with open(path_dataset / 'target_original_metadata.json') as f:
        target_original_metadata = json.load(f)
    list_male = ['man', 'male']
    list_female = ['woman', 'female']
    entries_with_gender = {}

    path_human_images = path_dataset / 'human_images'
    # metadata_v2.json is written here even when there is no target at all.
    os.makedirs(path_human_images, exist_ok=True)

    for target, images in target_original_metadata.items():
        current_folder = path_dataset / 'original' / target
        path_dest = path_human_images / 'datasets' / 'original_with_gender' / target
        entries_with_gender[target] = {'male': {}, 'female': {}}

        # Create each gender folder on its own, so an existing (possibly
        # empty) target folder does not leave them missing.
        for gender in ('male', 'female'):
            os.makedirs(path_dest / gender, exist_ok=True)

        for image_name, metadata in images.items():
            agent = metadata['agent']
            # Female first: the check is a containment test and 'man' /
            # 'male' are contained in 'woman' / 'female'.
            if any(name in agent for name in list_female):
                gender = 'female'
            elif any(name in agent for name in list_male):
                gender = 'male'
            else:
                continue

            entries_with_gender[target][gender][image_name] = metadata
            shutil.copy(current_folder / image_name, path_dest / gender / image_name)

    # Metadata is same format as target original metadata: dict[target][image] = metadata
    # Metadata V2 is dict[target][gender][image] = metadata
    with open(path_human_images / 'metadata_v2.json', 'w') as f:
        json.dump(entries_with_gender, f)

def train_test_split(path_dataset, test_size=0.25, random_seed=0):
    """Split ``original_with_gender_balanced`` into train and test folders.

    All folders live under ``path_dataset / 'human_images' / 'datasets'``.
    For each category and gender, ``int(n * (1 - test_size))`` images are
    sampled for training; the rest are test images. Copies get a
    ``_<gender>`` suffix before the extension and go to:

    * ``train/<category>/<gender>``              (train images)
    * ``test/<category>``                        (test images, genders mixed)
    * ``test_with_gender/<category>/<gender>``   (test images)

    Args:
        path_dataset: ``Path`` to the dataset folder.
        test_size: fraction of each gender folder held out for testing.
        random_seed: seed passed to ``random.seed``.
    """
    random.seed(random_seed)

    datasets_root = path_dataset / 'human_images' / 'datasets'
    balanced_root = datasets_root / 'original_with_gender_balanced'

    # os.listdir returns entries in arbitrary order; sorting makes the split
    # depend only on the seed and the file names.
    for category in sorted(os.listdir(balanced_root)):
        category_path = balanced_root / category
        train_path = datasets_root / 'train' / category
        test_path = datasets_root / 'test' / category
        test_with_gender_path = datasets_root / 'test_with_gender' / category

        os.makedirs(test_path, exist_ok=True)

        for gender in ('male', 'female'):
            source = category_path / gender
            images = sorted(os.listdir(source))
            train_images = random.sample(images, int(len(images) * (1 - test_size)))
            in_train = set(train_images)
            test_images = [image for image in images if image not in in_train]

            # Create each leaf folder on its own, so a partially created
            # tree from an earlier run does not break the copies.
            os.makedirs(train_path / gender, exist_ok=True)
            os.makedirs(test_with_gender_path / gender, exist_ok=True)

            for image in train_images:
                base, ext = os.path.splitext(image)
                shutil.copy(source / image, train_path / gender / f'{base}_{gender}{ext}')

            for image in test_images:
                base, ext = os.path.splitext(image)
                filename = f'{base}_{gender}{ext}'
                shutil.copy(source / image, test_path / filename)
                shutil.copy(source / image, test_with_gender_path / gender / filename)


def create_copy_with_suffix(suffix):
    """Build a copy function that appends ``_<suffix>`` to the destination name.

    The returned callable takes ``(src, dst)``, inserts ``_<suffix>`` between
    the root and the extension of ``dst`` (as split by ``os.path.splitext``)
    and copies ``src`` there with ``shutil.copy2``. It returns nothing.

    Args:
        suffix: Value formatted into the destination file name.

    Returns:
        A ``copy_with_suffix(src, dst)`` function.
    """
    def copy_with_suffix(src, dst):
        # Split once so the suffix lands before the extension, not after it
        stem, extension = os.path.splitext(dst)
        shutil.copy2(src, f"{stem}_{suffix}{extension}")

    return copy_with_suffix


# Works only for binary dataset
def create_train_dataset(path_dataset, random_seed=0):
    """Create the balanced and the two gender-imbalanced training folders.

    Categories are the entries of
    ``human_images/datasets/original_with_gender_balanced``. For every
    category and every gender ('male', 'female'), the images are read from
    ``human_images/datasets/train/<category>/<gender>`` and:

    * a random half (rounded down) is copied to
      ``human_images/datasets/train_balanced/<category>``;
    * the whole gender folder is copied to ``train_imbalanced_1/<category>``
      or ``train_imbalanced_2/<category>``: the first category sends its male
      images to ``train_imbalanced_1`` and its female images to
      ``train_imbalanced_2``, every other category does the opposite.

    The name of the first category is written to
    ``human_images/class_male_1.txt``.

    Categories and files are processed in sorted order so that the same
    ``random_seed`` yields the same result on every machine (``os.listdir``
    returns entries in arbitrary order).

    Args:
        path_dataset: Dataset root; must support the ``/`` operator
            (e.g. ``pathlib.Path``).
        random_seed: Seed passed to ``random.seed`` before sampling.

    Raises:
        FileExistsError: If an imbalanced destination folder already exists
            (``shutil.copytree`` refuses to overwrite it).
    """
    random.seed(random_seed)

    path_datasets = path_dataset / 'human_images' / 'datasets'
    path_train_balanced = path_datasets / 'train_balanced'
    path_train_imbalanced_1 = path_datasets / 'train_imbalanced_1'
    path_train_imbalanced_2 = path_datasets / 'train_imbalanced_2'

    # Sorted so that "the first category" does not depend on the filesystem
    categories = sorted(os.listdir(path_datasets / 'original_with_gender_balanced'))

    for i, category in enumerate(categories):
        path_category = path_datasets / 'train' / category
        os.makedirs(path_train_balanced / category, exist_ok=True)

        for gender in ['male', 'female']:
            path_category_gender = path_category / gender

            # Sorted so that the seeded sample picks the same files everywhere
            files = sorted(os.listdir(path_category_gender))
            train_balanced = random.sample(files, len(files) // 2)

            for file in train_balanced:
                shutil.copy(path_category_gender / file, path_train_balanced / category / file)

            # First category: male -> imbalanced_1, female -> imbalanced_2.
            # Other categories: female -> imbalanced_1, male -> imbalanced_2.
            if (gender == 'male') == (i == 0):
                path_imbalanced = path_train_imbalanced_1
            else:
                path_imbalanced = path_train_imbalanced_2
            shutil.copytree(path_category_gender, path_imbalanced / category)

            # Record which category holds the male images in train_imbalanced_1
            if i == 0 and gender == 'male':
                with open(path_dataset / 'human_images' / 'class_male_1.txt', 'w') as f:
                    f.write(f'{category}\n')



def create_balanced_dataset(path_dataset):
    """Copy the same number of images for every target and gender.

    The targets are the keys of ``human_images/metadata_v2.json``. For each
    target and each gender ('male', 'female'), the first
    ``get_lowest_number(path_dataset)`` images, in sorted file-name order, are
    copied from
    ``human_images/datasets/original_with_gender/<target>/<gender>`` to
    ``human_images/datasets/original_with_gender_balanced/<target>/<gender>``.

    Args:
        path_dataset: Dataset root; must support the ``/`` operator
            (e.g. ``pathlib.Path``).

    Raises:
        IndexError: If a source folder holds fewer images than the balanced
            size. Raised before anything is copied for that folder.
    """
    # Use a context manager so the metadata file handle is always closed
    with open(path_dataset / 'human_images' / 'metadata_v2.json') as f:
        target_metadata = json.load(f)

    lowest_number = get_lowest_number(path_dataset)

    path_datasets = path_dataset / 'human_images' / 'datasets'
    path_dataset_balanced = path_datasets / 'original_with_gender_balanced'

    for target in target_metadata:
        for gender in ['male', 'female']:

            path_src = path_datasets / 'original_with_gender' / target / gender

            # The sort is lexicographic ('101' comes before '99'); that is not
            # a problem because only a stable order is needed
            image_list = sorted(os.listdir(path_src))

            if len(image_list) < lowest_number:
                raise IndexError(
                    f'{path_src} contains {len(image_list)} images, '
                    f'expected at least {lowest_number}'
                )

            path_dest = path_dataset_balanced / target / gender
            os.makedirs(path_dest, exist_ok=True)

            for image_name in image_list[:lowest_number]:
                shutil.copy(path_src / image_name, path_dest / image_name)


def get_lowest_number(path_dataset):
    """Return the smallest image count over every target and gender.

    Reads ``human_images/metadata_v2.json``, expected to map each target to a
    mapping whose values are sized collections of images, and returns the
    minimum ``len(images)`` found.

    Args:
        path_dataset: Dataset root; must support the ``/`` operator
            (e.g. ``pathlib.Path``).

    Returns:
        The lowest number of images of any (target, gender) entry, or 0 when
        the metadata contains no entry at all.
    """
    # Use a context manager so the metadata file handle is always closed
    with open(path_dataset / 'human_images' / 'metadata_v2.json') as f:
        target_metadata = json.load(f)

    # min() avoids an arbitrary upper-bound sentinel that would cap the result
    return min(
        (
            len(images)
            for entries in target_metadata.values()
            for images in entries.values()
        ),
        default=0,
    )


def transfer_to_train_test_split(path_dataset):
    """Assemble the final ``train_test_split`` and gendered ``test`` folders.

    For every category found in ``human_images/datasets/test``:

    * the category's test folder is copied to
      ``human_images/train_test_split/<variant>/test/<category>`` for each
      variant ('balanced', 'imbalanced_1', 'imbalanced_2');
    * the matching ``train_<variant>`` folder is copied to
      ``human_images/train_test_split/<variant>/train/<category>``;
    * each gender folder of ``human_images/datasets/test_with_gender`` is
      copied to ``human_images/test/<category>_<gender>``.

    Args:
        path_dataset: Dataset root; must support the ``/`` operator
            (e.g. ``pathlib.Path``).
    """
    human_images = path_dataset / 'human_images'
    datasets = human_images / 'datasets'

    test_root = datasets / 'test'
    gendered_test_src = datasets / 'test_with_gender'
    gendered_test_dst = human_images / 'test'
    split_root = human_images / 'train_test_split'

    # Variant name in train_test_split -> training folder it is built from
    train_sources = {
        'balanced': datasets / 'train_balanced',
        'imbalanced_1': datasets / 'train_imbalanced_1',
        'imbalanced_2': datasets / 'train_imbalanced_2',
    }

    for category in os.listdir(test_root):
        # Every variant shares the same test images
        for variant in train_sources:
            shutil.copytree(test_root / category, split_root / variant / 'test' / category)

        for variant, train_root in train_sources.items():
            shutil.copytree(train_root / category, split_root / variant / 'train' / category)

        for gender in ('male', 'female'):
            shutil.copytree(
                gendered_test_src / category / gender,
                gendered_test_dst / f'{category}_{gender}',
            )


def pipeline(verbs, dataset_name):
    """Run every dataset-building step, in order, for one set of verbs.

    The dataset root is ``data/datasets/imSitu/data/<dataset_name>``, a
    relative path, so it is resolved against the current working directory.

    Args:
        verbs: Verbs forwarded to ``create_dataset`` and ``create_targets``.
        dataset_name: Folder name of the dataset under the imSitu data folder.
    """
    dataset_root = Path('data/datasets/imSitu/data') / dataset_name

    # These two steps also need the list of verbs
    create_dataset(verbs, dataset_root)
    create_targets(verbs, dataset_root, 10)

    # The remaining steps only take the dataset root and run in this order
    stages = (
        select_images_with_gender,
        select_images_with_gender_v2,
        create_balanced_dataset,
        train_test_split,
        create_train_dataset,
        transfer_to_train_test_split,
    )
    for stage in stages:
        stage(dataset_root)
    # create_imbalanced_datasets_binary(path_dataset)
    


# Build the 'phoning_eating' dataset by running the whole pipeline on the
# two verbs 'phoning' and 'eating'.
phoning_eating = ['phoning', 'eating']
dataset_name = 'phoning_eating'
pipeline(phoning_eating, dataset_name)

# Disabled: alternative dataset root ('phoning_cooking').
#path_dataset = Path('data/datasets/imSitu/data/phoning_cooking')

# The string literal below is not executed; it only keeps older manual calls.
"""
verbs = ['cooking', 'driving', 'cleaning', 'phoning']
phoning_eating = ['phoning', 'eating']
create_dataset(phoning_eating)
create_targets(phoning_eating, 10)
"""
# Disabled one-off calls to individual steps.
# select_images_with_gender_v2(path_dataset)
# create_balanced_dataset(Path('data/datasets/imSitu/data/phoning_cooking/human_images'))
# create_imbalanced_datasets_binary(Path('data/datasets/imSitu/data/phoning_cooking/human_images'))