In [1]:
# Fraction of the listed image files to keep; the loading cell passes
# 1 - training_frac to list_images_gts as the share to discard.
training_frac = 0.4 # use only 40% of files

# Seed handed to random.seed() inside create_img_pairs.
seed = 1234

In [2]:
import os, os.path as osp
from sklearn.model_selection import train_test_split
from collections import defaultdict
import itertools
import random
import pandas as pd


def list_join_dir(dir):
    """Return the sorted paths of the entries in ``dir``.

    Each name reported by ``os.listdir(dir)`` is joined onto ``dir`` and the
    resulting paths are returned as a new list in ascending order.
    """
    entry_paths = []
    for entry_name in os.listdir(dir):
        entry_paths.append(osp.join(dir, entry_name))
    entry_paths.sort()
    return entry_paths

def list_images_gts(img_dir, reduce_factor=None, random_state=1234):
    """List the image paths in ``img_dir`` together with their class ids.

    The class id is parsed from each file name: the text before the first
    ``.`` is split on ``_`` and its first piece is converted with ``int``
    (e.g. ``004_00027.png`` -> ``4``). A name whose prefix is not an integer
    raises ``ValueError``.

    Args:
        img_dir: Directory whose entries are listed (via ``list_join_dir``).
        reduce_factor: If truthy, passed to ``train_test_split`` as
            ``test_size``, i.e. the share of files to *discard*; the split is
            stratified on the class ids and only the "train" side is kept.
            If falsy, every file is returned.
        random_state: Seed forwarded to ``train_test_split``. Defaults to
            1234, the value that used to be hard-coded here.

    Returns:
        A ``(images, gts)`` tuple of parallel lists: file paths and the
        integer class id of each path.
    """
    images, gts = [], []
    for img_path in list_join_dir(img_dir):
        img_basename = osp.basename(img_path)
        # "<class>_<rest>.<ext>" -> "<class>_<rest>" -> "<class>" -> int
        class_id = int(img_basename.split('.')[0].split('_')[0])
        images.append(img_path)
        gts.append(class_id)

    if reduce_factor:
        # Keep only the "train" portion of the stratified split.
        images, _, gts, _ = train_test_split(
            images,
            gts,
            test_size=reduce_factor,
            stratify=gts,
            random_state=random_state,
        )

    return images, gts

def create_img_pairs(imgs, gts, rng_seed=None):
    """Build an equal number of same-class and different-class image pairs.

    Same-class pairs are every 2-combination of the images inside each class.
    The same number of different-class pairs is then sampled: the first image
    is drawn uniformly (with replacement) from all images, and the second is
    drawn from the images of every *other* class, each weighted by the size
    of its own class.

    Args:
        imgs: Sequence of image identifiers (e.g. file paths).
        gts: Class label of each entry in ``imgs``, in the same order.
        rng_seed: Seed for the random generator. When ``None`` (default) the
            module-level ``seed`` is used, which is the original behaviour.

    Returns:
        ``(pair_files, pair_labels)``: a list of ``(img, img)`` tuples and a
        parallel list of labels, 0 for same label and 1 otherwise. All
        same-label pairs come first.

    Raises:
        ValueError: If same-class pairs exist but fewer than two classes are
            present, so no different-class pair can be formed.
    """
    # NOTE: this reseeds the *global* ``random`` generator on every call.
    random.seed(seed if rng_seed is None else rng_seed)

    img_per_class = defaultdict(list)
    for img, gt in zip(imgs, gts):
        img_per_class[gt].append(img)

    # same label
    pair_files = []
    for class_imgs in img_per_class.values():
        pair_files += itertools.combinations(class_imgs, 2)
    same_len = len(pair_files)
    pair_labels = [0] * same_len  # 0 for same label, 1 otherwise

    if same_len and len(img_per_class) < 2:
        # Every candidate weight would be zero; fail with a clear message
        # instead of an obscure error (or a wrong pair) from random.choices.
        raise ValueError(
            "create_img_pairs needs at least two classes to build "
            "different-label pairs"
        )

    # For each possible class of the first image: one weight per image, zero
    # for images of that same class, the image's class size otherwise.
    weight_per_first_img_gt = {
        label: [0 if gt == label else len(img_per_class[gt])
                for _, gt in zip(imgs, gts)]
        for label in img_per_class
    }

    # different label
    first_imgs = random.choices(list(zip(imgs, gts)), k=same_len)
    for f_img, f_gt in first_imgs:
        s_img = random.choices(imgs, weights=weight_per_first_img_gt[f_gt])[0]
        pair_files.append((f_img, s_img))
    pair_labels += [1] * same_len

    return pair_files, pair_labels



# NOTE(review): absolute, machine-specific path; consider making it configurable.
train_dir = '/home/chris/Desktop/mestrado_repos/materias/mo434/project/dataset/project_dataset_corel/fold0/train'

# reduce_factor is the share of files to discard, so 1 - training_frac keeps
# a training_frac share of them.
imgs, gts = list_images_gts(train_dir, reduce_factor=1-training_frac)

# create_img_pairs returns (pair_files, pair_labels); unpack both so that
# pair_files is the list of (img, img) tuples rather than the whole tuple.
pair_files, pair_labels = create_img_pairs(imgs, gts)

# One row per pair. The original left `columns=` empty (a syntax error), so
# these column names are a choice made here.
df = pd.DataFrame(pair_files, columns=['first_img', 'second_img'])
df['label'] = pair_labels  # 0 for same label, 1 otherwise




In [3]:

# Display the value bound to pair_files by the pair-building cell.
pair_files


[('/home/chris/Desktop/mestrado_repos/materias/mo434/project/dataset/project_dataset_corel/fold0/train/004_00027.png',
  '/home/chris/Desktop/mestrado_repos/materias/mo434/project/dataset/project_dataset_corel/fold0/train/004_00018.png'),
 ('/home/chris/Desktop/mestrado_repos/materias/mo434/project/dataset/project_dataset_corel/fold0/train/004_00027.png',
  '/home/chris/Desktop/mestrado_repos/materias/mo434/project/dataset/project_dataset_corel/fold0/train/004_00017.png'),
 ('/home/chris/Desktop/mestrado_repos/materias/mo434/project/dataset/project_dataset_corel/fold0/train/004_00027.png',
  '/home/chris/Desktop/mestrado_repos/materias/mo434/project/dataset/project_dataset_corel/fold0/train/004_00013.png'),
 ('/home/chris/Desktop/mestrado_repos/materias/mo434/project/dataset/project_dataset_corel/fold0/train/004_00027.png',
  '/home/chris/Desktop/mestrado_repos/materias/mo434/project/dataset/project_dataset_corel/fold0/train/004_00039.png'),
 ('/home/chris/Desktop/mestrado_repos/materi

In [4]:
# Size of pair_files (the recorded output below is 772).
len(pair_files)

772