In [1]:
import os
import shutil

def organize_images(source_folder, destination_folder):
    """Copy every ``.jpg`` found under ``source_folder`` into per-direction folders.

    The tree below ``source_folder`` is walked recursively. Each image is
    copied to ``destination_folder/<direction>/``, where ``<direction>`` is the
    first path component of the image's folder relative to ``source_folder``.
    Images that sit directly in ``source_folder`` have no such component and
    are copied straight into ``destination_folder``.

    If two images that map to the same direction folder share a file name
    (e.g. they live in different nested sub-folders), the later one is stored
    as ``<stem>_1<ext>``, ``<stem>_2<ext>``, ... instead of silently
    overwriting the earlier copy.

    Parameters:
    - source_folder: Root directory that is searched for images.
    - destination_folder: Directory that receives the direction folders;
      created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(destination_folder, exist_ok=True)

    # Used to skip the destination when it is nested inside the source tree,
    # otherwise freshly copied images would be scanned and copied again.
    dest_root = os.path.normcase(os.path.abspath(destination_folder))

    # Targets written during THIS call. Tracking per call (rather than checking
    # the disk) keeps re-runs idempotent: a second run overwrites the same
    # files instead of piling up additional "_1" copies.
    written_targets = set()

    def unique_target(dest_folder, file_name):
        # Return a path inside dest_folder that has not been written in this call
        stem, ext = os.path.splitext(file_name)
        candidate = os.path.join(dest_folder, file_name)
        counter = 1
        # normcase makes the comparison case-insensitive on Windows only
        while os.path.normcase(candidate) in written_targets:
            candidate = os.path.join(dest_folder, f"{stem}_{counter}{ext}")
            counter += 1
        written_targets.add(os.path.normcase(candidate))
        return candidate

    def dfs_search_and_copy(current_folder, relative_path=""):
        # sorted() gives a deterministic traversal, so collision suffixes are
        # assigned to the same files on every run
        for item in sorted(os.listdir(current_folder)):
            item_path = os.path.join(current_folder, item)
            if os.path.isdir(item_path):
                if os.path.normcase(os.path.abspath(item_path)) == dest_root:
                    continue
                # Continue the depth-first search in the sub-directory
                dfs_search_and_copy(item_path, os.path.join(relative_path, item))
            elif item.lower().endswith('.jpg'):
                # The first component of the relative path names the direction
                direction_folder = relative_path.split(os.sep)[0]
                dest_folder = os.path.join(destination_folder, direction_folder)
                os.makedirs(dest_folder, exist_ok=True)

                # Copy the image into its direction folder
                shutil.copy(item_path, unique_target(dest_folder, item))

    # Start the DFS from the source folder
    dfs_search_and_copy(source_folder)


In [2]:
# Example usage: copy the images under source_folder into destination_folder
source_folder = "doppelgangers/images/train_set_noflip/Arc_de_Triomphe_du_Carrousel_by_angle"
destination_folder = "Arc_de_T"
organize_images(
    source_folder=source_folder,
    destination_folder=destination_folder,
)

# Next, randomly pair up images to build an image-pair dataset

In [3]:
import os
import random
import pandas as pd

def create_unique_image_pairs_dataset(folder_path, output_csv, num_pairs, seed=None):
    """Build a CSV of unique random image pairs labelled by shared direction.

    Every sub-directory of ``folder_path`` is treated as one direction and its
    ``.jpg`` files as that direction's images. Pairs of two different images
    are drawn at random; a pair is never repeated, regardless of order. The
    label is True when both images come from the same direction folder.

    Parameters:
    - folder_path: Directory containing one sub-directory per direction.
    - output_csv: Path of the CSV file to write (without the index column).
    - num_pairs: Number of pairs requested. If fewer unique pairs exist, all
      of them are returned and a warning is printed.
    - seed: Optional seed for a private random generator, making the result
      reproducible. When None, the module-level ``random`` state is used.

    Returns:
    - A pandas DataFrame with the columns 'Image1', 'Image2' and 'Label'.
    """
    from itertools import combinations

    rng = random.Random(seed) if seed is not None else random

    # Step 1: Collect images grouped by direction. Listings are sorted because
    # os.listdir order is arbitrary and would defeat seeding.
    direction_images = {}
    for direction in sorted(os.listdir(folder_path)):
        direction_path = os.path.join(folder_path, direction)
        if os.path.isdir(direction_path):
            direction_images[direction] = [
                os.path.join(direction_path, img)
                for img in sorted(os.listdir(direction_path))
                if img.lower().endswith(".jpg")
            ]

    # Step 2: Flatten to (image_path, direction) tuples for random selection
    all_images = [
        (img, direction)
        for direction, imgs in direction_images.items()
        for img in imgs
    ]

    # Step 3: Generate unique pairs and labels
    total_images = len(all_images)
    max_pairs = total_images * (total_images - 1) // 2
    # Never aim for more pairs than exist; also covers the no-image case,
    # where random.choice would raise IndexError
    target = max(0, min(num_pairs, max_pairs))

    data = []
    if target * 2 >= max_pairs:
        # Dense request: rejection sampling would mostly hit duplicates, so
        # enumerate every pair and sample exactly. The list holds at most
        # 2 * target entries, so memory stays proportional to the request.
        chosen = rng.sample(list(combinations(all_images, 2)), target)
        for first, second in chosen:
            # Randomise which image lands in which column, as random drawing does
            if rng.random() < 0.5:
                first, second = second, first
            (img1, dir1), (img2, dir2) = first, second
            data.append({"Image1": img1, "Image2": img2, "Label": dir1 == dir2})
    else:
        seen_pairs = set()
        attempts = 0  # Bounds the loop even if the draws keep colliding
        max_attempts = target * 10
        while len(data) < target and attempts < max_attempts:
            img1, dir1 = rng.choice(all_images)
            img2, dir2 = rng.choice(all_images)

            if img1 != img2:
                # Sort so (img1, img2) and (img2, img1) count as the same pair
                pair = tuple(sorted((img1, img2)))
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    data.append({"Image1": img1, "Image2": img2, "Label": dir1 == dir2})

            attempts += 1

    # Warn based on the actual result, not on the attempt counter
    if len(data) < num_pairs:
        print("Warning: Could not generate the desired number of unique pairs.")

    # Step 4: Save to CSV; explicit columns keep the header when data is empty
    df = pd.DataFrame(data, columns=["Image1", "Image2", "Label"])
    df.to_csv(output_csv, index=False)
    print(f"Dataset saved to {output_csv} with {len(data)} pairs.")

    return df


In [5]:
# Example usage: build the pair dataset from the organized image folder
folder_path = "Arc_de_T"
output_csv = "Arc_de_T_pairs.csv"
num_pairs = 10_000  # how many random pairs to request
df = create_unique_image_pairs_dataset(
    folder_path=folder_path,
    output_csv=output_csv,
    num_pairs=num_pairs,
)

Dataset saved to Arc_de_T_pairs.csv with 10000 pairs.


In [6]:
# Preview the first five rows of the pair dataset
df.head(n=5)

Unnamed: 0,Image1,Image2,Label
0,Arc_de_T\front_right\L'Histoire and Arc de Tri...,Arc_de_T\front_left\Carrousel (32579996374).jpg,False
1,Arc_de_T\front_right\Paris 238.jpg,Arc_de_T\front_left\Arc de Triomphe du Carrous...,False
2,"Arc_de_T\back\Street vendor, Jardin des Tuiler...",Arc_de_T\front\Paris Arc de Triomphe du Carrou...,False
3,Arc_de_T\right\Paris - Arc de Triomphe du Carr...,Arc_de_T\front_left\Arc de Triomphe du Carrous...,False
4,"Arc_de_T\front_left\Achille Quinet, Arc du Car...",Arc_de_T\front_left\Paris Arc de Triomphe du C...,True


In [7]:
def compute_label_ratio(df):
    """Print how many rows are labelled True and False, with their shares.

    Parameters:
    - df: pandas DataFrame with a 'Label' column.

    Prints three lines: a header, then the count and percentage of True and
    of False labels. An empty frame is reported as 0 rows / 0.00% for both
    labels instead of dividing by zero.
    """
    # Count the occurrences of each label value
    label_counts = df['Label'].value_counts()

    total = int(label_counts.sum())
    true_count = int(label_counts.get(True, 0))
    false_count = int(label_counts.get(False, 0))

    # Guard the division: with no rows there is no meaningful ratio
    true_ratio = true_count / total if total else 0.0
    false_ratio = false_count / total if total else 0.0

    print("Label Distribution:")
    print(f"True: {true_count} ({true_ratio:.2%})")
    print(f"False: {false_count} ({false_ratio:.2%})")

# Report the True/False label balance of the pair dataset
compute_label_ratio(df=df)

Label Distribution:
True: 2530 (25.30%)
False: 7470 (74.70%)


# Take a smaller portion of the data; 10k pairs is too many.

In [1]:
import pandas as pd

def sample_data(df, total_samples, true_ratio=1/3):
    """
    Samples a DataFrame with a specific ratio of True and False labels.

    Parameters:
    - df: pandas DataFrame with a 'Label' column holding True/False values
      (all other columns are carried along unchanged).
    - total_samples: Total number of rows to return.
    - true_ratio: Fraction of returned rows labelled True (default is 1/3).
      The number of True rows is floor(total_samples * true_ratio); the
      remainder is filled with False rows.

    Returns:
    - A pandas DataFrame with the sampled rows, shuffled, with a fresh
      0..n-1 index. Sampling and shuffling use a fixed random_state of 42.

    Raises:
    - ValueError: if true_ratio is outside [0, 1], total_samples is negative,
      or df does not contain enough rows of either label.
    """
    if not 0 <= true_ratio <= 1:
        raise ValueError(f"true_ratio must be between 0 and 1, got {true_ratio}.")
    if total_samples < 0:
        raise ValueError(f"total_samples must be non-negative, got {total_samples}.")

    # Floor with a small tolerance: plain int() truncates float noise, e.g.
    # 100 * 0.29 == 28.999999999999996 would yield 28 instead of 29
    num_true = int(total_samples * true_ratio + 1e-9)
    num_false = total_samples - num_true

    # Separate the DataFrame into True and False labels
    true_df = df[df['Label'] == True]
    false_df = df[df['Label'] == False]

    # Fail with a clear message instead of pandas' generic sampling error
    if num_true > len(true_df) or num_false > len(false_df):
        raise ValueError(
            f"Requested {num_true} True and {num_false} False rows, but only "
            f"{len(true_df)} True and {len(false_df)} False rows are available."
        )

    # Sample the required number of True and False labels
    sampled_true = true_df.sample(n=num_true, random_state=42)
    sampled_false = false_df.sample(n=num_false, random_state=42)

    # Combine, then shuffle so True and False rows are interleaved
    sampled_df = pd.concat([sampled_true, sampled_false])
    sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

    return sampled_df

# Example usage:
# Load the full pair dataset from CSV
df = pd.read_csv("Arc_de_T_pairs.csv")

# Draw 6000 pairs, one third of them labelled True
sampled_data = sample_data(df=df, total_samples=6_000, true_ratio=1 / 3)

# Save the sampled subset to CSV
sampled_data.to_csv("Arc_de_T_6k.csv", index=False)