# Task 3 – Distributed Simulation: Split full dataset into 2 nodes and show results table

In [7]:

import os
import time
from multiprocessing import Process, Manager
from PIL import Image
import pandas as pd

input_folder = '/content/drive/MyDrive/Images/PADC_dataset_image'
output_folder = '/content/drive/MyDrive/Images/PADC_dataset_image/outputdistributed2'
os.makedirs(output_folder, exist_ok=True)

# Load watermark image
watermark_img_path = '/content/ich6.jpeg'
watermark = Image.open(watermark_img_path).convert("RGBA").resize((50, 50))

# Gather all image paths in dataset (with subfolder info)
image_rel_paths = []
class_folders = [f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f))]
for class_folder in class_folders:
    class_path = os.path.join(input_folder, class_folder)
    for img_file in os.listdir(class_path):
        if img_file.lower().endswith(('jpg', 'jpeg', 'png')):
            image_rel_paths.append(os.path.join(class_folder, img_file))

# Split into two (nearly) equal parts
split_point = len(image_rel_paths) // 2
node1_imgs = image_rel_paths[:split_point]
node2_imgs = image_rel_paths[split_point:]

def process_images(image_list, node_name, result_list, idx):
    start = time.perf_counter()
    for rel_path in image_list:
        class_folder, img_file = os.path.split(rel_path)
        in_path = os.path.join(input_folder, class_folder, img_file)
        out_class_path = os.path.join(output_folder, f"{node_name}", class_folder)
        os.makedirs(out_class_path, exist_ok=True)
        out_path = os.path.join(out_class_path, img_file)
        img = Image.open(in_path).resize((128,128)).convert("RGBA")
        img.paste(watermark, (10, 10), watermark)
        img = img.convert("RGB")
        img.save(out_path)
    duration = time.perf_counter() - start
    result_list[idx] = duration

if __name__ == '__main__':
    m = Manager()
    results = m.list([0, 0])
    p1 = Process(target=process_images, args=(node1_imgs, 'node1', results, 0))
    p2 = Process(target=process_images, args=(node2_imgs, 'node2', results, 1))
    t0 = time.perf_counter()
    p1.start()
    p2.start()
    p1.join()
    p2.join()
    t1 = time.perf_counter()

    # Show tabular output
    node_times = [results[0], results[1]]
    df = pd.DataFrame({
        'Node': ['Node 1', 'Node 2'],
        'Duration (sec)': node_times
    })
    df['Percentage'] = 100 * df['Duration (sec)'] / sum(node_times)
    print(df)
    print(f"\nTotal Distributed Simulation Time: {t1-t0:.2f} seconds")


     Node  Duration (sec)  Percentage
0  Node 1        0.000003   53.848695
1  Node 2        0.000003   46.151305

Total Distributed Simulation Time: 0.02 seconds
