In [18]:
import os
import random
import tempfile

import cv2
import numpy as np
from fpdf import FPDF
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance
from skimage.util import random_noise
from tqdm import tqdm

def noise_Gaussian(img):
    """Return a uint8 copy of ``img`` with additive Gaussian noise.

    The noise is generated by ``skimage.util.random_noise`` with zero mean and
    variance 0.01, clipped to the valid range. The result of that call is
    multiplied by 255 and cast to ``uint8``.

    Args:
        img (numpy array): The image to corrupt.
    Returns:
        numpy array: The noisy image as ``uint8``.
    """
    noisy = random_noise(img, mode='gaussian', mean=0, var=0.01, clip=True)
    # Rescale to the 8-bit range before casting.
    return (noisy * 255).astype(np.uint8)
def noise_SP(img):
    """Return a uint8 copy of ``img`` with salt-and-pepper noise.

    Uses ``skimage.util.random_noise`` in ``'s&p'`` mode with an even split
    between salt and pepper pixels (``salt_vs_pepper=0.5``). The result of
    that call is multiplied by 255 and cast to ``uint8``.

    Args:
        img (numpy array): The image to corrupt.
    Returns:
        numpy array: The noisy image as ``uint8``.
    """
    speckled = random_noise(img, mode='s&p', salt_vs_pepper=0.5, clip=True)
    # Rescale to the 8-bit range before casting.
    return (speckled * 255).astype(np.uint8)
def blur_Gaussian(img):
    """Blur ``img`` by convolving it with a 3x3 averaging kernel.

    NOTE: despite the function name, the kernel is uniform (every weight is
    1/9), i.e. a mean filter rather than a Gaussian-weighted one.

    Args:
        img (numpy array): The image to blur.
    Returns:
        numpy array: The filtered image; ``ddepth=-1`` asks OpenCV to keep the
        depth of the input.
    """
    mean_kernel = np.ones((3, 3), np.float32) / 9
    return cv2.filter2D(img, -1, mean_kernel)
def random_crop(img):
    """Return a random rectangular crop of ``img``.

    The crop starts within the first quarter of each axis and spans between
    one half and three quarters of that axis, so it always fits inside the
    image. The result is a slice (view) of the input, not a copy.

    Works for both colour ``(h, w, channels)`` and grayscale ``(h, w)``
    arrays; only the first two axes are cropped.

    Args:
        img (numpy array): The image to crop, with at least two dimensions.
    Returns:
        numpy array: The cropped region of ``img``.
    """
    h, w = img.shape[:2]
    # Draw order (row offset, column offset, height, width) is kept stable so
    # seeded runs stay reproducible.
    top, left = random.randint(0, h // 4), random.randint(0, w // 4)
    # For sides shorter than 4 px the upper bound (h//4)*3 falls below h//2,
    # which would make randint raise; clamp it to the lower bound instead.
    crop_h = random.randint(h // 2, max(h // 2, (h // 4) * 3))
    crop_w = random.randint(w // 2, max(w // 2, (w // 4) * 3))
    # Never return a zero-sized crop for a non-empty image.
    crop_h, crop_w = max(1, crop_h), max(1, crop_w)
    return img[top:top + crop_h, left:left + crop_w]
def flip(img):
    """Mirror ``img`` left-to-right in place and return it.

    The input array itself is modified (and returned), matching the original
    element-swapping behaviour, but the swap is done with one vectorized
    assignment instead of a Python loop over every pixel. Both colour
    ``(h, w, channels)`` and grayscale ``(h, w)`` arrays are accepted.

    Args:
        img (numpy array): The image to mirror; must be writable and have at
            least two dimensions.
    Returns:
        numpy array: The same array object, with its columns reversed.
    """
    # Copy the reversed view first so the assignment never reads columns that
    # have already been overwritten.
    img[:, :] = img[:, ::-1].copy()
    return img
def random_contrast(img):
    """
    Randomly change the contrast of an image given as a numpy array.

    The array is wrapped in a PIL image, passed through
    ``ImageEnhance.Contrast`` with a factor drawn uniformly from [0.5, 1.5],
    and converted back to a numpy array.
    Args:
        img (numpy array): The image to augment.
    Returns:
        numpy array: The contrast adjusted image.
    """
    source = Image.fromarray(img)
    strength = random.uniform(0.5, 1.5)
    adjusted = ImageEnhance.Contrast(source).enhance(strength)
    return np.array(adjusted)

def convert_pdfs_to_images(pdf_dir):
    """Render every page of every PDF in ``pdf_dir`` to a JPEG file.

    Images are written to a ``scraped_image_pdfs`` directory created next to
    ``pdf_dir`` (i.e. inside its parent directory). Page ``i`` of
    ``name.pdf`` is saved as ``name_i.jpg``.

    Args:
        pdf_dir (str): Directory containing the PDF files. A trailing path
            separator is tolerated.
    Returns:
        None
    """
    # normpath drops a trailing separator; without it dirname("a/b/") is
    # "a/b" and the output folder would land inside pdf_dir itself.
    parent_dir = os.path.dirname(os.path.normpath(pdf_dir))
    image_dir = os.path.join(parent_dir, 'scraped_image_pdfs')
    os.makedirs(image_dir, exist_ok=True)

    # Match the extension case-insensitively so files such as "X.PDF" are
    # not silently skipped.
    pdf_files = [name for name in os.listdir(pdf_dir)
                 if name.lower().endswith('.pdf')]
    for pdf_file in pdf_files:
        pages = convert_from_path(os.path.join(pdf_dir, pdf_file))
        stem = pdf_file[:-4]  # strip the 4-character ".pdf" suffix

        # Save each page as an image in the new directory
        for i, page in enumerate(pages):
            page.save(os.path.join(image_dir, f'{stem}_{i}.jpg'), 'JPEG')

def convert_images_to_pdfs(image_dir):
    """Create a one-page PDF for every ``.jpg`` file in ``image_dir``.

    Each image is re-saved as an RGB JPEG in a scratch file, scaled to fit
    inside a 190 x 280 box (in FPDF page units) while keeping its aspect
    ratio, and placed at (10, 8) on a fresh page. The PDF is written next to
    the image as ``<image name>.pdf`` and a confirmation line is printed.

    Args:
        image_dir (str): Directory containing the JPEG files; the PDFs are
            written into the same directory.
    Returns:
        None
    """
    max_width = 190
    max_height = 280

    # Get a list of all jpg files in the directory
    jpg_files = [name for name in os.listdir(image_dir) if name.endswith('.jpg')]

    for jpg_file in jpg_files:
        image_path = os.path.join(image_dir, jpg_file)

        # The scratch file lives in the system temp directory rather than in
        # image_dir: a fixed "temp_rgb.jpg" there could clobber a real input
        # and, if left behind after a failure, be converted on the next run.
        fd, img_path_rgb = tempfile.mkstemp(suffix='.jpg')
        os.close(fd)
        try:
            with Image.open(image_path) as img:
                # FPDF is fed an RGB JPEG regardless of the source mode.
                rgb_img = img if img.mode == 'RGB' else img.convert('RGB')
                rgb_img.save(img_path_rgb)
                width, height = rgb_img.size

            # Use the tighter of the two limits so the image fits the box in
            # both directions; choosing by orientation alone lets
            # near-square portrait images overflow the page width.
            scale = min(max_width / width, max_height / height)

            # Initialize PDF for each image
            pdf = FPDF()
            pdf.add_page()
            pdf.image(img_path_rgb, x=10, y=8, w=width * scale, h=height * scale)

            # Define PDF output path
            output_pdf_path = os.path.join(image_dir, f"{jpg_file[:-4]}.pdf")
            pdf.output(output_pdf_path)
        finally:
            # Clean up the scratch file even when conversion fails.
            os.remove(img_path_rgb)

        print(f"PDF created successfully: {output_pdf_path}")



In [19]:
def noise_augment(img,crop=True,canflip=True,blur=True):
    """Apply one noise function plus optional crop, flip and blur to ``img``.

    One of ``noise_Gaussian`` / ``noise_SP`` is always applied, chosen with
    equal probability. ``random_crop`` runs whenever ``crop`` is set; the
    flip and the blur each run with probability 0.5, and only if their flag
    (``canflip`` / ``blur``) is set.

    Args:
        img (numpy array): The image to augment.
        crop: Whether to apply ``random_crop``.
        canflip: Whether the random horizontal flip is allowed.
        blur: Whether the random blur is allowed.
    Returns:
        numpy array: The augmented image.
    """
    add_noise = noise_Gaussian if random.random() < 0.5 else noise_SP
    result = add_noise(img)

    if crop:
        result = random_crop(result)

    # Each coin toss is drawn before its flag is consulted, so the random
    # stream is consumed identically whatever the flags are.
    flip_roll = random.random() < 0.5
    if flip_roll and canflip:
        result = flip(result)

    blur_roll = random.random() < 0.5
    if blur_roll and blur:
        result = blur_Gaussian(result)

    return result
def apply_random_augmentations(img):
    """
    Augment an image with one noise function followed by a random selection
    of other augmentations.

    Exactly one of ``noise_Gaussian`` / ``noise_SP`` is applied first. Then
    between 1 and 3 distinct functions are sampled from ``blur_Gaussian``,
    ``random_crop``, ``flip`` and ``random_contrast`` and applied in the
    sampled order.
    Args:
        img (numpy array): The image to augment.
    Returns:
        numpy array: The augmented image.
    """
    # Mandatory step: one randomly chosen noise function.
    noise_step = random.choice([noise_Gaussian, noise_SP])
    augmented = noise_step(img)

    # Optional steps: 1 to 3 of these, without repetition.
    optional_steps = [blur_Gaussian, random_crop, flip, random_contrast]
    how_many = random.randint(1, 3)
    for step in random.sample(optional_steps, how_many):
        augmented = step(augmented)

    return augmented


def augment_images_in_directory(input_dir):
    """Write three randomly augmented variants of every ``.jpg`` in ``input_dir``.

    Results go to an ``augmented_pdf_images`` sub-directory of ``input_dir``
    (created if missing). For ``name.jpg`` the outputs are
    ``name_augmented_1.jpg`` .. ``name_augmented_3.jpg``, each produced by an
    independent call to ``apply_random_augmentations``.

    Args:
        input_dir (str): Directory containing the source JPEG files.
    Returns:
        None
    """
    output_dir = os.path.join(input_dir, 'augmented_pdf_images')
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith('.jpg'):
            continue

        # Close the file handle promptly, and force RGB so grayscale / CMYK /
        # alpha inputs yield the (h, w, 3) array that the 'RGB' fromarray
        # call below expects.
        with Image.open(os.path.join(input_dir, filename)) as source:
            img = np.array(source.convert('RGB'))

        stem = filename.rsplit('.', 1)[0]

        # Three independently augmented images per source file
        for i in range(1, 4):
            augmented_img = apply_random_augmentations(img)
            augmented_img_pil = Image.fromarray(augmented_img.astype('uint8'), 'RGB')
            new_filename = f"{stem}_augmented_{i}.jpg"
            augmented_img_pil.save(os.path.join(output_dir, new_filename))

if __name__=="__main__":
    # Script entry point. The three calls below are run one at a time by
    # (un)commenting them; the paths are hard-coded. Only the final
    # image -> PDF call is currently enabled.
   # path = input("Image directory path:")
    # Disabled: render the scraped one-page PDFs to JPEG images.
    ##convert_pdfs_to_images("/Users/nika/cloudtopology/GoogleScrap/scraped_onepage_pdfs")
    # Disabled: write three augmented variants of each JPEG.
    ##augment_images_in_directory("/Users/nika/cloudtopology/GoogleScrap/scraped_image_pdfs")
    # Enabled: wrap every augmented JPEG in a one-page PDF.
    convert_images_to_pdfs("/Users/nika/cloudtopology/GoogleScrap/scraped_image_pdfs/augmented_pdf_images")
    # Disabled code kept for reference: read every jpg/png/webp in `path`
    # with cv2, run noise_augment on it, and write the result under the same
    # file name into a "noise_augmented_imgs" sub-directory.
    #newpath = os.path.join(path,"noise_augmented_imgs")
    #if not os.path.exists(newpath):
        #os.makedirs(newpath)
    #supported_formats = ["jpg",'png','webp']
    #dir_list = os.listdir(path)
    #for i in tqdm(range(len(dir_list))):
       # f = dir_list[i]
        #if f.split(".")[-1] in supported_formats:
          #  img = cv2.imread(os.path.join(path,f))
           # img = noise_augment(img)
           # cv2.imwrite(os.path.join(newpath,f),img)
            

PDF created successfully: /Users/nika/cloudtopology/GoogleScrap/scraped_image_pdfs/augmented_pdf_images/Network-infrastructure-topology-map-4-24_0_augmented_2.pdf
PDF created successfully: /Users/nika/cloudtopology/GoogleScrap/scraped_image_pdfs/augmented_pdf_images/Network-infrastructure-topology-map-4-24_0_augmented_3.pdf
PDF created successfully: /Users/nika/cloudtopology/GoogleScrap/scraped_image_pdfs/augmented_pdf_images/Network-infrastructure-topology-map-4-24_0_augmented_1.pdf
PDF created successfully: /Users/nika/cloudtopology/GoogleScrap/scraped_image_pdfs/augmented_pdf_images/IPTP_Network_PoPs-dec2017_0_augmented_2.pdf
PDF created successfully: /Users/nika/cloudtopology/GoogleScrap/scraped_image_pdfs/augmented_pdf_images/IPTP_Network_PoPs-dec2017_0_augmented_3.pdf
PDF created successfully: /Users/nika/cloudtopology/GoogleScrap/scraped_image_pdfs/augmented_pdf_images/IPTP_Network_PoPs-dec2017_0_augmented_1.pdf
PDF created successfully: /Users/nika/cloudtopology/GoogleScrap/scr