In [None]:
import argparse
import ast
import csv
import json
import os

import requests
from PIL import Image, ImageDraw
from tqdm import tqdm

In [1]:


# Define a class for downloading data
class DataDownloader:
    """Download the photos referenced by a JSON export and draw face boxes on them.

    The JSON file is expected to hold a list of case dictionaries. Each case
    needs the keys "case_status", "id" and "posts"; each post needs "photos";
    each photo needs "preview_url", "face_boxes" and "enhanced_faces".
    Records with a missing (or ``None``) required key are skipped and logged
    to ``missing_info.csv`` inside the output folder.
    """

    # Keys that must be present (and not None) at each level of the JSON data.
    REQUIRED_CASE_KEYS = ("case_status", "id", "posts")
    REQUIRED_POST_KEYS = ("photos",)
    REQUIRED_PHOTO_KEYS = ("preview_url", "face_boxes", "enhanced_faces")

    def __init__(self, json_path=None, base_folder=None):
        """Store the input/output paths, reading missing ones from the command line.

        Args:
            json_path: Path to the JSON data file.
            base_folder: Directory where images and the CSV log are written.
        """
        self.json_path = json_path
        self.base_folder = base_folder
        if not self.json_path or not self.base_folder:
            self.parse_args()

    def parse_args(self):
        """Fill ``json_path`` and ``base_folder`` from command-line arguments.

        A flag is only mandatory when the corresponding path was not already
        supplied to the constructor; an already supplied path serves as the
        default and is kept unless the flag overrides it.
        """
        parser = argparse.ArgumentParser(description='Process images based on provided JSON data.')
        parser.add_argument('-j', '--json', required=not self.json_path, default=self.json_path,
                            help='Path to the JSON data file.')
        parser.add_argument('-o', '--output', required=not self.base_folder, default=self.base_folder,
                            help='Directory where the images and missing data will be saved.')
        args = parser.parse_args()
        self.json_path = args.json
        self.base_folder = args.output

    def download_image(self, url, path, timeout=30):
        """Download ``url`` and save the response body to ``path``.

        Args:
            url: Address of the image.
            path: Destination file path.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: On connection problems, timeouts or a
                non-success HTTP status. The status is checked before the
                file is opened, so an error page is never saved as an image.
        """
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(path, 'wb') as out_file:
                # Stream in chunks instead of holding the whole body in memory.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    out_file.write(chunk)

    def check_missing_keys(self, dictionary, required_keys):
        """Return the keys from ``required_keys`` that are absent or ``None``.

        Args:
            dictionary: The record to inspect.
            required_keys: Iterable of keys the record must provide.

        Returns:
            A list of the missing keys, in the order of ``required_keys``.
        """
        return [key for key in required_keys if dictionary.get(key) is None]

    def log_missing_keys_to_csv(self, csv_writer, type, case_id, post_index, photo_index, missing_keys):
        """Write one row describing a record with missing keys.

        Args:
            csv_writer: A ``csv.writer``-like object.
            type: Record level, e.g. 'Case', 'Post' or 'Photo'.
            case_id: Identifier of the case the record belongs to.
            post_index: Index of the post, or 'N/A'.
            photo_index: Index of the photo, or 'N/A'.
            missing_keys: Iterable of missing key names; joined with ', '.
        """
        csv_writer.writerow([type, case_id, post_index, photo_index, ', '.join(missing_keys)])

    def run(self):
        """Process every case in the JSON file and report progress.

        If the output directory already exists the user must confirm with
        'y'; existing files are not removed, files with the same name are
        overwritten.
        """
        if os.path.exists(self.base_folder):
            overwrite = input("Output directory already exists. Press 'y' to overwrite: ")
            if overwrite.strip().lower() != 'y':
                print("Operation aborted by the user.")
                return

        # makedirs also creates missing parent directories.
        os.makedirs(self.base_folder, exist_ok=True)

        with open(self.json_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        csv_path = os.path.join(self.base_folder, 'missing_info.csv')
        with open(csv_path, 'w', newline='', encoding="utf-8") as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(['Type', 'Case ID', 'Post Index', 'Photo Index', 'Missing Keys'])

            # The context manager closes the bar even if a case raises.
            with tqdm(total=len(data), desc="Processing cases") as progress_bar:
                for case in data:
                    self._process_case(case, csv_writer)
                    progress_bar.update(1)

        print("Completed!")

    def _process_case(self, case, csv_writer):
        """Validate one case, create its folder and process each of its posts."""
        missing_case_keys = self.check_missing_keys(case, self.REQUIRED_CASE_KEYS)
        if missing_case_keys:
            case_id = case.get('id')
            self.log_missing_keys_to_csv(
                csv_writer, 'Case', 'Unknown' if case_id is None else case_id,
                'N/A', 'N/A', missing_case_keys)
            return

        # str() keeps os.path.join working when the JSON stores numeric values.
        case_id_folder = os.path.join(self.base_folder, str(case["case_status"]), str(case["id"]))
        os.makedirs(case_id_folder, exist_ok=True)

        for post_index, post in enumerate(case["posts"]):
            missing_post_keys = self.check_missing_keys(post, self.REQUIRED_POST_KEYS)
            if missing_post_keys:
                self.log_missing_keys_to_csv(csv_writer, 'Post', case['id'], post_index, 'N/A', missing_post_keys)
                continue
            self.process_post(post, case_id_folder, case['id'], post_index, csv_writer)

    def process_post(self, post, case_id_folder, case_id, post_index, csv_writer):
        """Create the four image folders of a case and process the post's photos.

        Args:
            post: Post dictionary containing a "photos" list.
            case_id_folder: Folder of the case the post belongs to.
            case_id: Identifier of the case (used in file names and the log).
            post_index: Index of the post within the case.
            csv_writer: Writer used to log photos with missing keys.
        """
        photos_folder = os.path.join(case_id_folder, "photos")
        enhanced_folder = os.path.join(case_id_folder, "enhanced")
        photos_boxed_folder = os.path.join(case_id_folder, "photos_boxed")
        enhanced_boxed_folder = os.path.join(case_id_folder, "enhanced_boxed")
        for folder in (photos_folder, enhanced_folder, photos_boxed_folder, enhanced_boxed_folder):
            os.makedirs(folder, exist_ok=True)

        for photo_index, photo in enumerate(post["photos"]):
            missing_photo_keys = self.check_missing_keys(photo, self.REQUIRED_PHOTO_KEYS)
            if missing_photo_keys:
                self.log_missing_keys_to_csv(csv_writer, 'Photo', case_id, post_index, photo_index, missing_photo_keys)
                continue
            self.process_photo(photo, photos_folder, enhanced_folder, photos_boxed_folder,
                               enhanced_boxed_folder, case_id, post_index, photo_index)

    @staticmethod
    def _photo_filename(case_id, post_index, photo_index, boxed=False):
        """Return the file name of an original photo (or its boxed copy).

        The post index is part of the name because all posts of a case share
        the same folders; without it, photos of later posts would overwrite
        those of earlier ones.
        """
        label = "photos_boxed" if boxed else "photo"
        return f"{case_id}_post_{post_index}_{label}_{photo_index}.jpg"

    @staticmethod
    def _enhanced_filename(case_id, post_index, photo_index, enhanced_index, boxed=False):
        """Return the file name of an enhanced face image (or its boxed copy).

        Post and photo indices are both included so that enhanced faces of
        different photos never collide inside the shared folder.
        """
        label = "enhanced_boxed" if boxed else "enhanced"
        return f"{case_id}_post_{post_index}_photo_{photo_index}_{label}_{enhanced_index}.png"

    @staticmethod
    def _parse_literal(value):
        """Turn a face-box string from the JSON data into Python objects.

        NOTE(security): the original code passed these strings to ``eval``,
        which would execute arbitrary code embedded in the JSON file.
        ``ast.literal_eval`` only accepts plain literals (dicts, lists,
        numbers, strings, ...) and raises ``ValueError``/``SyntaxError``
        otherwise. Values that are not strings are returned unchanged.
        """
        if isinstance(value, str):
            return ast.literal_eval(value)
        return value

    def process_photo(self, photo, photos_folder, enhanced_folder, photos_boxed_folder, enhanced_boxed_folder, case_id, post_index, photo_index):
        """Download a photo and its enhanced faces, then create the boxed copies.

        Args:
            photo: Photo dictionary with "preview_url", "face_boxes" and
                "enhanced_faces" (each enhanced face needs "signed_url").
            photos_folder: Destination folder for the original photo.
            enhanced_folder: Destination folder for the enhanced faces.
            photos_boxed_folder: Destination folder for the boxed photo.
            enhanced_boxed_folder: Destination folder for boxed enhanced faces.
            case_id: Identifier of the case.
            post_index: Index of the post within the case.
            photo_index: Index of the photo within the post.
        """
        photo_path = os.path.join(photos_folder, self._photo_filename(case_id, post_index, photo_index))
        self.download_image(photo["preview_url"], photo_path)

        for enhanced_index, enhanced in enumerate(photo["enhanced_faces"]):
            enhanced_path = os.path.join(
                enhanced_folder,
                self._enhanced_filename(case_id, post_index, photo_index, enhanced_index))
            self.download_image(enhanced["signed_url"], enhanced_path)

        self.create_boxed_images(photo, photo_path, enhanced_folder, photos_boxed_folder,
                                 enhanced_boxed_folder, case_id, post_index, photo_index)

    def create_boxed_images(self, photo, photo_path, enhanced_folder, photos_boxed_folder, enhanced_boxed_folder, case_id, post_index, photo_index):
        """Draw red face rectangles on the downloaded images and save the copies.

        Every box in ``photo["face_boxes"]`` is drawn on the original photo;
        each enhanced face gets its own ``"original_face_box"`` drawn on it.
        Boxes are dictionaries with the keys "x1", "y1", "x2" and "y2".

        Args:
            photo: Photo dictionary (see ``process_photo``).
            photo_path: Path of the already downloaded original photo.
            enhanced_folder: Folder holding the downloaded enhanced faces.
            photos_boxed_folder: Destination folder for the boxed photo.
            enhanced_boxed_folder: Destination folder for boxed enhanced faces.
            case_id: Identifier of the case.
            post_index: Index of the post within the case.
            photo_index: Index of the photo within the post.
        """
        # Opening through ``with`` releases the file handle after saving.
        with Image.open(photo_path) as img:
            draw = ImageDraw.Draw(img)
            for face_box in self._parse_literal(photo["face_boxes"]):
                draw.rectangle([face_box["x1"], face_box["y1"], face_box["x2"], face_box["y2"]], outline="red")
            img.save(os.path.join(
                photos_boxed_folder,
                self._photo_filename(case_id, post_index, photo_index, boxed=True)))

        for enhanced_index, enhanced in enumerate(photo["enhanced_faces"]):
            source_path = os.path.join(
                enhanced_folder,
                self._enhanced_filename(case_id, post_index, photo_index, enhanced_index))
            with Image.open(source_path) as img:
                draw = ImageDraw.Draw(img)
                face_box = self._parse_literal(enhanced["original_face_box"])
                draw.rectangle([face_box["x1"], face_box["y1"], face_box["x2"], face_box["y2"]], outline="red")
                img.save(os.path.join(
                    enhanced_boxed_folder,
                    self._enhanced_filename(case_id, post_index, photo_index, enhanced_index, boxed=True)))

# Entry point of the script
if __name__ == "__main__":
    # Both paths are passed explicitly, so the constructor does not fall back
    # to parsing command-line arguments here.
    source_json = 'response_1699422658272.json'
    output_dir = '/Users/saisuryavarshith/Documents/Boston_University/Semester_3/SPARKML/PROJECT/Final_projeect/image_data'
    DataDownloader(json_path=source_json, base_folder=output_dir).run()


To run the script you'll need to follow specific steps and ensure you have the correct input variables.

### Running the Script

1. **Prerequisites**:
   - Ensure Python is installed on your system.
   - Install the required third-party Python libraries, if not already available (for example, `pip install requests Pillow tqdm`):
     - `requests` for HTTP requests.
     - `Pillow` (imported as `PIL`) for image processing.
     - `tqdm` for progress bars.
   - `argparse`, `csv`, `json`, and `os` are part of the Python standard library and need no installation.
   - Ensure you have the JSON data file that the script will process.

2. **Command-Line Arguments**:
   The script accepts two main arguments:
   - `-j` or `--json`: Path to the JSON data file.
   - `-o` or `--output`: Directory where the images and missing data will be saved.

3. **Running the Script**:
   - Open a terminal or command prompt.
   - Navigate to the directory where the script is located.
   - Run the script using Python and provide the necessary arguments. For example:
     ```
     python script_name.py -j path_to_json_file.json -o path_to_output_directory
     ```

### Variables for Input Paths

In the script, these arguments are handled as follows:

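1. `json_path`: This variable stores the path to the JSON data file. It is set from the `json_path` argument of the `DataDownloader` constructor or, when the script falls back to command-line parsing, from the `-j` or `--json` argument.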

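2. `base_folder`: This variable represents the directory where the images and the CSV file for missing data will be saved. It is set from the `base_folder` argument of the `DataDownloader` constructor or, when the script falls back to command-line parsing, from the `-o` or `--output` argument.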

### Additional Notes

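- If either `json_path` or `base_folder` is not passed to the `DataDownloader` constructor, the script falls back to parsing the command-line arguments. It does not prompt for the paths interactively; if a required argument is missing, `argparse` prints a usage message and exits.
- The script checks whether the specified output directory exists. If it does, the user is asked to confirm by pressing `y`; any other answer aborts the run. Existing contents are not deleted beforehand; files with the same names are overwritten.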
- The script then proceeds to process the JSON file, downloading images, checking for missing keys, and organizing the output into the specified directory structure.

### Output: Folder and File Structure

The output is organized in the specified output directory (`image_data` in this example), structured as below:

```
image_data/
│
├── missing_info.csv (Logs missing keys from the JSON data)
│
└── [case_status]/ (Directories named after each case's status)
    │
    └── [case_id]/ (Directories named after each case's ID)
        │
        ├── photos/ (Contains downloaded original photos)
        │   ├── [case_id]_post_[post_index]_photo_0.jpg
        │   ├── [case_id]_post_[post_index]_photo_1.jpg
        │   └── ...
        │
        ├── enhanced/ (Contains downloaded enhanced photos)
        │   ├── [case_id]_post_[post_index]_enhanced_0.png
        │   ├── [case_id]_post_[post_index]_enhanced_1.png
        │   └── ...
        │
        ├── photos_boxed/ (Contains original photos with face boxes drawn)
        │   ├── [case_id]_post_[post_index]_photos_boxed_0.jpg
        │   ├── [case_id]_post_[post_index]_photos_boxed_1.jpg
        │   └── ...
        │
        └── enhanced_boxed/ (Contains enhanced photos with face boxes drawn)
            ├── [case_id]_post_[post_index]_enhanced_boxed_0.png
            ├── [case_id]_post_[post_index]_enhanced_boxed_1.png
            └── ...
```

- `missing_info.csv`: This file logs missing data fields (keys) from the JSON input at each checked level (case, post, photo). Each row records the level, the case ID, the post index, the photo index, and the missing keys.
- Each `case_status` and `case_id` directory contains subdirectories for original photos (`photos`), enhanced photos (`enhanced`), and their respective versions with face boxes drawn (`photos_boxed` and `enhanced_boxed`). The naming convention includes the `case_id`, `post_index`, and the photo index or enhanced photo index for clarity and organization.