In [None]:
import os
import shutil
import argparse

In [1]:

class DataOrganizer:
    """Gather selected image folders from a nested case tree into one output tree.

    The input directory is walked three levels deep
    (``<case status>/<case id>/<post folder>``). Inside every post folder,
    each sub-folder whose name is listed in ``folders_to_extract`` has its
    files copied to ``<output_folder>/<case status>/<folder name>/``.
    """

    def __init__(self, input_folder, output_folder, folders_to_extract):
        """Store the configuration; no filesystem access happens here.

        Args:
            input_folder: Root directory holding the case-status folders.
            output_folder: Directory that receives the copied files.
            folders_to_extract: Iterable of sub-folder names to copy from
                each post folder.
        """
        self.input_folder = input_folder
        self.output_folder = output_folder
        self.folders_to_extract = folders_to_extract

    @staticmethod
    def _iter_subdirs(parent):
        """Yield ``(name, path)`` for every directory directly inside *parent*."""
        for name in os.listdir(parent):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                yield name, path

    def run(self):
        """Walk the input tree and copy every requested folder.

        If the output directory already exists the user is asked on stdin
        whether to continue; any answer other than ``y``/``Y`` cancels the
        run. Existing files in the output directory are not removed; files
        with the same name are replaced.

        Raises:
            FileNotFoundError: If ``input_folder`` does not exist.
            NotADirectoryError: If ``input_folder`` exists but is not a
                directory.
        """
        # Validate the input before prompting or creating anything, so that a
        # mistyped input path does not leave an empty output directory behind.
        if not os.path.exists(self.input_folder):
            raise FileNotFoundError(
                f"Input directory '{self.input_folder}' does not exist."
            )
        if not os.path.isdir(self.input_folder):
            raise NotADirectoryError(
                f"Input path '{self.input_folder}' is not a directory."
            )

        # Check if the output directory already exists, and prompt for overwrite if necessary
        if os.path.exists(self.output_folder):
            overwrite = input(f"Output directory '{self.output_folder}' already exists. Overwrite? (y/n): ")
            if overwrite.lower() != 'y':
                print("Operation cancelled.")
                return
        else:
            os.makedirs(self.output_folder, exist_ok=True)

        # case status -> case ID -> post folder; non-directory entries are skipped at every level
        for case_status, case_status_path in self._iter_subdirs(self.input_folder):
            for _case_id, case_id_path in self._iter_subdirs(case_status_path):
                for _post_name, post_path in self._iter_subdirs(case_id_path):
                    self.process_post_folder(post_path, case_status)

        print("Completed Organizing Data!")

    def process_post_folder(self, post_path, case_status):
        """Copy the requested image folders of one post folder.

        Args:
            post_path: Path of the post folder to read from.
            case_status: Name of the case-status folder the post belongs to;
                used as the first level of the output path.
        """
        for image_type in self.folders_to_extract:
            image_type_path = os.path.join(post_path, image_type)
            if not os.path.isdir(image_type_path):
                # This post has no folder of the requested type.
                continue

            output_type_path = os.path.join(self.output_folder, case_status, image_type)
            os.makedirs(output_type_path, exist_ok=True)

            # Only regular files are copied; nested directories are ignored.
            # Files from all cases and posts share one destination folder, so
            # a later file with the same name replaces an earlier one.
            for filename in os.listdir(image_type_path):
                source_file = os.path.join(image_type_path, filename)
                if os.path.isfile(source_file):
                    destination_file = os.path.join(output_type_path, filename)
                    shutil.copyfile(source_file, destination_file)

def parse_args():
    """Parse the script's command-line options.

    All three options are required: ``-i/--input``, ``-o/--output`` and
    ``-f/--folders`` (one or more values).

    Returns:
        The ``argparse.Namespace`` with ``input``, ``output`` and ``folders``.
    """
    cli = argparse.ArgumentParser(description="Extract specific folders from image data.")

    # (short flag, long flag, option-specific keyword arguments)
    option_table = (
        ("-i", "--input", {"help": "Input folder containing image data."}),
        ("-o", "--output", {"help": "Output folder for extracted images."}),
        ("-f", "--folders", {"nargs": "+", "help": "List of folders to extract, e.g., photos enhanced"}),
    )
    for short_flag, long_flag, extra in option_table:
        cli.add_argument(short_flag, long_flag, required=True, **extra)

    return cli.parse_args()

if __name__ == "__main__":
    # Entry point: read the CLI options, configure an organizer with them, and start it.
    args = parse_args()
    extractor = DataOrganizer(
        input_folder=args.input,
        output_folder=args.output,
        folders_to_extract=args.folders,
    )
    extractor.run()


Output directory '/Users/saisuryavarshith/Documents/Boston_University/Semester_3/SPARKML/PROJECT/Final_projeect/extracted_images' already exists. Overwrite? (y/n): y
Extraction complete.


### Inputs of the DataOrganizer Script

1. **Input Folder**: The directory that contains the image data produced by `DataDownloader`, for example a folder named `image_data`. The script expects it to be laid out as `<case status>/<case id>/<post folder>/<image type>/`.

2. **Output Folder**: The directory where the organized images will be stored.

3. **Folders to Extract**: A list of specific folder names within each post folder that the script will process and organize. An example could be `['enhanced']`, indicating that only folders named 'enhanced' within each post folder are to be extracted and organized.

### Explanation

- The `DataOrganizer` script takes the output from `DataDownloader` as its input. It then processes this data based on the specified folders to extract, like 'enhanced'.
- The script reorganizes these images into a new directory structure in the output folder, categorizing them by case status and the type of image (e.g., 'enhanced'). This makes it easier to access and manage specific types of images for further use or analysis.

```
[Output Structure]
        ├── Output Folder (e.g., '/.../extracted_images')
        │
        └───[Organized by Case Status]
            ├── [Case Status] (e.g., 'Missing')
            │   └── [Image Type] (e.g., 'enhanced')
            │       ├── [case_id]_post_[post_index]_enhanced_0.png
            │       ├── [case_id]_post_[post_index]_enhanced_1.png
            │       └── ...
            │
            │
            └── ... (Other case statuses)
                └── [Image Type] (e.g., 'enhanced')
                    ├── [case_id]_post_[post_index]_enhanced_0.png
                    ├── [case_id]_post_[post_index]_enhanced_1.png
                    └── ...


```

### Command-Line Arguments

1. **Input Folder (`-i` or `--input`)**:
   - **Purpose**: Specifies the path to the input directory that contains the image data produced by the `DataDownloader` script.
   - **Example Usage**: `-i image_data` or `--input image_data`
   - **Explanation**: This argument allows the user to define where the script should look for the data to be organized.

2. **Output Folder (`-o` or `--output`)**:
   - **Purpose**: Defines the path to the output directory where the organized images will be stored.
   - **Example Usage**: `-o /path/to/extracted_images` or `--output /path/to/extracted_images`
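   - **Explanation**: This parameter determines the destination folder for the reorganized images. If the specified directory does not exist, the script will create it. If it already exists, the script asks for confirmation before continuing; existing files are not deleted, but files with the same name are replaced.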

3. **Folders to Extract (`-f` or `--folders`)**:
   - **Purpose**: Indicates which subfolders from each post folder should be extracted and organized.
   - **Example Usage**: `-f enhanced photos` or `--folders enhanced photos`
   - **Explanation**: This argument is used to specify a list of folder names. The script will only process and organize the folders with these names from within each post folder.

### Usage Example

To run the `DataOrganizer` script with specific arguments, you would open a terminal or command prompt, navigate to the directory where the script is located, and enter a command like the following:

```bash
python data_organizer.py -i image_data -o /path/to/extracted_images -f enhanced
```

In this example, `data_organizer.py` is the name of the script, `image_data` is the directory containing the data from `DataDownloader`, `/path/to/extracted_images` is the target directory for the organized images, and `enhanced` is the type of images to be extracted and organized.