commit final code with documentation

Co-authored-by: Varshith Nukala <saisurya@bu.edu>
BU-Spark · Dec 15, 2023 · a31db17 · a31db17
1 parent f94e9e3
commit a31db17
Show file tree

Hide file tree

Showing 81 changed files with 304,425 additions and 2,745 deletions.
diff --git a/fall23/data_downloader.ipynb b/fall23/data_downloader.ipynb
diff --git a/fall23/data_explore/dataexplore.ipynb b/fall23/data_explore/dataexplore.ipynb
diff --git a/fall23/data_organizer.ipynb b/fall23/data_organizer.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70fc2d59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import shutil\n",
+    "import argparse"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "85a4cd84",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Output directory '/Users/saisuryavarshith/Documents/Boston_University/Semester_3/SPARKML/PROJECT/Final_projeect/extracted_images' already exists. Overwrite? (y/n): y\n",
+      "Extraction complete.\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "class DataOrganizer:\n",
+    "    def __init__(self, input_folder, output_folder, folders_to_extract):\n",
+    "        # Initialize DataOrganizer with the specified input and output folders and the folders to be extracted\n",
+    "        self.input_folder = input_folder\n",
+    "        self.output_folder = output_folder\n",
+    "        self.folders_to_extract = folders_to_extract\n",
+    "\n",
+    "    def run(self):\n",
+    "        # Check if the output directory already exists, and prompt for overwrite if necessary\n",
+    "        if os.path.exists(self.output_folder):\n",
+    "            overwrite = input(f\"Output directory '{self.output_folder}' already exists. Overwrite? (y/n): \")\n",
+    "            if overwrite.lower() != 'y':\n",
+    "                print(\"Operation cancelled.\")\n",
+    "                return\n",
+    "        else:\n",
+    "            os.makedirs(self.output_folder, exist_ok=True)\n",
+    "\n",
+    "        # Iterate over each case status folder in the input directory\n",
+    "        for case_status in os.listdir(self.input_folder):\n",
+    "            case_status_path = os.path.join(self.input_folder, case_status)\n",
+    "            if not os.path.isdir(case_status_path):\n",
+    "                continue\n",
+    "\n",
+    "            # Iterate over each case ID within the case status folder\n",
+    "            for case_id in os.listdir(case_status_path):\n",
+    "                case_id_path = os.path.join(case_status_path, case_id)\n",
+    "                if not os.path.isdir(case_id_path):\n",
+    "                    continue\n",
+    "\n",
+    "                # Iterate over each post folder within the case ID folder\n",
+    "                for post_folder_name in os.listdir(case_id_path):\n",
+    "                    post_path = os.path.join(case_id_path, post_folder_name)\n",
+    "                    if not os.path.isdir(post_path):\n",
+    "                        continue\n",
+    "\n",
+    "                    # Process each post folder\n",
+    "                    self.process_post_folder(post_path, case_status)\n",
+    "\n",
+    "        print(\"Completed Organizing Data!\")\n",
+    "\n",
+    "    def process_post_folder(self, post_path, case_status):\n",
+    "        # Process each specified image type within the post folder\n",
+    "        for image_type in self.folders_to_extract:\n",
+    "            image_type_path = os.path.join(post_path, image_type)\n",
+    "            if os.path.isdir(image_type_path):\n",
+    "                output_type_path = os.path.join(self.output_folder, case_status, image_type)\n",
+    "                os.makedirs(output_type_path, exist_ok=True)\n",
+    "\n",
+    "                # Copy each file from the source to the destination folder\n",
+    "                for filename in os.listdir(image_type_path):\n",
+    "                    source_file = os.path.join(image_type_path, filename)\n",
+    "                    if os.path.isfile(source_file):\n",
+    "                        destination_file = os.path.join(output_type_path, filename)\n",
+    "                        shutil.copyfile(source_file, destination_file)\n",
+    "\n",
+    "def parse_args():\n",
+    "    # Parse command-line arguments for the script\n",
+    "    parser = argparse.ArgumentParser(description=\"Extract specific folders from image data.\")\n",
+    "    parser.add_argument(\"-i\", \"--input\", required=True, help=\"Input folder containing image data.\")\n",
+    "    parser.add_argument(\"-o\", \"--output\", required=True, help=\"Output folder for extracted images.\")\n",
+    "    parser.add_argument(\"-f\", \"--folders\", nargs=\"+\", required=True, help=\"List of folders to extract, e.g., photos enhanced\")\n",
+    "    return parser.parse_args()\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    # Run the script with specified command-line arguments\n",
+    "    args = parse_args()\n",
+    "    extractor = DataOrganizer(input_folder=args.input, output_folder=args.output, folders_to_extract=args.folders)\n",
+    "    extractor.run()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1985ba46",
+   "metadata": {},
+   "source": [
+    "### Inputs of the DataOrganizer Script\n",
+    "\n",
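+    "1. **Input Folder**: This is the directory that contains the image data produced by `DataDownloader`, for example a folder named 'image_data'. The script expects it to be laid out as `<case status>/<case id>/<post folder>/<image type>/`; entries at any of these levels that are not directories are skipped.\n",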
+    "\n",
+    "2. **Output Folder**: The directory where the organized images will be stored.\n",
+    "\n",
+    "3. **Folders to Extract**: A list of specific folder names within each post folder that the script will process and organize. An example could be `['enhanced']`, indicating that only folders named 'enhanced' within each post folder are to be extracted and organized.\n",
+    "\n",
+    "### Explanation\n",
+    "\n",
+    "- The `DataOrganizer` script takes the output of `DataDownloader` as its input and processes only the image-type folders named in the folders-to-extract list (e.g., 'enhanced').\n",
+    "- The script copies these images into a new directory structure in the output folder, categorizing them by case status and the type of image (e.g., 'enhanced'). The original files in the input folder are not moved or deleted. This makes it easier to access and manage specific types of images for further use or analysis."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "614c4d17",
+   "metadata": {},
+   "source": [
+    "```\n",
+    "[Output Structure]\n",
+    "        ├── Output Folder (e.g., '/.../extracted_images')\n",
+    "        │\n",
+    "        └───[Organized by Case Status]\n",
+    "            ├── [Case Status] (e.g., 'Missing')\n",
+    "            │   └── [Image Type] (e.g., 'enhanced')\n",
+    "            │       ├── [case_id]_post_[post_index]_enhanced_0.png\n",
+    "            │       ├── [case_id]_post_[post_index]_enhanced_1.png\n",
+    "            │       └── ...\n",
+    "            │\n",
+    "            │\n",
+    "            └── ... (Other case statuses)\n",
+    "                └── [Image Type] (e.g., 'enhanced')\n",
+    "                    ├── [case_id]_post_[post_index]_enhanced_0.png\n",
+    "                    ├── [case_id]_post_[post_index]_enhanced_1.png\n",
+    "                    └── ...\n",
+    "\n",
+    "\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6381ae29",
+   "metadata": {},
+   "source": [
+    "### Command-Line Arguments\n",
+    "\n",
+    "1. **Input Folder (`-i` or `--input`)**:\n",
+    "   - **Purpose**: Specifies the path to the input directory that contains the image data outputted by the `DataDownloader` script.\n",
+    "   - **Example Usage**: `-i image_data` or `--input image_data`\n",
+    "   - **Explanation**: This argument allows the user to define where the script should look for the data to be organized.\n",
+    "\n",
+    "2. **Output Folder (`-o` or `--output`)**:\n",
+    "   - **Purpose**: Defines the path to the output directory where the organized images will be stored.\n",
+    "   - **Example Usage**: `-o /path/to/extracted_images` or `--output /path/to/extracted_images`\n",
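+    "   - **Explanation**: This parameter determines the destination folder for the reorganized images. If the specified directory does not exist, the script will create it. If it already exists, the script asks for confirmation before continuing; answering `y` does not clear the directory, it only allows files to be copied into it, replacing any existing files that have the same name.\n",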
+    "\n",
+    "3. **Folders to Extract (`-f` or `--folders`)**:\n",
+    "   - **Purpose**: Indicates which subfolders from each post folder should be extracted and organized.\n",
+    "   - **Example Usage**: `-f enhanced photos` or `--folders enhanced photos`\n",
+    "   - **Explanation**: This argument is used to specify a list of folder names. The script will only process and organize the folders with these names from within each post folder.\n",
+    "\n",
+    "### Usage Example\n",
+    "\n",
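+    "To run the `DataOrganizer` script with specific arguments, first export this notebook to a script (for example with `jupyter nbconvert --to script data_organizer.ipynb`, which writes `data_organizer.py`). Then open a terminal or command prompt, navigate to the directory where the script is located, and enter a command like the following:\n",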
+    "\n",
+    "```bash\n",
+    "python data_organizer.py -i image_data -o /path/to/extracted_images -f enhanced\n",
+    "```\n",
+    "\n",
+    "In this example, `data_organizer.py` is the name of the script, `image_data` is the directory containing the data from `DataDownloader`, `/path/to/extracted_images` is the target directory for the organized images, and `enhanced` is the type of images to be extracted and organized."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c4a68304",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/fall23/experiment_results_analysis/aggregated_data_cosine.csv b/fall23/experiment_results_analysis/aggregated_data_cosine.csv
diff --git a/fall23/experiment_results_analysis/aggregated_data_euclideanl2.csv b/fall23/experiment_results_analysis/aggregated_data_euclideanl2.csv