# Project Setup
*   Clone the repo that contains all necessary files
*   Install and set up all necessary packages
*   Import all required libraries
*   Import Dataset (from Roboflow)








In [None]:
# Team's git repo with all files
# The default-branch clone below is disabled; the stage2 branch is the one
# that is actually cloned.
# !git clone --quiet https://github.com/AnvitSinha/CS473_ERD_Clustering.git
!git clone --quiet -b stage2 https://github.com/AnvitSinha/CS473_ERD_Clustering.git

# move all files to content/
# (the `*` glob skips hidden entries such as .git, which stay in the clone
# folder and are deleted with it below)
!mv CS473_ERD_Clustering/* /content/

# remove temp folder
!rm -r CS473_ERD_Clustering/

In [None]:
# Setup: install dependencies, fetch YOLOv5 and download the dataset

# necessary installs
!pip --quiet install roboflow
!pip --quiet install pyyaml
!pip --quiet install editdistance
!pip --quiet install easyocr

# set up yolo
# clone the repo, install its requirements from inside the checkout,
# then return to the previous working directory
!git clone --quiet https://github.com/ultralytics/yolov5
%cd yolov5
%pip --quiet install -qr requirements.txt
%cd ..

# necessary imports
import os
from roboflow import Roboflow

# get data from roboflow
# NOTE(review): the API key is hard-coded in the notebook; consider loading it
# from an environment variable or a secret store instead - confirm with the team.
rf = Roboflow(api_key="3kZUcOURZwpHNUXrVRYO")
project = rf.workspace("cs473-proj").project("cs473-proj1")
# download version 5 of the project in the "yolov5" export format
# NOTE(review): YAML_PATH in the path-configuration cell presumably points into
# the folder this download creates - verify.
dataset = project.version(5).download("yolov5")

# Directory Structure Setup
*   Unzip all required files to their relevant directories
*   This step assumes that pwd=/content and all zip files are present in the pwd.



# Configure Paths
*   Set global paths that will be utilized throughout the program
*   All of these can be changed, but are set to work directly out of the box if the README instructions are followed



In [None]:
# Path configuration - every value below may be edited to suit the environment

# --- inputs ---------------------------------------------------------------
# grading images, laid out as described in Campuswire #308
GRADING_DIR = "/content/grading/"
# location of the yolov5 checkout
YOLO_PATH = "/content/yolov5/"
# YAML file that supplies the class names
YAML_PATH = "/content/CS473-Proj1-5/data.yaml"

# --- model ----------------------------------------------------------------
# folder holding the model weights, and the weights file inside it
MODEL_DIR = "/content/model_weights"
MODEL_WEIGHTS = os.path.join(MODEL_DIR, "best.pt")

# --- outputs --------------------------------------------------------------
# all processed image information is saved here
PROCESSED_DIR = "/content/processed/"
# final OCR results
RESULT_DIR = "/content/ocr_results"
# edit distance results
EDIST_PATH = "/content/edit_dist"
# txt files that make up the submission
SUBMIT_PATH = "/content/submit"

# --- tuning ---------------------------------------------------------------
# threshold used by the edit distance step
EDIT_THRESHOLD = 0


In [None]:
# unzipping files and creating the output directories used by later steps

# one drive link to weights, active until 12/22/2023
# https://purdue0-my.sharepoint.com/:f:/g/personal/sinha102_purdue_edu/EjeivgmR47xGvWgRJ0X_XF8BPwuSHlF2Gks0xfilKclXaw?e=t1OR5f

# extract weights assuming the zip file is present in /content and pwd=/content
# (extracted into a scratch folder first, see the move below)
!unzip -q best_weights_ver5.zip -d model_weights_tmp

# make directory for weights
# the archive keeps the weights under their original training path
# (content/yolov5/runs/train/step1_train/weights), so lift them into MODEL_DIR
!mkdir {MODEL_DIR}
!mv /content/model_weights_tmp/content/yolov5/runs/train/step1_train/weights/* {MODEL_DIR}

# remove tmp dir
!rm -r model_weights_tmp/

# unzip grading zip file
# NOTE(review): presumably this archive expands to GRADING_DIR - verify
!unzip -q grading.zip

# create result directory
!mkdir {RESULT_DIR}

# create edit distance directory
!mkdir {EDIST_PATH}

# create submission directory
!mkdir {SUBMIT_PATH}

# Step 1: Object Detection

*   Use a YoloV5 model trained on Collection 1 to detect ERD objects present in all images in GRADING_DIR.
*   Crop images and place them in a folder for each sample within PROCESSED_DIR, split by the type of ERD object it is.
*   Utilizes the object_detection.py script written by the group.



In [None]:
# Object Detection Step

def process_sample(sample_dir: str):
    """Takes as input the path to a directory and processes it based
        on the specifications provided in the assignment"""

    for curr_file in os.listdir(sample_dir):

        print(f"Processing {curr_file=}")

        if not curr_file.endswith('.png'):
            continue

        !python object_detection.py --weights={MODEL_WEIGHTS} \
        --img_src={os.path.join(sample_dir, curr_file)} \
        --yolo_path={YOLO_PATH} \
        --save_dir={os.path.join(PROCESSED_DIR, sample_dir.split("dataset", 1)[1])} \
        --name={curr_file.rstrip(".png")} \
        --yaml_src={YAML_PATH}

        print(f"processed {curr_file}")



In [None]:
# Run the detection step on every entry found in GRADING_DIR

for sample_name in os.listdir(GRADING_DIR):

    sample_path = os.path.join(GRADING_DIR, sample_name)

    print(f"Processing {sample_name}")
    process_sample(sample_path)


# Step 2: OCR

*   Use the EasyOCR python package to perform OCR on each sample that has been detected and sorted based on ERD objects.
*   Results are stored in RESULT_DIR under a file following the naming convention SAMPLE.txt, where SAMPLE is the name of the original input image.
* Utilizes the text_ocr.py python script written by the group.



In [None]:
# OCR step

def process_img_text(dataset_name: str):

    !mkdir {os.path.join(RESULT_DIR, dataset_name)}

    for curr_image in os.listdir(os.path.join(PROCESSED_DIR, dataset_name)):

      # set working directory
      WORKING_DIR = os.path.join(PROCESSED_DIR, dataset_name, curr_image, "cropped")

      # iterate all objects in that image
      for object_det in os.listdir(WORKING_DIR):

          # iterate through all cropped images of that object
          for cropped in os.listdir(os.path.join(WORKING_DIR, object_det)):
              # print(os.path.join(RESULT_DIR, dataset_name, curr_image + '.txt'))
              !python text_ocr.py \
              --img_path={os.path.join(WORKING_DIR, object_det, cropped)} \
              --save_dir={os.path.join(RESULT_DIR, dataset_name, curr_image + '.txt')} \
              --object_type={object_det} \
              --include_entity={False}

          print(f"Processed {object_det} for img {curr_image}")

In [None]:
# Run the OCR step for every dataset folder found in PROCESSED_DIR

for dataset_dir in os.listdir(PROCESSED_DIR):
    process_img_text(dataset_dir)

# Step 3: Edit Distance

*   Determines the edit distance between the words detected from the images and the vocabulary present in the question, and decides whether words need to be modified.
* The decision to modify or not depends on the threshold set.
* Saves the modified files in SUBMIT_PATH.
*   Utilizes the edit_distance.py script written by the team.



In [None]:
# Edit distance

for dataset in os.listdir(RESULT_DIR):

    !mkdir {os.path.join(EDIST_PATH, dataset)}

    for res in os.listdir(os.path.join(RESULT_DIR, dataset)):

        !python edit_distance.py \
        --objects_file={os.path.join(RESULT_DIR, dataset, res)} \
        --question_path={os.path.join(GRADING_DIR, "dataset" + dataset, "question.txt")} \
        --output_file={os.path.join(EDIST_PATH, dataset, res)} \
        --threshold={EDIT_THRESHOLD}

# Run Module 4

*   Run Module 4 to use the baseline clustering method to cluster all entries

In [None]:
# Baseline clustering

def cluster_dataset(dataset: str, k: int):

    !python base_line_clustering.py \
    --dataset_dir={os.path.join(EDIST_PATH, dataset)} \
    --output_file={os.path.join(SUBMIT_PATH, f"base_line_clusters_{dataset}.txt")} \
    --num_clusters={k}


In [None]:
# Run for all datasets

def _ask_cluster_count(prompt: str) -> int:
    """Prompt repeatedly until a whole number >= 1 is entered, then return it.

    A typo no longer raises ValueError and aborts the run part-way through the
    datasets; the user is simply asked again.
    """
    while True:
        raw = input(prompt)
        try:
            count = int(raw)
        except ValueError:
            print(f"'{raw}' is not a whole number, please try again.")
            continue
        if count >= 1:
            return count
        print("The number of clusters must be at least 1, please try again.")


# The loop variable is called dataset_name (not dataset) so that the Roboflow
# `dataset` object created in the setup cell is not overwritten.
for dataset_name in os.listdir(EDIST_PATH):

    # get the expected number of clusters
    k = _ask_cluster_count(f"Number of expected clusters for dataset{dataset_name}: ")

    # cluster the images
    cluster_dataset(dataset_name, k)

# Store Results

*   Zips the final text files in a file with the team name.



In [None]:
# zip all results into a file with the team name for grading
# run zip from inside SUBMIT_PATH so the archive entries carry no directory prefix
%cd {SUBMIT_PATH}
!zip /content/hintonians.zip *
# go back to /content, the working directory the other cells assume
%cd /content