In [78]:
import cv2 
import albumentations as A
import sagemaker
import tempfile
import json
import os
import argparse
import copy
import random

In [65]:
import sagemaker
import boto3
# role = sagemaker.get_execution_role()
role = 'arn:aws:iam::527657206104:role/service-role/AmazonSageMaker-ExecutionRole-20230329T093409' #for local mode
from sagemaker.tensorflow import TensorFlowProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role
AWS_REGION = 'us-east-1'
sagemaker_session = sagemaker.Session(boto3.session.Session(region_name=AWS_REGION))

print(sagemaker_session.boto_region_name)
# Initialize the TensorFlowProcessor
data_preprocessor = TensorFlowProcessor(
    framework_version='2.3',
    role=role,
    instance_type='local',
    instance_count=1,
    base_job_name="img-data-preprocessing",
    py_version='py37'
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


us-east-1


In [66]:
%%time
# Run the processing job
data_preprocessor.run(
    code='preprocessing.py',
    source_dir='./',
    inputs=[
        ProcessingInput(
            input_name='manifest',
            source='s3://computer-vision-bootcamp/new_dataset/supermarket-dataset/manifests/output/output.manifest',
            destination='/opt/ml/processing/input/manifest'
        ),
        ProcessingInput(
            input_name='images',
            source=f"s3://computer-vision-bootcamp/new_dataset/",
            destination='/opt/ml/processing/input/images'
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name='manifests',
            source='/opt/ml/processing/output/manifests',
            destination=f's3://computer-vision-bootcamp/prepared_data/manifests/'
        ),
        ProcessingOutput(
            output_name='augmented_train_images',
            source='/opt/ml/processing/output/augmented_train_images',
            destination=f's3://computer-vision-bootcamp/prepared_data/train/images/'
        ),
        ProcessingOutput(
            output_name='augmented_validation_images',
            source='/opt/ml/processing/output/augmented_validation_images',
            destination=f's3://computer-vision-bootcamp/prepared_data/validation/images/'
        ),
        ProcessingOutput(
            output_name='augmented_test_images',
            source='/opt/ml/processing/output/augmented_test_images',
            destination=f's3://computer-vision-bootcamp/prepared_data/test/images/'
        )
    ],
    arguments=[
        "--num_augmentations_per_img", str(5),
        "--output_s3_bucket_name", 'computer-vision-bootcamp'
    ]
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.processing:Uploaded ./ to s3://sagemaker-us-east-1-527657206104/img-data-preprocessing-2023-05-02-13-23-42-793/source/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-527657206104/img-data-preprocessing-2023-05-02-13-23-42-793/source/runproc.sh
INFO:sagemaker:Creating processing-job with name img-data-preprocessing-2023-05-02-13-23-42-793
INFO:sagemaker.local.local_session:Starting processing job


KeyboardInterrupt: 

In [112]:
import os
print(data)
bbox, classes = extract_bboxes_and_class_ids(data["annotations"])
print(bbox)
img_name = data['image_path'].split('/')[-1]
img_name = base_img_path + img_name
print(img_name)
img = cv2.imread(img_name)
visualize(img, bbox, classes,mapper,True,'test3.jpg' )

[{'image_path': 's3://computer-vision-bootcamp/new_dataset/10.jpg', 'annotation': [{'top': 1045, 'left': 975, 'height': 402, 'width': 685, 'class_id': 3}]}, {'image_path': 's3://computer-vision-bootcamp/new_dataset/11.jpg', 'annotation': [{'top': 825, 'left': 1194, 'height': 601, 'width': 278, 'class_id': 3}]}, {'image_path': 's3://computer-vision-bootcamp/new_dataset/12.jpg', 'annotation': [{'top': 820, 'left': 917, 'height': 580, 'width': 680, 'class_id': 4}]}, {'image_path': 's3://computer-vision-bootcamp/new_dataset/13.jpg', 'annotation': [{'top': 841, 'left': 927, 'height': 559, 'width': 681, 'class_id': 4}]}, {'image_path': 's3://computer-vision-bootcamp/new_dataset/14.jpg', 'annotation': [{'top': 778, 'left': 1053, 'height': 732, 'width': 560, 'class_id': 4}]}, {'image_path': 's3://computer-vision-bootcamp/new_dataset/15.jpg', 'annotation': [{'top': 653, 'left': 1147, 'height': 888, 'width': 403, 'class_id': 4}]}, {'image_path': 's3://computer-vision-bootcamp/new_dataset/16.jpg'

TypeError: list indices must be integers or slices, not str

In [113]:

def extract_bboxes_and_class_ids(ground_truth_annotations):
    '''
    Description : Because of albumation we need to get bboxes and class_ids in seperatelists
    as albumation need data in that format
    
    inputs : dict contains bbox and class_ids
    
    output : - list of lists that contains bbox
             - list of class_ids   
    '''
    bboxes = []
    class_ids = []
    for gt_bbox in ground_truth_annotations:
        xmin = gt_bbox["left"]
        ymin = gt_bbox["top"]
        width = gt_bbox["width"]
        height = gt_bbox["height"]
        bboxes.append([xmin, ymin, width, height])
        class_ids.append(gt_bbox["class_id"])
    return bboxes, class_ids



def read_json_file(path) : 
    '''
    description : function takes path to json file and return data as list 
    inputs : path as (str) 
    outputs : List that contains data
    
    '''
    manifest_file = []
    with open(path) as f :
        for line in f :
            manifest_file.append(json.loads(line))
    return manifest_file




def extract_data_from_ground_truth_json(path):
    '''
        discription : This function take as input path to json file  
        input : path to json file
        output:
            -> list that contains dictionary of 
                                                image_path:(string) & annotaions(list)
            -> sorted dictionary of label mapped to names
    '''
    manifest_file = read_json_file(path)
    extracted_data = []
    class_id_to_name_map = {}
    # extract data from json file 
    for i in manifest_file :
        data = {}
        data['image_path']=i['source-ref']
        annotations =[]
# note that next time remmeber not to agnore label job name and incluse it 
        for x in i['supermarket-dataset']['annotations'] :
            annotation = {}
            annotation['top'] = x['top']
            annotation['left'] = x['left']
            annotation['height'] = x['height']
            annotation['width'] = x['width']
            annotation['class_id'] = x['class_id']

            annotations.append(annotation)
        data['annotation'] = annotations
        extracted_data.append(data)
        
        class_id_to_map = {int(k): v for k, v in i['supermarket-dataset-metadata']['class-map'].items()}
        class_id_to_name_map.update(class_id_to_map)
    return extracted_data, dict(sorted(class_id_to_name_map.items()))


In [169]:
base_dir = '/home/hekal/Downloads/computer_vision_at_the_edge_labs/manuf1-manufacturing-production-line-defect-detection/03-data-preparation/test-dir/train/'
file_path = base_dir + 'annotations.json'
def extract_bboxes_and_class_ids_coco(ground_truth_annotations):
    '''
    Description : Because of albumation we need to get bboxes and class_ids in seperatelists
    as albumation need data in that format
    
    inputs : dict contains bbox and class_ids
    
    output : - list of lists that contains bbox
             - list of class_ids   
    '''
    idxx = ground_truth_annotations['image_id']
    bboxes = []
    class_ids = []
    for gt_bbox in ground_truth_annotations:
        print(ground_truth_annotations)
#         break
#         print(gt_bbox)
#         xmin = gt_bbox[0]
#         ymin = gt_bbox[1]
#         width = gt_bbox[2]
#         height = gt_bbox[3]
#         bboxes.append([xmin, ymin, width, height])
#         class_ids.append(gt_bbox["class_id"])
    return bboxes, class_ids

def convert_bbox_from_gt_to_coco(gt_annotation, img_width, img_height):
    xmin = gt_annotation["left"] / img_width
    ymin = gt_annotation["top"] / img_height
    xmax = (gt_annotation["left"] + gt_annotation["width"]) / img_width
    ymax = (gt_annotation["top"] + gt_annotation["height"]) / img_height
    bbox = [xmin, ymin, xmax, ymax]
    return bbox


def extract_data_from_coco(path, image_number):
    data = read_json_file(path)
    
    idx, file_name, height, width = data[0]['images'][image_number]
    w = data[0]['annotations'][image_number]
    print(data[0]['annotations'][image_number])
    extract_bboxes_and_class_ids_coco(w)
    

extract_data_from_coco(file_path,55)
    


{'image_id': 28, 'category_id': 2, 'bbox': [0.26273148148148145, 0.44598765432098764, 0.4930555555555556, 0.7391975308641975]}
{'image_id': 28, 'category_id': 2, 'bbox': [0.26273148148148145, 0.44598765432098764, 0.4930555555555556, 0.7391975308641975]}
{'image_id': 28, 'category_id': 2, 'bbox': [0.26273148148148145, 0.44598765432098764, 0.4930555555555556, 0.7391975308641975]}
{'image_id': 28, 'category_id': 2, 'bbox': [0.26273148148148145, 0.44598765432098764, 0.4930555555555556, 0.7391975308641975]}


In [127]:
for i in data[50:] :
    print(i)
    image_name = i['image_path'].split('/')[-1]
    image_name = base_dir + image_name
    anno = i['annotation']
    bboxes, class_ids = extract_bboxes_and_class_ids(anno)
    
    image = cv2.imread(image_name)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(image)
    visualize()
    print(bboxes)
    break

In [108]:
!python preprocessing-Copy1.py --num_augmentations_per_img 10


The number of samples in input data is 49
Peek into a sample of input data
{'image_path': 's3://computer-vision-bootcamp/new_dataset/10.jpg', 'annotation': [{'top': 1045, 'left': 975, 'height': 402, 'width': 685, 'class_id': 3}]}
******************************
Number of samples in train-data : 540
Number of samples in validation-data : 3
Finished data prepration job successfully.


In [171]:
import os
import pathlib

# Clone the tensorflow models repository if it doesn't already exist
if "models" in pathlib.Path.cwd().parts:
    while "models" in pathlib.Path.cwd().parts:
        os.chdir('..')
elif not pathlib.Path('models').exists():
    !git clone --depth 1 https://github.com/tensorflow/models
# %%bash
# # Install the Object Detection API
# cd models/research/
# protoc object_detection/protos/*.proto --python_out=.
# cp object_detection/packages/tf2/setup.py .
# python -m pip install --root-user-action=ignore  --no-warn-conflicts -q .

Cloning into 'models'...
remote: Enumerating objects: 3825, done.[K
remote: Counting objects: 100% (3825/3825), done.[K
remote: Compressing objects: 100% (2926/2926), done.[K
remote: Total 3825 (delta 1109), reused 1947 (delta 849), pack-reused 0[K
Receiving objects: 100% (3825/3825), 49.56 MiB | 2.03 MiB/s, done.
Resolving deltas: 100% (1109/1109), done.
