# 1. Managing the directories and paths

In [None]:
!pip install --upgrade fiftyone

In [None]:
# If you run into trouble here, restart the kernel (or reboot the machine) and try again.
import fiftyone as fo
from fiftyone import ViewField as F
from paths import * #paths if our file paths.py
import pandas as pd
import cv2

In [None]:
# Create every directory listed in `paths`. os.makedirs builds missing parent
# directories too, works on any OS, and copes with spaces in the path;
# exist_ok=True makes the cell safe to re-run.
for path in paths.values():
    os.makedirs(path, exist_ok=True)

In [None]:
# Repeats the directory-creation step; harmless because exist_ok=True turns
# it into a no-op for directories that already exist.
for path in paths.values():
    os.makedirs(path, exist_ok=True)

In [None]:
# Class names used both to restrict the zoo download and to filter the
# annotation tables further down.
CLASSES = [
    "Dog",
    "Motorcycle",
    "Van",
    "Bus",
    "Bicycle",
    "Car",
    "Person",
    "Man",
    "Truck",
]

# 2. Download the Open Images data

In [None]:
# "train" split of open-images-v6 from the FiftyOne zoo: detection labels
# only, restricted to CLASSES, stored under paths["IMAGE_PATH"], with
# max_samples set to 15000.
train_dataset = fo.zoo.load_zoo_dataset(
    "open-images-v6",
    split="train",
    label_types=["detections"],
    classes=CLASSES,
    dataset_dir=paths["IMAGE_PATH"],
    max_samples=15000,
)

In [None]:
# "test" split of open-images-v6 from the FiftyOne zoo: detection labels
# only, restricted to CLASSES, stored under paths["IMAGE_PATH"], with
# max_samples set to 300.
test_dataset = fo.zoo.load_zoo_dataset(
    "open-images-v6",
    split="test",
    label_types=["detections"],
    classes=CLASSES,
    dataset_dir=paths["IMAGE_PATH"],
    max_samples=300,
)

In [None]:
# "validation" split of open-images-v6 from the FiftyOne zoo: detection
# labels only, restricted to CLASSES, stored under paths["IMAGE_PATH"], with
# max_samples set to 301.
validation_dataset = fo.zoo.load_zoo_dataset(
    "open-images-v6",
    split="validation",
    label_types=["detections"],
    classes=CLASSES,
    dataset_dir=paths["IMAGE_PATH"],
    max_samples=301,
)

# 3. data pre-processing

In [None]:
# Read the detections table of the train split
# (IMAGE_PATH/train/labels/detections.csv) and report its shape.
training_annotation_loc = os.path.join(
    paths['IMAGE_PATH'], "train", "labels", "detections.csv"
)
training_annotation_file = pd.read_csv(training_annotation_loc)
print(
    "records present for training befor filtering: ",
    training_annotation_file.shape,
)
# Last expression of the cell: preview the first rows.
training_annotation_file.head()

In [None]:
# classes.csv is read without a header row; its two columns are named
# LabelName and ClassName here. Per the original note, it lists the names of
# all 600 classes with their corresponding 'LabelName'.
class_descriptions_file = os.path.join(
    paths['IMAGE_PATH'], "train", "metadata", "classes.csv"
)
class_descriptions = pd.read_csv(
    class_descriptions_file,
    header=None,
    names=["LabelName", "ClassName"],
)
class_descriptions.shape

In [None]:
# Keep the rows of class_descriptions whose ClassName appears in CLASSES;
# this yields the LabelName of each selected class.
my_classes = class_descriptions.loc[
    class_descriptions["ClassName"].isin(CLASSES)
]
my_classes

**3.1 train data**

In [None]:
# Inner join on LabelName: only boxes belonging to one of our classes
# survive, and each surviving row gains its ClassName.
training_filtred_data = pd.merge(
    training_annotation_file,
    my_classes,
    how='inner',
    on="LabelName",
)

In [None]:
# Number of non-null ClassName entries, i.e. boxes kept after filtering.
training_filtred_data.loc[:, "ClassName"].count()

In [None]:
# Number of boxes per class in the filtered training annotations.
training_filtred_data.loc[:, 'ClassName'].value_counts()

In [None]:
# Keep only the box columns and the class name, and append ".jpg" to ImageID
# so it can be matched against the image file names.
train_df = (
    training_filtred_data
    .loc[:, ['ImageID', 'XMin', 'XMax', 'YMin', 'YMax', 'ClassName']]
    .assign(ImageID=lambda frame: frame['ImageID'] + '.jpg')
)
train_df.head()

In [None]:
# List the regular files in the train image folder and keep only the
# annotation rows whose ImageID matches one of them.
train_image_path = os.path.join(paths['IMAGE_PATH'], "train", "data")
train_onlyfiles = [
    entry
    for entry in listdir(train_image_path)
    if isfile(join(train_image_path, entry))
]
train_images_df = pd.DataFrame(train_onlyfiles, columns=['ImageID'])
train_df = pd.merge(train_df, train_images_df, on="ImageID", how='inner')

**3.2 test data**

In [None]:
# Read the detections table of the test split, report its shape, keep only
# the boxes of our classes (inner join on LabelName) and show the per-class
# box counts.
testing_annotation_loc = os.path.join(
    paths['IMAGE_PATH'], "test", "labels", "detections.csv"
)
testing_annotation_file = pd.read_csv(testing_annotation_loc)
print(
    "records present for testing befor filtering: ",
    testing_annotation_file.shape,
)
testing_filtred_data = pd.merge(
    testing_annotation_file,
    my_classes,
    how='inner',
    on="LabelName",
)
testing_filtred_data.loc[:, 'ClassName'].value_counts()

In [None]:
# Keep only the box columns and the class name, and append ".jpg" to ImageID
# so it can be matched against the image file names.
test_df = (
    testing_filtred_data
    .loc[:, ['ImageID', 'XMin', 'XMax', 'YMin', 'YMax', 'ClassName']]
    .assign(ImageID=lambda frame: frame['ImageID'] + '.jpg')
)
test_df.head()

In [None]:
# List the regular files in the test image folder and keep only the
# annotation rows whose ImageID matches one of them.
test_image_path = os.path.join(paths['IMAGE_PATH'], "test", "data")
test_onlyfiles = [
    entry
    for entry in listdir(test_image_path)
    if isfile(join(test_image_path, entry))
]
test_images_df = pd.DataFrame(test_onlyfiles, columns=['ImageID'])
test_df = pd.merge(test_df, test_images_df, on="ImageID", how='inner')

**3.3 Validation data**

In [None]:
# Read the detections table of the validation split, report its shape, keep
# only the boxes of our classes (inner join on LabelName) and show the
# per-class box counts.
val_annotation_loc = os.path.join(
    paths['IMAGE_PATH'], "validation", "labels", "detections.csv"
)
val_annotation_file = pd.read_csv(val_annotation_loc)
print(
    "records present for validation befor filtering : ",
    val_annotation_file.shape,
)
val_filtred_data = pd.merge(
    val_annotation_file,
    my_classes,
    how='inner',
    on="LabelName",
)
val_filtred_data.loc[:, 'ClassName'].value_counts()

In [None]:
# Keep only the box columns and the class name, and append ".jpg" to ImageID
# so it can be matched against the image file names.
val_df = (
    val_filtred_data
    .loc[:, ['ImageID', 'XMin', 'XMax', 'YMin', 'YMax', 'ClassName']]
    .assign(ImageID=lambda frame: frame['ImageID'] + '.jpg')
)
val_df.head()

In [None]:
# List the regular files in the validation image folder and keep only the
# annotation rows whose ImageID matches one of them.
val_image_path = os.path.join(paths['IMAGE_PATH'], "validation", "data")
val_onlyfiles = [
    entry
    for entry in listdir(val_image_path)
    if isfile(join(val_image_path, entry))
]
val_images_df = pd.DataFrame(val_onlyfiles, columns=['ImageID'])
val_df = pd.merge(val_df, val_images_df, on="ImageID", how='inner')

# 4. Exporting

**4.1 Exporting CSV**

In [None]:
# Write each split's table to its annotation directory, in the same order as
# before: train, test, validation.
for _frame, _dir_key, _csv_name in (
    (train_df, 'ANNOTATION_TRAIN_PATH', 'train.csv'),
    (test_df, 'ANNOTATION_TEST_PATH', 'test.csv'),
    (val_df, 'ANNOTATION_VAL_PATH', 'val.csv'),
):
    _frame.to_csv(os.path.join(paths[_dir_key], _csv_name))

**4.2 Create annotation files**

In [None]:
def df_to_annotation(df,filepath,part="train"):
    """Write one ``image_path,x1,y1,x2,y2,className`` line per row of ``df``.

    Args:
        df: DataFrame with the columns ImageID, XMin, XMax, YMin, YMax and
            ClassName. The four box columns are multiplied by the image
            width/height, i.e. they are treated as fractions of the image size.
        filepath: Destination text file; created if missing, truncated
            otherwise.
        part: Sub-folder of ``paths["IMAGE_PATH"]`` whose ``data`` directory
            holds the images ("train", "test" or "validation" in this
            notebook).

    Raises:
        OSError: If an image referenced by ``df`` cannot be read by
            ``cv2.imread``. Lines written before the failure stay in the file.
    """
    image_dir = os.path.join(paths["IMAGE_PATH"], part, 'data')
    # The same image can appear in several rows (one per box); decode each
    # file once and reuse its (height, width).
    sizes = {}
    columns = (
        df['ImageID'], df['XMin'], df['XMax'],
        df['YMin'], df['YMax'], df['ClassName'],
    )
    with open(filepath, "w") as f:
        for image_id, x_min, x_max, y_min, y_max, class_name in zip(*columns):
            file_name = os.path.join(image_dir, image_id)
            if file_name not in sizes:
                img = cv2.imread(file_name)
                if img is None:
                    # imread signals a missing/corrupt file by returning None
                    # instead of raising; fail with a message that names the file.
                    raise OSError(f"could not read image: {file_name}")
                sizes[file_name] = img.shape[:2]
            height, width = sizes[file_name]
            # int() truncates the scaled coordinates to whole pixels.
            x1 = int(x_min * width)
            x2 = int(x_max * width)
            y1 = int(y_min * height)
            y2 = int(y_max * height)
            f.write(f"{file_name},{x1},{y1},{x2},{y2},{class_name}\n")

In [None]:
train_annotation_file=os.path.join(paths['ANNOTATION_TRAIN_PATH'],"annotation.txt")
# No separate `touch` step: df_to_annotation opens the file for writing,
# which creates it when missing. (`!touch train_annotation_file` would not
# expand the Python variable and would create a stray file with that literal
# name in the working directory.)
#uncomment it if you want to load df from disk
#train_df = pd.read_csv(join(paths['ANNOTATION_TRAIN_PATH'],'train.csv')) 
df_to_annotation(train_df,train_annotation_file,"train")

In [None]:
test_annotation_file=os.path.join(paths['ANNOTATION_TEST_PATH'],"annotation.txt")
# No separate `touch` step: df_to_annotation opens the file for writing,
# which creates it when missing. (`!touch test_annotation_file` would not
# expand the Python variable and would create a stray file with that literal
# name in the working directory.)
#uncomment it if you want to load df from disk
#test_df = pd.read_csv(join(paths['ANNOTATION_TEST_PATH'],'test.csv'))
df_to_annotation(test_df,test_annotation_file,"test")

In [None]:
val_annotation_file=os.path.join(paths['ANNOTATION_VAL_PATH'],"annotation.txt")
# No separate `touch` step: df_to_annotation opens the file for writing,
# which creates it when missing. (`!touch val_annotation_file` would not
# expand the Python variable and would create a stray file with that literal
# name in the working directory.)
#uncomment it if you want to load df from disk
#val_df = pd.read_csv(join(paths['ANNOTATION_VAL_PATH'],'val.csv'))
df_to_annotation(val_df,val_annotation_file,"validation")

Now we have annotation files in which every line has the format `path_to_image,x1,y1,x2,y2,className`, where `className` is one of the selected classes (e.g. `Dog`).