# 1. Managing the directories and paths

In [1]:
!pip install --upgrade fiftyone

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com




In [1]:
#if you are having troubles just reboot the pc
import fiftyone as fo
from fiftyone import ViewField as F
from paths import * #paths if our file paths.py
import pandas as pd
import cv2

In [2]:
# Create every directory listed in `paths` that does not exist yet.
# os.makedirs creates missing parents like `mkdir -p`, but works on any OS
# and is not affected by spaces or shell metacharacters in the path.
for path in paths.values():
    if not os.path.exists(path):
        # exist_ok=True tolerates the directory appearing between the check
        # above and this call.
        os.makedirs(path, exist_ok=True)

In [3]:
# Ensure every directory listed in `paths` exists. This is a no-op for
# directories that are already present, so the cell is safe to re-run.
# os.makedirs is used instead of a shell `mkdir -p` so the cell works on any
# OS and with paths that contain spaces.
for path in paths.values():
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)

In [4]:
# Open Images class names used to select samples and annotations below.
CLASSES = [
    "Dog",
    "Motorcycle",
    "Van",
    "Bus",
    "Bicycle",
    "Car",
    "Person",
    "Man",
    "Truck",
]

# 2. Download Open Images data

In [5]:
# Load (downloading if necessary) the "train" split of Open Images V6 with
# detection labels, restricted to CLASSES and capped at 15000 samples.
train_dataset = fo.zoo.load_zoo_dataset(
    "open-images-v6",
    split="train",
    label_types=["detections"],
    classes=CLASSES,
    dataset_dir=paths["IMAGE_PATH"],
    max_samples=15000,
)

Downloading split 'train' to 'Pytorch/workspace/images/train' if necessary
Necessary images already downloaded
Existing download of split 'train' is sufficient
Loading 'open-images-v6' split 'train'
 100% |█████████████| 15000/15000 [1.5m elapsed, 0s remaining, 145.1 samples/s]      
Dataset 'open-images-v6-train-15000' created


In [6]:
# Load (downloading if necessary) the "test" split of Open Images V6 with
# detection labels, restricted to CLASSES and capped at 300 samples.
test_dataset = fo.zoo.load_zoo_dataset(
    "open-images-v6",
    split="test",
    label_types=["detections"],
    classes=CLASSES,
    dataset_dir=paths["IMAGE_PATH"],
    max_samples=300,
)

Downloading split 'test' to 'Pytorch/workspace/images/test' if necessary
Necessary images already downloaded
Existing download of split 'test' is sufficient
Loading 'open-images-v6' split 'test'
 100% |█████████████████| 300/300 [2.5s elapsed, 0s remaining, 124.7 samples/s]      
Dataset 'open-images-v6-test-300' created


In [7]:
# Load (downloading if necessary) the "validation" split of Open Images V6
# with detection labels, restricted to CLASSES and capped at 301 samples.
validation_dataset = fo.zoo.load_zoo_dataset(
    "open-images-v6",
    split="validation",
    label_types=["detections"],
    classes=CLASSES,
    dataset_dir=paths["IMAGE_PATH"],
    max_samples=301,
)

Downloading split 'validation' to 'Pytorch/workspace/images/validation' if necessary
Necessary images already downloaded
Existing download of split 'validation' is sufficient
Loading 'open-images-v6' split 'validation'
 100% |█████████████████| 301/301 [2.1s elapsed, 0s remaining, 139.3 samples/s]      
Dataset 'open-images-v6-validation-301' created


# 3. Data pre-processing

In [8]:
# Read the bounding-box annotations of the train split
# (<IMAGE_PATH>/train/labels/detections.csv) and preview the first rows.
training_annotation_loc = os.path.join(
    paths['IMAGE_PATH'], "train", "labels", "detections.csv"
)
training_annotation_file = pd.read_csv(training_annotation_loc)
print("records present for training befor filtering: ",training_annotation_file.shape)
training_annotation_file.head()

records present for training befor filtering:  (14610229, 21)


Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,...,IsDepiction,IsInside,XClick1X,XClick2X,XClick3X,XClick4X,XClick1Y,XClick2Y,XClick3Y,XClick4Y
0,000002b66c9c498e,xclick,/m/01g317,1,0.0125,0.195312,0.148438,0.5875,0,1,...,0,0,0.148438,0.0125,0.059375,0.195312,0.148438,0.357812,0.5875,0.325
1,000002b66c9c498e,xclick,/m/01g317,1,0.025,0.276563,0.714063,0.948438,0,1,...,0,0,0.025,0.248438,0.276563,0.214062,0.914062,0.714063,0.782813,0.948438
2,000002b66c9c498e,xclick,/m/01g317,1,0.151562,0.310937,0.198437,0.590625,1,0,...,0,0,0.24375,0.151562,0.310937,0.2625,0.198437,0.434375,0.507812,0.590625
3,000002b66c9c498e,xclick,/m/01g317,1,0.25625,0.429688,0.651563,0.925,1,0,...,0,0,0.315625,0.429688,0.25625,0.423438,0.651563,0.921875,0.826562,0.925
4,000002b66c9c498e,xclick,/m/01g317,1,0.257812,0.346875,0.235938,0.385938,1,0,...,0,0,0.317188,0.257812,0.346875,0.307812,0.235938,0.289062,0.348438,0.385938


In [9]:
# classes.csv has no header row; each line pairs a 'LabelName' identifier
# with its human-readable 'ClassName'.
class_descriptions_file = os.path.join(
    paths['IMAGE_PATH'], "train", "metadata", "classes.csv"
)
class_descriptions = pd.read_csv(
    class_descriptions_file,
    header=None,
    names=["LabelName", "ClassName"],
)
class_descriptions.shape

(601, 2)

In [10]:
# Look up the LabelName identifier of each class listed in CLASSES.
my_classes = class_descriptions.loc[
    class_descriptions["ClassName"].isin(CLASSES)
]
my_classes

Unnamed: 0,LabelName,ClassName
39,/m/0199g,Bicycle
49,/m/01bjv,Bus
68,/m/01g317,Person
299,/m/04_sv,Motorcycle
307,/m/04yx4,Man
399,/m/07r04,Truck
445,/m/0bt9lr,Dog
534,/m/0h2r6,Van
570,/m/0k4j,Car


**3.1 train data**

In [11]:
# Inner-join on LabelName: keeps only the boxes of our classes and attaches
# the readable ClassName column to each of them.
training_filtred_data = pd.merge(
    training_annotation_file,
    my_classes,
    on="LabelName",
    how='inner',
)

In [12]:
# Number of non-null ClassName entries, i.e. boxes remaining after the class filter.
training_filtred_data["ClassName"].count()

2815390

In [13]:
# Number of boxes per class in the filtered training annotations.
training_filtred_data['ClassName'].value_counts()

Man           1418594
Person        1034721
Car            248075
Bicycle         40161
Dog             28675
Motorcycle      13382
Truck           12135
Bus             11927
Van              7720
Name: ClassName, dtype: int64

In [14]:
# Keep only the image id, box coordinates and class name, and turn the image
# id into a file name by appending the .jpg extension.
train_df = (
    training_filtred_data
    .loc[:, ['ImageID','XMin','XMax','YMin','YMax','ClassName']]
    .assign(ImageID=lambda frame: frame['ImageID'] + '.jpg')
)
train_df.head()

Unnamed: 0,ImageID,XMin,XMax,YMin,YMax,ClassName
0,000002b66c9c498e.jpg,0.0125,0.195312,0.148438,0.5875,Person
1,000002b66c9c498e.jpg,0.025,0.276563,0.714063,0.948438,Person
2,000002b66c9c498e.jpg,0.151562,0.310937,0.198437,0.590625,Person
3,000002b66c9c498e.jpg,0.25625,0.429688,0.651563,0.925,Person
4,000002b66c9c498e.jpg,0.257812,0.346875,0.235938,0.385938,Person


In [15]:
# Keep only boxes whose image file is present in the train image folder:
# list the regular files there and inner-join the boxes against that list.
train_image_path = os.path.join(paths['IMAGE_PATH'], "train", "data")
train_onlyfiles = [
    entry
    for entry in listdir(train_image_path)
    if isfile(join(train_image_path, entry))
]
train_images_df = pd.DataFrame(train_onlyfiles, columns=['ImageID'])
train_df = pd.merge(train_df, train_images_df, on="ImageID", how='inner')

**3.2 test data**

In [16]:
# Read the test-split annotations, keep only the boxes whose LabelName is in
# my_classes, and show how many boxes each class has.
testing_annotation_loc = os.path.join(
    paths['IMAGE_PATH'], "test", "labels", "detections.csv"
)
testing_annotation_file = pd.read_csv(testing_annotation_loc)
print("records present for testing befor filtering: ",testing_annotation_file.shape)
testing_filtred_data = pd.merge(
    testing_annotation_file,
    my_classes,
    on="LabelName",
    how='inner',
)
testing_filtred_data['ClassName'].value_counts()

records present for testing befor filtering:  (937327, 13)


Person        53385
Car           30153
Man           26231
Dog            5856
Bicycle        1203
Truck          1072
Motorcycle      683
Van             636
Bus             386
Name: ClassName, dtype: int64

In [17]:
# Keep only the image id, box coordinates and class name, and turn the image
# id into a file name by appending the .jpg extension.
test_df = (
    testing_filtred_data
    .loc[:, ['ImageID','XMin','XMax','YMin','YMax','ClassName']]
    .assign(ImageID=lambda frame: frame['ImageID'] + '.jpg')
)
test_df.head()

Unnamed: 0,ImageID,XMin,XMax,YMin,YMax,ClassName
0,0000c64e1253d68f.jpg,0.0,0.513274,0.320796,0.690265,Car
1,0000c64e1253d68f.jpg,0.016224,0.268437,0.298673,0.462389,Car
2,0000c64e1253d68f.jpg,0.480826,0.90413,0.232301,0.488938,Car
3,0000c64e1253d68f.jpg,0.752212,1.0,0.331858,0.65708,Car
4,001547e7ef44a7d8.jpg,0.0,0.051546,0.261641,0.303769,Car


In [18]:
# Keep only boxes whose image file is present in the test image folder:
# list the regular files there and inner-join the boxes against that list.
test_image_path = os.path.join(paths['IMAGE_PATH'], "test", "data")
test_onlyfiles = [
    entry
    for entry in listdir(test_image_path)
    if isfile(join(test_image_path, entry))
]
test_images_df = pd.DataFrame(test_onlyfiles, columns=['ImageID'])
test_df = pd.merge(test_df, test_images_df, on="ImageID", how='inner')

**3.3 Validation data**

In [19]:
# Read the validation-split annotations, keep only the boxes whose LabelName
# is in my_classes, and show how many boxes each class has.
val_annotation_loc = os.path.join(
    paths['IMAGE_PATH'], "validation", "labels", "detections.csv"
)
val_annotation_file = pd.read_csv(val_annotation_loc)
print("records present for validation befor filtering : ",val_annotation_file.shape)
val_filtred_data = pd.merge(
    val_annotation_file,
    my_classes,
    on="LabelName",
    how='inner',
)
val_filtred_data['ClassName'].value_counts()

records present for validation befor filtering :  (303980, 13)


Person        16753
Car            9924
Man            8493
Dog            1937
Bicycle         418
Truck           354
Motorcycle      232
Van             197
Bus             103
Name: ClassName, dtype: int64

In [20]:
# Keep only the image id, box coordinates and class name, and turn the image
# id into a file name by appending the .jpg extension.
val_df = (
    val_filtred_data
    .loc[:, ['ImageID','XMin','XMax','YMin','YMax','ClassName']]
    .assign(ImageID=lambda frame: frame['ImageID'] + '.jpg')
)
val_df.head()

Unnamed: 0,ImageID,XMin,XMax,YMin,YMax,ClassName
0,00075905539074f2.jpg,0.020478,0.329352,0.095602,0.665392,Man
1,00075905539074f2.jpg,0.320819,0.639932,0.0,0.659656,Man
2,00075905539074f2.jpg,0.675768,0.991468,0.172084,0.948375,Man
3,000a1249af2bc5f0.jpg,0.222059,0.998529,0.0,0.988914,Man
4,00141571d986d241.jpg,0.0,0.176563,0.164583,1.0,Man


In [21]:
# Keep only boxes whose image file is present in the validation image folder:
# list the regular files there and inner-join the boxes against that list.
val_image_path = os.path.join(paths['IMAGE_PATH'], "validation", "data")
val_onlyfiles = [
    entry
    for entry in listdir(val_image_path)
    if isfile(join(val_image_path, entry))
]
val_images_df = pd.DataFrame(val_onlyfiles, columns=['ImageID'])
val_df = pd.merge(val_df, val_images_df, on="ImageID", how='inner')

# 4. Exporting

**4.1 Exporting CSV**

In [22]:
# Write each split's filtered box table to a CSV file in its annotation folder.
for _frame, _dir_key, _csv_name in (
    (train_df, 'ANNOTATION_TRAIN_PATH', 'train.csv'),
    (test_df, 'ANNOTATION_TEST_PATH', 'test.csv'),
    (val_df, 'ANNOTATION_VAL_PATH', 'val.csv'),
):
    _frame.to_csv(os.path.join(paths[_dir_key], _csv_name))

**4.2 Create annotation files**

In [23]:
def df_to_annotation(df,filepath,part="train"):
    """Write one ``image_path,x1,y1,x2,y2,className`` line per row of ``df``.

    The XMin/XMax and YMin/YMax values of ``df`` are multiplied by the image
    width and height respectively and truncated with ``int`` to obtain pixel
    coordinates.

    Args:
        df: DataFrame with the columns ImageID, XMin, XMax, YMin, YMax and
            ClassName.
        filepath: Destination text file; it is created or overwritten.
        part: Sub-directory of ``paths["IMAGE_PATH"]`` whose ``data`` folder
            holds the images named in ``df['ImageID']``.

    Returns:
        None.

    Images that OpenCV cannot read are skipped with a warning instead of
    aborting the export with an AttributeError on ``None``.
    """
    import warnings

    image_dir = os.path.join(paths["IMAGE_PATH"], part, 'data')
    # Maps image path -> (height, width), or None when the image is unreadable.
    # The same image can appear on many rows (one per box), so each file is
    # decoded once instead of once per row.
    size_cache = {}
    columns = [df[name] for name in ('ImageID', 'XMin', 'XMax', 'YMin', 'YMax', 'ClassName')]
    with open(filepath, "w") as f:
        # zip over the columns avoids building a Series per row (iterrows).
        for image_id, xmin, xmax, ymin, ymax, className in zip(*columns):
            fileName = os.path.join(image_dir, image_id)
            if fileName not in size_cache:
                img = cv2.imread(fileName)
                if img is None:
                    # cv2.imread signals failure by returning None, not by raising.
                    warnings.warn("could not read image, skipping its boxes: " + fileName)
                    size_cache[fileName] = None
                else:
                    size_cache[fileName] = img.shape[:2]
            size = size_cache[fileName]
            if size is None:
                continue
            height, width = size
            x1 = int(xmin * width)
            x2 = int(xmax * width)
            y1 = int(ymin * height)
            y2 = int(ymax * height)
            f.write(','.join([fileName, str(x1), str(y1), str(x2), str(y2), className]) + '\n')

In [24]:
train_annotation_file=os.path.join(paths['ANNOTATION_TRAIN_PATH'],"annotation.txt")
# No `touch` is needed: df_to_annotation opens the file in write mode, which
# creates it. A shell `!touch train_annotation_file` would not expand the
# Python variable and would create a stray file with that literal name in the
# working directory.
#uncomment it if you want to load df from disk
#train_df = pd.read_csv(join(paths['ANNOTATION_TRAIN_PATH'],'train.csv'))
df_to_annotation(train_df,train_annotation_file,"train")

In [25]:
test_annotation_file=os.path.join(paths['ANNOTATION_TEST_PATH'],"annotation.txt")
# No `touch` is needed: df_to_annotation opens the file in write mode, which
# creates it. A shell `!touch test_annotation_file` would not expand the
# Python variable and would create a stray file with that literal name in the
# working directory.
#uncomment it if you want to load df from disk
#test_df = pd.read_csv(join(paths['ANNOTATION_TEST_PATH'],'test.csv'))
df_to_annotation(test_df,test_annotation_file,"test")

In [26]:
val_annotation_file=os.path.join(paths['ANNOTATION_VAL_PATH'],"annotation.txt")
# No `touch` is needed: df_to_annotation opens the file in write mode, which
# creates it. A shell `!touch val_annotation_file` would not expand the
# Python variable and would create a stray file with that literal name in the
# working directory.
#uncomment it if you want to load df from disk
#val_df = pd.read_csv(join(paths['ANNOTATION_VAL_PATH'],'val.csv'))
df_to_annotation(val_df,val_annotation_file,"validation")

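Now we have one annotation file per split, with one line per bounding box in the format `path_to_image,x1,y1,x2,y2,className`, where the coordinates are in pixels and `className` is one of our classes (e.g. `Dog`).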