In [2]:
import pandas as pd
import numpy as np
import warnings
from pprint import pprint
from tqdm.notebook import tqdm
from glob import glob
from datetime import datetime

# Silence every warning for the rest of the session.
# NOTE: this also hides pandas deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# Input locations, given relative to the notebook's working directory.
RELATIONSHIPS_FOLDER = '../metadata/relationships'  # folder scanned for relationship CSV files
BBOX_FOLDER = '../metadata/bbox'  # folder scanned for bounding-box CSV files
LABELS_FILE = '../metadata/class-descriptions-boxable.csv'  # CSV read for the LabelID -> LabelName lookup

# Load the data:

In [57]:
# DataFrames of all relationships in (ImageID, LabelName1, LabelName2, XMin1, XMax1, YMin1, YMax1, XMin2, XMax2, YMin2, YMax2, RelationshipLabel) format.
# The paths are sorted because glob returns them in arbitrary order; sorting keeps the
# list order (and therefore the row order of the merged frame) reproducible across runs.
relationship_dfs = [pd.read_csv(file_name) for file_name in sorted(glob(RELATIONSHIPS_FOLDER + '/*.csv'))]
# DataFrames of all bounding boxes in (ImageID, Source, LabelName, Confidence, XMin, XMax, YMin, YMax, IsOccluded, IsTruncated, IsGroupOf, IsDepiction, IsInside, XClick1X, XClick2X, XClick3X, XClick4X, XClick1Y, XClick2Y, XClick3Y, XClick4Y) format.
bbox_dfs = [pd.read_csv(file_name) for file_name in sorted(glob(BBOX_FOLDER + '/*.csv'))]

In [58]:
# Merge all relationship DataFrames into one.
# pd.concat handles any number of input files (the previous chained `.append` calls
# required exactly three and `DataFrame.append` no longer exists in pandas 2.x).
# ignore_index=True gives the merged frame a unique 0..n-1 index.
relationship_df = pd.concat(relationship_dfs, ignore_index=True)
# Merge all bounding-box DataFrames into one, in the same way.
bbox_df = pd.concat(bbox_dfs, ignore_index=True)

In [60]:
# Sanity check: (rows, columns) of the merged relationship frame.
relationship_df.shape

(3284282, 12)

In [63]:
# Sanity check: (rows, columns) of the merged bounding-box frame.
bbox_df.shape

(15851536, 21)

# Clean the data:

In [66]:
# Keep every row except those whose RelationshipLabel is 'is'.
relationship_df_final = relationship_df.loc[relationship_df['RelationshipLabel'].ne('is')]

In [68]:
# Keep only the bounding boxes whose image still appears in the filtered relationships.
bbox_df_final = bbox_df.loc[bbox_df['ImageID'].isin(relationship_df_final['ImageID'].unique())]

In [170]:
labels_df = pd.read_csv(LABELS_FILE)
# Lookup table built from the label file: LabelID -> LabelName.
labels_dict = dict(zip(labels_df['LabelID'], labels_df['LabelName']))

def label_to_string(series):
    """Translate a Series of label IDs through `labels_dict`.

    Each element is looked up with `labels_dict.get`, so an ID that is not a key
    of `labels_dict` yields None instead of raising.
    """
    return series.map(lambda label_id: labels_dict.get(label_id))

In [100]:
# Run both label columns of the relationship frame, and the single label column
# of the bounding-box frame, through label_to_string.
for label_column in ('LabelName1', 'LabelName2'):
    relationship_df_final[label_column] = label_to_string(relationship_df_final[label_column])
bbox_df_final['LabelName'] = label_to_string(bbox_df_final['LabelName'])

In [156]:
# Give the frame a fresh 0..n-1 index. The previous index is kept as a new leading
# column; a later cell strips it again by dropping the first column.
relationship_df_final = relationship_df_final.reset_index()

In [157]:
# Group the rows by their (LabelName1, LabelName2) pair and list the distinct pairs.
grouped_relationships = relationship_df_final.groupby(by=['LabelName1', 'LabelName2'])
relationship_keys = list(grouped_relationships.groups)

In [171]:
# Find label pairs that occur in both orders, e.g. (A, B) and (B, A), and keep only
# one orientation of each such pair: the one listed first in `relationship_keys`.
# The rows of the kept orientation are the ones that get swapped in a later cell.
relationship_key_set = set(relationship_keys)  # constant-time membership tests
relationships_to_fix = []
selected_keys = set()
for key in relationship_keys:
    mirrored_key = key[::-1]
    # Skip pairs of identical labels and pairs that never occur in reverse order.
    if key[0] == key[1] or mirrored_key not in relationship_key_set:
        continue
    # The mirrored orientation was already selected, so this one stays untouched.
    if mirrored_key in selected_keys:
        continue
    relationships_to_fix.append(key)
    selected_keys.add(key)

In [206]:
def swap_columns(df, columns, index):
    """Mirror the values of `columns` in place for the rows selected by `index`.

    Each listed column receives the values of the column at the mirrored position
    in the list (first <-> last, and so on). With two columns this is a plain swap.
    `df` is modified in place; nothing is returned.
    """
    mirrored_columns = columns[::-1]
    # Materialise the values first so the assignment is positional, not label-aligned.
    mirrored_values = df.loc[index, mirrored_columns].to_numpy()
    df.loc[index, columns] = mirrored_values

In [165]:
# Swap side 1 and side 2 of every relationship in the selected groups so that each
# label pair is stored in a single orientation.
# The box coordinates are swapped together with the label names: XMin1..YMax1 describe
# the object named by LabelName1, so swapping only the names would attach each label
# to the other object's bounding box.
for group in relationships_to_fix:
    group_index = grouped_relationships.groups[group]
    for field in ('LabelName', 'XMin', 'XMax', 'YMin', 'YMax'):
        swap_columns(relationship_df_final, [f'{field}1', f'{field}2'], group_index)

In [196]:
# Convert every 'under' relationship to 'on' by swapping side 1 and side 2.
# The box coordinates are swapped together with the label names so that each label
# stays attached to its own bounding box.
index = relationship_df_final[relationship_df_final.RelationshipLabel == 'under'].index
for field in ('LabelName', 'XMin', 'XMax', 'YMin', 'YMax'):
    swap_columns(relationship_df_final, [f'{field}1', f'{field}2'], index)
relationship_df_final.loc[index, 'RelationshipLabel'] = 'on'

In [223]:
# Union of the labels used on either side of the cleaned relationships (duplicates removed).
# The set is built from relationship_df_final rather than relationship_df: only the
# cleaned frame had its label columns passed through label_to_string, as did
# bbox_df_final.LabelName, which this set is compared against.
labels_set = set(relationship_df_final.LabelName1.unique()).union(relationship_df_final.LabelName2.unique())

In [226]:
# Drop the bounding boxes whose label is not part of labels_set.
bbox_df_final = bbox_df_final.loc[bbox_df_final['LabelName'].isin(labels_set)]

In [257]:
# Removing unnecessary columns.
# The earlier reset_index() call stored the old index in a leading 'index' column.
# Drop it by name rather than by position, so re-running this cell cannot remove
# ImageID by accident.
relationship_df_final = relationship_df_final.drop(columns=['index'], errors='ignore')
# Restrict the bounding boxes to the columns of one of the raw bounding-box files.
# NOTE(review): the original line ended in an empty subscript (`.tolist()[]`), which
# is a syntax error. The intended column slice is unknown, so all of that file's
# columns are kept here - confirm which subset was meant.
bbox_df_final = bbox_df_final[bbox_dfs[1].columns.tolist()]

# Checkpoint:

In [381]:
# Second-resolution timestamp (YYYY-MM-DD_HH-MM-SS) that versions the checkpoint files.
date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Write both cleaned frames without their index column.
relationship_df_final.to_csv(f'../checkpoints/relationships_modified_{date}.csv', index=False)
bbox_df_final.to_csv(f'../checkpoints/bbox_modified_{date}.csv', index=False)

# Load from checkpoint:

In [3]:
# Collect every CSV in the checkpoints folder.
# NOTE: glob does not guarantee any ordering of the returned paths.
checkpoints = glob('../checkpoints/*.csv')

In [4]:
# Pick the most recent checkpoint of each kind.
# - The timestamp in the file name sorts chronologically as text, so the last element
#   of the sorted list is the newest file (glob order itself is arbitrary).
# - Matching on the full 'relationships_modified' prefix keeps the
#   'relationships_none_*' checkpoints written later in this notebook from being
#   picked up here.
relationship_checkpoint = sorted(checkpoint for checkpoint in checkpoints if 'relationships_modified' in checkpoint)[-1]
bbox_checkpoint = sorted(checkpoint for checkpoint in checkpoints if 'bbox_modified' in checkpoint)[-1]

In [5]:
# Reload both frames from the selected checkpoint files.
# From here on relationship_df and bbox_df refer to the checkpointed data.
relationship_df, bbox_df = (
    pd.read_csv(checkpoint_path)
    for checkpoint_path in (relationship_checkpoint, bbox_checkpoint)
)

# Combining Utils:

In [10]:
# Destructure a bbox record into a more convenient format
def get_box(bbox):
    """Convert a `[label, x_min, x_max, y_min, y_max]` record to a box dict.

    Returns a dict with the keys 'left', 'top', 'width' and 'height'.
    Element 0 of `bbox` is not used.
    """
    x_min, x_max, y_min, y_max = bbox[1], bbox[2], bbox[3], bbox[4]
    return dict(
        left=x_min,
        top=y_min,
        width=x_max - x_min,
        height=y_max - y_min,
    )

In [11]:
# Calculate IoU (intersection over union), used to check how close two bounding boxes are
def intersection_over_union(box_a, box_b, offset=0.0):
    """Return the intersection-over-union score of two boxes.

    Parameters
    ----------
    box_a, box_b : dict
        Boxes with the keys 'left', 'top', 'width' and 'height'.
    offset : float, optional
        Value added to every side length. The default of 0 is right for continuous
        coordinates such as the normalised [0, 1] values in this dataset. Pass 1 to
        get the inclusive integer-pixel convention this function used previously.

    Returns
    -------
    float
        Intersection area divided by union area. Returns 0.0 when the boxes do not
        overlap or when the union has no area.
    """
    # Edges of the overlap rectangle.
    x_left = max(box_a['left'], box_b['left'])
    y_top = max(box_a['top'], box_b['top'])
    x_right = min(box_a['left'] + box_a['width'], box_b['left'] + box_b['width'])
    y_bottom = min(box_a['top'] + box_a['height'], box_b['top'] + box_b['height'])

    # Clamp at zero: without it, boxes that are disjoint on both axes would give
    # a positive "intersection" (negative width times negative height).
    overlap_width = max(0.0, x_right - x_left + offset)
    overlap_height = max(0.0, y_bottom - y_top + offset)
    area_of_intersection = overlap_width * overlap_height

    # Area of both rectangles.
    box_a_area = (box_a['width'] + offset) * (box_a['height'] + offset)
    box_b_area = (box_b['width'] + offset) * (box_b['height'] + offset)

    # Area of union = sum of both areas less the area of intersection.
    area_of_union = box_a_area + box_b_area - area_of_intersection
    if area_of_union <= 0:
        # Degenerate (zero-area) boxes: avoid dividing by zero.
        return 0.0

    return area_of_intersection / float(area_of_union)

In [115]:
# Two objects match when element 0 of each is equal and their boxes overlap almost fully
def equals(object_a, object_b, box_a, box_b):
    """Return True when `object_a[0] == object_b[0]` and the boxes' IoU exceeds 0.98.

    The IoU is only computed when the first elements are equal.
    """
    if object_a[0] != object_b[0]:
        return False
    return intersection_over_union(box_a, box_b) > 0.98

In [64]:
# Converts the relationship DataFrame into plain nested lists
def get_relationships(relationship_df):
    """Return one `[side_1, side_2]` pair per row of `relationship_df`.

    Each side is a `[LabelName, XMin, XMax, YMin, YMax]` list built from the
    columns carrying the matching suffix (1 or 2).
    """
    fields = ('LabelName', 'XMin', 'XMax', 'YMin', 'YMax')

    def _side(row, suffix):
        # Collect the five columns that describe one side of the relationship.
        return [row[f'{field}{suffix}'] for field in fields]

    return [[_side(row, 1), _side(row, 2)] for _, row in relationship_df.iterrows()]

In [84]:
# Converts the bounding-box DataFrame into plain lists
def get_objects(bbox_df):
    """Return one `[LabelName, XMin, XMax, YMin, YMax]` list per row of `bbox_df`."""
    fields = ('LabelName', 'XMin', 'XMax', 'YMin', 'YMax')
    return [[row[field] for field in fields] for _, row in bbox_df.iterrows()]

In [71]:
# For every relationship, list the indices of the objects that take part in it
def get_relationship_indices(relationships, objects):
    """Match each relationship's two sides against the image's objects.

    Parameters
    ----------
    relationships : list
        `[side_1, side_2]` pairs, each side a `[label, x_min, x_max, y_min, y_max]` list.
    objects : list
        `[label, x_min, x_max, y_min, y_max]` lists.

    Returns
    -------
    list of list of int
        One list per relationship, holding in ascending order the index of every
        object that `equals` either side of that relationship.
    """
    # The object boxes do not depend on the relationship, so build them once.
    object_boxes = [get_box(candidate) for candidate in objects]

    indices = []
    for relationship in relationships:
        object_a = relationship[0]
        object_b = relationship[1]
        box_a = get_box(object_a)
        box_b = get_box(object_b)
        matches = []
        for j, (object_c, box_c) in enumerate(zip(objects, object_boxes)):
            # Pass the whole records: `equals` itself reads element 0 (the label).
            # Passing the label string instead makes it compare first characters only.
            if equals(object_a, object_c, box_a, box_c) or equals(object_b, object_c, box_b, box_c):
                matches.append(j)
        indices.append(matches)
    return indices

In [118]:
# Returns the index pairs that are not in relationship_indices
def get_none_relationships(relationship_indices, objects_count):
    """List every object pair `[i, j]` with `i < j` that is not a known relationship.

    Parameters
    ----------
    relationship_indices : list of list of int
        Index lists of the existing relationships. Only an entry that is exactly
        `[i, j]` excludes the pair `[i, j]`.
    objects_count : int
        Number of objects; the pairs are drawn from `range(objects_count)`.

    Returns
    -------
    list of [int, int]
        The remaining pairs, ordered by `i` and then by `j`.
    """
    # A set of tuples gives constant-time lookups instead of scanning the list
    # once for every candidate pair.
    related_pairs = {tuple(pair) for pair in relationship_indices}
    return [
        [i, j]
        for i in range(objects_count)
        for j in range(i + 1, objects_count)
        if (i, j) not in related_pairs
    ]

In [160]:
def get_none_rows(none_relationships, objects, relationship_groups, image_id):
    """Build the 'None'-relationship rows for one image.

    Parameters
    ----------
    none_relationships : list of [int, int]
        Index pairs into `objects` that have no labelled relationship.
    objects : list
        `[label, x_min, x_max, y_min, y_max]` lists.
    relationship_groups : list of (label_1, label_2) tuples
        Label orientations seen so far. Extended in place with every newly
        registered orientation; each orientation is stored once.
    image_id
        Value written to the 'ImageID' field of every row.

    Returns
    -------
    (rows, relationship_groups)
        `rows` is a list of dicts with the keys ImageID, RelationshipLabel and
        LabelName/XMin/XMax/YMin/YMax with suffix 1 and 2. `relationship_groups`
        is the same list object that was passed in.

    NOTE(review): the literal string 'None' may be parsed as a missing value when the
    resulting CSV is read back with pandas' default settings - confirm.
    """
    # Set mirror of the list, kept in sync below, for constant-time membership tests.
    known_groups = set(relationship_groups)

    rows = []
    for none_relationship in none_relationships:
        pair = [objects[none_relationship[0]], objects[none_relationship[1]]]

        # If this orientation is unknown, store the pair the other way round and
        # register that orientation, unless it is already registered. Appending
        # on every row would fill the list with duplicates.
        if (pair[0][0], pair[1][0]) not in known_groups:
            pair = pair[::-1]
            reversed_key = (pair[0][0], pair[1][0])
            if reversed_key not in known_groups:
                known_groups.add(reversed_key)
                relationship_groups.append(reversed_key)

        row = {
            'ImageID': image_id,
            'RelationshipLabel': 'None'
        }
        for position, object_features in enumerate(pair, start=1):
            row[f'LabelName{position}'] = object_features[0]
            row[f'XMin{position}'] = object_features[1]
            row[f'XMax{position}'] = object_features[2]
            row[f'YMin{position}'] = object_features[3]
            row[f'YMax{position}'] = object_features[4]
        rows.append(row)
    return rows, relationship_groups

# Combining None Relationships:

In [164]:
# Group both frames per image so each image's rows can be fetched by its ImageID.
relationship_grouped, bbox_grouped = (
    frame.groupby('ImageID') for frame in (relationship_df, bbox_df)
)

In [1]:
# Label orientations (LabelName1, LabelName2) present in the labelled relationships.
relationship_groups = list(relationship_df.groupby(['LabelName1', 'LabelName2']).groups.keys())

# Collect the 'None' rows of every image first and build the DataFrame once at the
# end. Appending to a DataFrame inside the loop copies the whole frame on every
# iteration, and `DataFrame.append` no longer exists in pandas 2.x.
all_none_rows = []
for name, group in relationship_grouped:
    try:
        image_bboxes = bbox_grouped.get_group(name)
    except KeyError:
        # No bounding boxes are left for this image, so there are no pairs to add.
        continue
    relationships = get_relationships(group)
    objects = get_objects(image_bboxes)
    relationship_indices = get_relationship_indices(relationships, objects)
    none_relationships = get_none_relationships(relationship_indices, len(objects))
    none_rows, relationship_groups = get_none_rows(none_relationships, objects, relationship_groups, name)
    all_none_rows.extend(none_rows)

# Labelled relationships followed by all generated 'None' rows, with a fresh index.
relationship_df_none = pd.concat(
    [relationship_df, pd.DataFrame(all_none_rows)],
    ignore_index=True,
    sort=False,
)

In [194]:
# Second-resolution timestamp (YYYY-MM-DD_HH-MM-SS) that versions the checkpoint file.
date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Write the relationships, including the generated 'None' rows, without the index column.
relationship_df_none.to_csv(f'../checkpoints/relationships_none_{date}.csv', index=False)

In [6]:
# Preview the first rows of the relationship frame.
relationship_df.head()

Unnamed: 0,ImageID,LabelName1,LabelName2,XMin1,XMax1,YMin1,YMax1,XMin2,XMax2,YMin2,YMax2,RelationshipLabel
0,9553b9608577b74b,Man,Sunglasses,0.023404,0.985106,0.038344,0.981595,0.238298,0.759574,0.349693,0.529141,wears
1,c9b609675b0db27e,Wine glass,Wine,0.276549,0.997788,0.0,1.0,0.272124,1.0,0.0,1.0,contain
2,5813f39feb5218c3,Woman,Handbag,0.077088,0.897216,0.0,1.0,0.079229,0.740899,0.004688,0.860937,wears
3,5813f39feb5218c3,Girl,Handbag,0.308351,0.899358,0.0,1.0,0.079229,0.740899,0.004688,0.860937,wears
4,b2b742920d39272f,Man,Sunglasses,0.099278,1.0,0.0,1.0,0.133574,0.916968,0.238267,0.590253,wears
