# Objective: Obtain out of fold predictions on the entire training set using cross validation and then using a mean average precision IoU metric, that closely resembles the competition metric, to improve validation

In [1]:
import numpy as np
import pandas as pd

# Prepare out of fold training predictions for implementation of MAP IoU matching competition evaluation description

Load oof predictions from CNN segmentation CV kernel https://www.kaggle.com/cchadha/cnn-segmentation-cv-with-oof-preds-on-train-set/notebook

In [2]:
oof_preds0 = pd.read_csv('submission5.csv')

In [3]:
oof_preds0.head()

Unnamed: 0,patientId,PredictionString
0,c1a1144a-91c5-466b-b707-6303b2dc5500,0.98 614.0 432.0 260.0 282.0
1,0f165be0-0173-4a7c-8411-9b9b29e4f021,
2,c1d15ac6-9205-4063-9916-3856fb9ebff7,0.96 192.0 504.0 190.0 168.0
3,305693d4-6acb-4bf0-90be-4ff2a218689f,
4,11d66121-49b9-4365-8c43-f9d3474e7c52,


Read in training labels

Parse bounding box labels into correct format for Mean Average Precision IoU metric

In [4]:
df = pd.read_csv('input/stage_2_train_labels.csv')
# make a bbox_target column using x, y, width and height.
df['bbox'] = (df['x'].astype(str) +
                    ' ' + 
                    df['y'].astype(str) +
                    ' ' +
                    df['width'].astype(str) +
                    ' ' +
                    df['height'].astype(str))
# 공백을 기준으로 나눔, ','는 기본으로 찍힘
df.loc[:, 'bbox'] = df.loc[:, 'bbox'].map(lambda x: x.split(' '))
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30227 entries, 0 to 30226
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   patientId  30227 non-null  object 
 1   x          9555 non-null   float64
 2   y          9555 non-null   float64
 3   width      9555 non-null   float64
 4   height     9555 non-null   float64
 5   Target     30227 non-null  int64  
 6   bbox       30227 non-null  object 
dtypes: float64(4), int64(1), object(2)
memory usage: 1.6+ MB


In [5]:
# 'patientId'를 기준으로 정렬 후, 동일 patientId가 있을 경우,
# bbox_target값 sum 하는데 string이라 옆으로 합쳐진다.
#df = df.groupby(['patientId'], as_index = False)['bbox_target'].agg('sum')
#df = df.groupby(['patientId'],as_index = False)['bbox'].agg('sum') #왜 이건 안되곰?
#df = df.groupby(['patientId'],as_index = False)['width'].agg('sum') # 왜 이건 되곰?
df = df.groupby(['patientId'],as_index = False)['bbox'].agg('sum')
df.head()
#df = df.agg('sum')
#df = df['bbox'].agg('sum')

AttributeError: 'Series' object has no attribute 'columns'

In [None]:
df.agg('sum')
df.head()

Merge labels and oof preds

In [None]:
#patientId를 기준으로 merge, 동일한 아이디가 없으면 삭제하기!
df = df.merge(oof_preds0, on = 'patientId', how = 'right')
df.head()

In [None]:
#3개의 파일을 merge 해서 PredictionString(x, y, default)로 뜸
df = df.fillna('') # none -> " "로 처리

In [None]:
df.head()

Parse oof preds for MAP IoU

In [None]:
#필요없는 열 제거
#df = df.drop(['PredictionString','PredictionString_x', 'PredictionString_y'], axis=1)
df.loc[:, 'bbox_pred'] = (df.loc[:, 'PredictionString'])
df = df.drop(['PredictionString'], axis=1)
df.head(20)

Stripping whitespace from PredictionString column

In [None]:
# bbox_pred 값을 문자열로 변환 후 문자열의 양끝에서 공백, 탭, 개행문자 등을 제거
df = df.fillna('') # none -> " "로 처리
df.loc[:, 'bbox_pred'] = df.loc[:, 'bbox_pred'].str.strip()
df.tail(20)

In [None]:
# 공백을 기준으로 나눔, ','는 기본으로 찍힘
df.loc[:, 'bbox_pred'] = df.loc[:, 'bbox_pred'].map(lambda x: x.split(' '))
df.head()

In [None]:
def parse_scores(x):
    if len(x)!=1:
        scores = [x[k] for k in range(0,len(x),5)]
        for score in range(len(scores)):
            scores[score] = float(scores[score])
        return np.asarray(scores)
    #score 추출

In [None]:
df.loc[:, 'bbox_scores'] = df.loc[:, 'bbox_pred'].map(parse_scores)

In [None]:
df.tail()

In [None]:
def parse_bbox(x):
    if len(x)!=1:
        bbox = [(x[k]) for k in range(0,len(x)) if k%5 != 0] #int 삭제
        return np.asarray(bbox).reshape(int(len(bbox)/4),4)
    # bbox 추출

In [None]:
df.loc[:, 'bbox_preds'] = df.loc[:, 'bbox_pred'].map(parse_bbox)

In [None]:
df.tail()

In [None]:
#bbox_pred 추출 후 열 제거
df = df.drop(['bbox_pred'], axis=1)

In [None]:
df.tail()

Edit NaN or None values to empty numpy arrays to fit MAP IoU metric implementation

In [None]:
df.loc[df['bbox_scores'].isnull(),['bbox_scores']] = df.loc[df['bbox_scores'].isnull(),'bbox_scores'].apply(lambda x: np.asarray([]))

In [None]:
#bbox_scores null 제거 후 출력
df.tail()

In [None]:
df.loc[df['bbox_preds'].isnull(),['bbox_preds']] = df.loc[df['bbox_preds'].isnull(),'bbox_preds'].apply(lambda x: np.asarray([]))

In [None]:
#bbox_pred null 제거 후 출력
df.head()

In [None]:
def parse_target_str(x):
    if x[0] != 'nan':
        bbox = np.asarray([int(float(x[k])) for k in range(0,len(x))])
        return bbox.reshape(int(len(bbox)/4),4)

In [None]:
df.loc[:,'bbox_target'] = df.loc[:,'bbox_target'].map(parse_target_str)
df.head()
#bbox_target값 4개씩 묶어서 재 배열

In [None]:
df.loc[df['bbox_target'].isnull(),['bbox_target']] = df.loc[df['bbox_target'].isnull(),'bbox_target'].apply(lambda x: np.asarray([]))

In [None]:
df.head()
# bbox_target null 제거 후 출력

# Find mean average precision IoU using implementation by chenyc15 https://www.kaggle.com/chenyc15/mean-average-precision-metric and edited herein

In [None]:
# helper function to calculate IoU
def iou(box1, box2):
    box1 = box1.astype(np.float)
    box2 = box2.astype(np.float)
    x11, y11, w1, h1 = box1
    x21, y21, w2, h2 = box2
    assert w1 * h1 > 0
    assert w2 * h2 > 0
    x12, y12 = x11 + w1, y11 + h1
    x22, y22 = x21 + w2, y21 + h2

    area1, area2 = w1 * h1, w2 * h2
    xi1, yi1, xi2, yi2 = max([x11, x21]), max([y11, y21]), min([x12, x22]), min([y12, y22])
    
    if xi2 <= xi1 or yi2 <= yi1:
        return 0
    else:
        intersect = (xi2-xi1) * (yi2-yi1)
        union = area1 + area2 - intersect
        return intersect / union

In [None]:
def map_iou(boxes_true, boxes_pred, scores, thresholds = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75]):
    """
    Mean average precision at differnet intersection over union (IoU) threshold
    
    input:
        boxes_true: Mx4 numpy array of ground true bounding boxes of one image. 
                    bbox format: (x1, y1, w, h)
        boxes_pred: Nx4 numpy array of predicted bounding boxes of one image. 
                    bbox format: (x1, y1, w, h)
        scores:     length N numpy array of scores associated with predicted bboxes
        thresholds: IoU shresholds to evaluate mean average precision on
    output: 
        map: mean average precision of the image
    """
    
    # According to the introduction, images with no ground truth bboxes will not be 
    # included in the map score unless there is a false positive detection (?)
        
    # return None if both are empty, don't count the image in final evaluation (?)
    if len(boxes_true) == 0 and len(boxes_pred) == 0:
        return None
    elif len(boxes_true) == 0 and len(boxes_pred) > 0:
        return 0
    elif len(boxes_true) > 0 and len(boxes_pred) == 0:
        return 0
    elif len(boxes_true) > 0 and len(boxes_pred) > 0:
        assert boxes_true.shape[1] == 4 or boxes_pred.shape[1] == 4, "boxes should be 2D arrays with shape[1]=4"
        if len(boxes_pred):
            assert len(scores) == len(boxes_pred), "boxes_pred and scores should be same length"
            # sort boxes_pred by scores in decreasing order
            boxes_pred = boxes_pred[np.argsort(scores)[::-1], :]

        map_total = 0
        map_threshold = []

        # loop over thresholds
        for t in thresholds:
            matched_bt = set()
            tp, fn = 0, 0
            for i, bt in enumerate(boxes_true):
                matched = False
                for j, bp in enumerate(boxes_pred):
                    miou = iou(bt, bp)
                    if miou >= t and not matched and j not in matched_bt:
                        matched = True
                        tp += 1 # bt is matched for the first time, count as TP
                        matched_bt.add(j)
                if not matched:
                    fn += 1 # bt has no match, count as FN

            fp = len(boxes_pred) - len(matched_bt) # FP is the bp that not matched to any bt
            m = tp / (tp + fn + fp) #precision value
            map_threshold.append(m)
            map_total += m
    
        map_total=map_total / len(thresholds) #AP per image
        map_threshold.append(map_total)
    
    return map_threshold

In [None]:
#map_iou test, 상위 20개 행만 테스트
for row in range(20):
    print(map_iou(df['bbox_target'][row], df['bbox_preds'][row], df['bbox_scores'][row]))

In [None]:
#non이 아닐때만 map_iou 계산
map_scores = [
    x for x in [map_iou(df['bbox_target'][row], df['bbox_preds'][row], df['bbox_scores'][row]) for row in range(len(df))] if x is not None]

In [None]:
# null -> [0, 0, 0, 0, 0, 0, 0, 0, 0]
for i in range(len(map_scores)):
    if map_scores[i] == 0 :
        del map_scores[i]
        map_scores.insert(i,[0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
new_list = list(map(list, zip(*map_scores))) # reverse, [i][j]->[j][i]

In [None]:
thres_list = []
for i in range(len(new_list)):
    thres_list.append(np.mean(new_list[i])) # AP per thresholds

In [None]:
thres_list #임계치별 AP (0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, (mAP))

In [None]:
np.mean(thres_list)

In [None]:
def AP_N(thres, thres_list):
    thres_sum = []
    thresholds = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75]
    index = thresholds.index(thres)
    for i in range(index, len(thresholds)):
        thres_sum.append(thres_list[i])
    
    return np.mean(thres_sum)           # mAP    

In [None]:
thresholds = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75]

for i in range(len(thresholds)):
    print(AP_N(thresholds[i], thres_list))