In [1]:
import os
import torch
import json
import glob
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from random import shuffle
import albumentations as A
from albumentations.pytorch import ToTensorV2
import pandas as pd
import cv2
from tqdm import tqdm
from collections import defaultdict
import sys
import dawid_skene
import numpy as np
import shutil
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import balanced_accuracy_score

  # NOTE(review): this statement looks like a stray paste. `yaml` is never imported and `f` is
  # never defined in the visible cells, and the leading indent is not valid for a top-level
  # statement. If the line is really needed, prefer `yaml.safe_load`: `yaml.load` without an
  # explicit Loader is unsafe on untrusted input. TODO confirm whether it can be removed.
  data = yaml.load(f.read()) or {}


In [2]:
# Lower-case attribute names. They are the prefixes of the `<attribute>_pr`, `<attribute>_gt`
# and `<attribute>_conf` columns of the result DataFrames built further down.
attributes_list = ["bag", "backpack"]

In [3]:
def get_model(path) -> torch.nn.Module:
    """Load a serialized model from ``path`` and prepare it for inference.

    The model is placed on the GPU when CUDA is available (on the CPU
    otherwise) and switched to evaluation mode.

    Args:
        path: Filesystem path of a checkpoint readable by ``torch.load``.
            The checkpoint is expected to hold a whole model object, because
            ``.to()`` and ``.eval()`` are called on the loaded value.

    Returns:
        The loaded model, in eval mode, on the selected device.

    Raises:
        SystemExit: If ``path`` does not exist.
    """
    # Guard clause: stop with a readable message when the checkpoint is missing.
    if not os.path.exists(path):
        sys.exit(f"Model {path} not found")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # map_location lets a checkpoint saved on a GPU be deserialized on a CPU-only host.
    # NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which
    # rejects fully pickled modules - confirm against the installed version.
    model = torch.load(path, map_location=device)
    model.to(device)
    model.eval()
    return model

### 1. Specify the paths to models and data

In [4]:
# Checkpoints of the individual models that make up the ensemble
models_path = [
    '/home/local/Attributes-pytorch/Results/04_04_22/Stage7Baselinev3_200/best_model.pth',
    '/home/local/Attributes-pytorch/Results/04_04_22/Stage6Baselinev3_200_1e-3_normalized/best_model.pth',
    '/home/local/Attributes-pytorch/Results/04_04_22/Stage6Baselinev3_200_1e-3/best_model.pth',
    '/home/local/Attributes-pytorch/Results/04_04_22/Stage6Baselinev2_200/best_model.pth',
]

# Directory with the images whose annotations are being checked
dataset_path = '/home/local/Attributes-pytorch/data/Datasets/Bags/Test/images'

### 2. Build the "matrix of annotations"

In [5]:
# Spatial size (in pixels) of the tensor fed to the networks
w, h = 200, 200

# Pre-processing applied to every image before inference: resize with a 20 px margin,
# crop back to (w, h), photometric/geometric jitter, then mean/std normalization.
# NOTE(review): the crop, flip, brightness/contrast and blur steps are random, so two runs
# over the same image can produce different inputs - confirm this is intended for inference.
transform_test = A.Compose(
    [
        A.Resize(height=h + 20, width=w + 20),
        A.RandomCrop(height=h, width=w),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        A.Blur(blur_limit=4, p=0.5),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

In [6]:
# For the matrix format, see dawid_skene.py
matrix = defaultdict(lambda: defaultdict(dict))

# Classes
attributes = ['Bag', 'Bakpack']
# One independent (still empty) annotation matrix per attribute, filled as
# multiclass_matrix[attribute][image name][model id] = predicted label list
multiclass_matrix = {attribute: matrix.copy() for attribute in attributes}

# The directory listing does not depend on the model, so read it once.
img_names = os.listdir(dataset_path)

for model_id, model_path in enumerate(models_path):
    model = get_model(model_path)
    # Send inputs to the device the model weights actually live on.
    device = next(model.parameters()).device

    for img_name in tqdm(img_names, f'Model - {model_id}'):
        # open image
        img_path = os.path.join(dataset_path, img_name)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports an unreadable file by returning None; fail with the file name
            # instead of the opaque error cvtColor would raise.
            raise ValueError(f"Cannot read image {img_path}")
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # prepare to cnn input: move the last axis (channels) first and add a batch axis
        img_rgb = transform_test(image=img_rgb)['image']
        x = torch.tensor(img_rgb, dtype=torch.float).permute((2, 0, 1)).unsqueeze(0)

        # run model
        with torch.no_grad():
            output = model(x.to(device))

        # store output: output[i] is read as the prediction head of the i-th attribute
        for i, attribute in enumerate(attributes):
            pr_label = torch.argmax(output[i].to(torch.device('cpu')), dim=1).tolist()
            multiclass_matrix[attribute][img_name][model_id] = pr_label

# Create the output folder if needed and close the file deterministically.
os.makedirs("generated", exist_ok=True)
with open("generated/multiclass_matrix.json", "w") as fout:
    json.dump(multiclass_matrix, fout)

Model - 0: 100%|██████████| 20953/20953 [02:58<00:00, 117.46it/s]
Model - 1: 100%|██████████| 20953/20953 [02:57<00:00, 117.79it/s]
Model - 2: 100%|██████████| 20953/20953 [02:58<00:00, 117.64it/s]
Model - 3: 100%|██████████| 20953/20953 [02:58<00:00, 117.17it/s]


### 3. Apply the Dawid-Skene algorithm

In [7]:
# Read the per-model predictions produced by the inference cell (file is closed right away).
with open("generated/multiclass_matrix.json") as fin:
    multiclass_matrix = json.load(fin)

# Classes
attributes = ['Bag', 'Bakpack']
multiclass_soft_pseudo_labels = dict()
banner = "======================================"
for attribute in attributes:
    print(banner)
    print(attribute)
    print(banner)
    # Aggregate the votes of all models for this attribute with Dawid-Skene.
    multiclass_soft_pseudo_labels[attribute] = dawid_skene.run(multiclass_matrix[attribute])

with open("generated/soft_pseudo_labels.json", "w") as fout:
    json.dump(multiclass_soft_pseudo_labels, fout)

Bag
num Patients: 20953
Observers: ['0', '1', '2', '3']
Classes: [0, 1, 2, 3]
Iter	log-likelihood	delta-CM	delta-ER
1 	 -75738.2680160249
2 	 -75600.48632352907 	0.009826	0.807683
3 	 -75568.36606407295 	0.008409	0.294520
4 	 -75548.73006939075 	0.006853	0.219004
5 	 -75535.51543657664 	0.005900	0.176649
6 	 -75526.409159969 	0.005073	0.144371
7 	 -75520.0472107431 	0.004366	0.119315
8 	 -75515.54965106155 	0.003766	0.099445
9 	 -75512.3338092569 	0.003257	0.083870
10 	 -75510.009397918 	0.002825	0.071390
11 	 -75508.31238180827 	0.002456	0.061013
12 	 -75507.06212595274 	0.002141	0.052376
13 	 -75506.13353836263 	0.001870	0.045101
14 	 -75505.43891577641 	0.001637	0.038925
15 	 -75504.91603373476 	0.001435	0.033674
16 	 -75504.52025320723 	0.001260	0.029195
17 	 -75504.21922480552 	0.001107	0.025361
18 	 -75503.98928949448 	0.000974	0.022071
19 	 -75503.81300106349 	0.000858	0.019241
20 	 -75503.6773994621 	0.000757	0.016801
21 	 -75503.5727935192 	0.000668	0.014694
22 	 -75503.491893

### 4. Analyse soft pseudo labels

#### 4.1 Concatenate with ground truth

In [13]:
# NOTE(review): the Dawid-Skene cell writes "generated/soft_pseudo_labels.json", while this
# cell reads "generated/soft_pseudo_labels_freq.json" - confirm which file is intended.
with open("generated/soft_pseudo_labels_freq.json") as fin:
    multiclass_soft_pseudo_labels = json.load(fin)
with open(os.path.join(dataset_path, "..", "markup.json")) as fin:
    markup = json.load(fin)

attributes = ['Bag', 'Bakpack']
columns = ['filename', 'bag_conf', 'backpack_conf', 'bag_pr', 'backpack_pr', 'bag_gt', 'backpack_gt']

# Collect the rows in a plain list and build the frame once: appending to a DataFrame
# with .loc one row at a time gets slower as the frame grows.
rows = []
for filename in tqdm(list(markup.keys())):
    # predicted values: the arg-max class and its soft-label score for every attribute
    soft_labels = [np.array(multiclass_soft_pseudo_labels[attribute][filename])
                   for attribute in attributes]
    pr_scores = [scores.max() for scores in soft_labels]
    pr_labels = [scores.argmax() for scores in soft_labels]

    # presumably markup[filename][0] is [bag_gt, backpack_gt] - verify against markup.json
    gt_labels = list(markup[filename][0])

    row = [filename] + pr_scores + pr_labels + gt_labels
    if len(row) != len(columns):
        raise ValueError(f"{filename}: expected {len(columns)} values, got {len(row)}")
    rows.append(row)

results = pd.DataFrame(rows, columns=columns)
results.to_csv("generated/concatinated.csv")
results.head()

100%|██████████| 20953/20953 [01:11<00:00, 294.17it/s]


Unnamed: 0,filename,bag_conf,backpack_conf,bag_pr,backpack_pr,bag_gt,backpack_gt
0,019076.jpg,0.999598,1.0,3,0,-1,1
1,019100.jpg,0.999984,1.0,3,0,3,1
2,019110.jpg,1.0,1.0,3,1,3,1
3,019126.jpg,0.999896,1.0,3,1,3,1
4,019129.jpg,0.99368,1.0,0,1,0,1


#### 4.2 Find errors

In [11]:
# Reload the merged prediction / ground-truth table from disk; the first CSV column holds the
# saved row index, hence index_col=0.
results = pd.read_csv("generated/concatinated.csv", index_col=0)
results.head()

Unnamed: 0,filename,bag_conf,backpack_conf,bag_pr,backpack_pr,bag_gt,backpack_gt
0,019076.jpg,0.999598,1.0,3,0,-1,1
1,019100.jpg,0.999984,1.0,3,0,3,1
2,019110.jpg,1.0,1.0,3,1,3,1
3,019126.jpg,0.999896,1.0,3,1,3,1
4,019129.jpg,0.99368,1.0,0,1,0,1


#### 4.3 Visualize strong errors

In [9]:
# A wrong prediction is exported as a "strong error" only when its `<attribute>_conf`
# value is strictly greater than this threshold.
threshold = 0.95

In [10]:
def to_text(val: int, attribute: str):
    """Translate a numeric label of ``attribute`` into its display name.

    The value ``-1`` yields the fixed string "Непонятно" for any attribute.
    Every other value is used as a list index into the class names of
    ``attribute`` ('bag' or 'backpack').
    """
    class_names = {
        'bag': ['hand_bag', 'shoulder_bag', 'suitcase', 'nothing'],
        'backpack': ['Yes', 'No'],
    }
    if val != -1:
        return str(class_names[attribute][val])
    return "Непонятно"

In [11]:
attributes_list = ["bag", "backpack"]

base_path = f"strong_errors/BagsTest"
os.makedirs(base_path, exist_ok=True)

for attribute in attributes_list:
    pr_col = f"{attribute}_pr"
    gt_col = f"{attribute}_gt"
    conf_col = f"{attribute}_conf"
    os.makedirs(f"{base_path}/{attribute}", exist_ok=True)

    for val in set(results[pr_col].tolist()):
        # One folder per predicted class.
        dir_path = f"{base_path}/{attribute}/{to_text(val, attribute)}"
        os.makedirs(dir_path, exist_ok=True)

        # Remove the exports of a previous run. Earlier exports were named
        # "conf_..._gt_<label>" without an extension, so "*.jpg" alone never matched them;
        # the "conf_*" pattern clears those as well.
        stale_files = set(glob.glob(f"{dir_path}/*.jpg")) | set(glob.glob(f"{dir_path}/conf_*"))
        for stale in stale_files:
            if os.path.isfile(stale):
                os.remove(stale)

        # Strong errors: predicted `val`, ground truth differs, confidence above the threshold.
        strong_errors = results[
            (results[pr_col] == val)
            & (results[gt_col] != val)
            & (results[conf_col] > threshold)
        ]
        for name, conf, gt in zip(strong_errors.filename.tolist(),
                                  strong_errors[conf_col].tolist(),
                                  strong_errors[gt_col].tolist()):
            # Keep the image extension at the end of the new name so viewers (and the
            # "*.jpg" cleanup above) still recognise the exported file.
            stem, ext = os.path.splitext(os.path.basename(name))
            new_name = f"conf_{int(conf * 100)}_name_{stem}_gt_{to_text(gt, attribute)}{ext}"
            # Copy straight to the final name instead of copying and then renaming.
            shutil.copy(os.path.join(dataset_path, name), os.path.join(dir_path, new_name))

### Performance evaluation

In [14]:
# Display names for the numeric labels of each attribute; Convert() looks labels up here.
# NOTE(review): to_text() names bag class 3 'nothing' while this table says "other" -
# confirm which wording is intended.
mapping = {
    "backpack": {
        0: "Есть",  # Russian for "present"
        -1: "Непонятно",  # Russian for "unclear"
        1: "Нет"  # Russian for "no"
    },
    "bag": {
        -1: "trash",
        0: "hand_bag",
        1: "shoulder_bag",
        2: "suitcase",
        3: "other",
    }
}

In [15]:
def SplitMatrix(multiclass_matrix, markup, models_cnt=4) -> "list[pd.DataFrame]":
    """Split the joint annotation matrix into one prediction table per model.

    Args:
        multiclass_matrix: Nested mapping indexed as
            ``multiclass_matrix[attribute][filename][str(model_id)]``; the first
            element of the stored value is taken as the predicted label.
        markup: Mapping ``filename -> sequence`` whose first element holds the
            ground-truth labels (presumably ``[bag_gt, backpack_gt]`` - verify
            against markup.json).
        models_cnt: Number of models; ids ``0 .. models_cnt - 1`` are read.

    Returns:
        A list with ``models_cnt`` DataFrames with the columns
        ``filename, bag_pr, backpack_pr, bag_gt, backpack_gt``, one row per
        key of ``markup``.

    Raises:
        ValueError: If a row does not contain exactly one value per column.
    """
    columns = ['filename', 'bag_pr', 'backpack_pr', 'bag_gt', 'backpack_gt']
    attributes = ['Bag', 'Bakpack']
    filenames = list(markup.keys())

    output = []
    for model_id in range(models_cnt):
        # Model ids are stored as strings in the matrix, hence the str() lookup key.
        model_key = str(model_id)
        # Gather the rows first and build the frame once; appending with .loc
        # row by row gets slower as the frame grows.
        rows = []
        for filename in tqdm(filenames):
            # predicted values
            pr_labels = [multiclass_matrix[attribute][filename][model_key][0]
                         for attribute in attributes]
            gt_labels = list(markup[filename][0])
            row = [filename] + pr_labels + gt_labels
            if len(row) != len(columns):
                raise ValueError(f"{filename}: expected {len(columns)} values, got {len(row)}")
            rows.append(row)
        output.append(pd.DataFrame(rows, columns=columns))
    return output

In [10]:
# Load the per-model predictions and the ground-truth markup, closing both files right away.
with open("generated/multiclass_matrix.json") as fin:
    multiclass_matrix = json.load(fin)
with open(os.path.join(dataset_path, "..", "markup.json")) as fin:
    markup = json.load(fin)

# One prediction/ground-truth table per ensemble member.
models_stats = SplitMatrix(multiclass_matrix, markup)

100%|██████████| 20953/20953 [00:55<00:00, 378.33it/s]
100%|██████████| 20953/20953 [00:55<00:00, 380.47it/s]
100%|██████████| 20953/20953 [00:55<00:00, 379.18it/s]
100%|██████████| 20953/20953 [00:54<00:00, 384.30it/s]


In [16]:
# Class names per attribute, listed by label index.
# NOTE(review): no code visible in this notebook reads this module-level dict (to_text()
# defines its own local `descr`) - confirm whether it is still needed.
descr = {
    'bag': ['hand_bag', 'shoulder_bag', 'suitcase', 'nothing'],
    'backpack': ['Есть', 'Нет']
}

def Convert(model_stat):
    """Replace the numeric gt/pr labels of ``model_stat`` by their names from ``mapping``.

    For every entry of ``attributes_list`` the ``<attribute>_gt`` and
    ``<attribute>_pr`` columns are rewritten in place; nothing is returned.
    A label that is missing from ``mapping`` raises ``KeyError``.
    """
    for attribute in attributes_list:
        for column in (f"{attribute}_gt", f"{attribute}_pr"):
            model_stat[column] = model_stat[column].apply(
                lambda label: mapping[attribute][label]
            )

def CalcStats(model_stat, model_id):
    """Compute the balanced accuracy of one prediction table for every attribute.

    Rows whose ground-truth value equals ``-1`` are left out of the score of
    the corresponding attribute.

    Returns:
        A list ``["Model-<model_id>", score, ...]`` with one score per entry
        of ``attributes_list``, in that order.
    """
    def _balanced_accuracy(attribute):
        # Score a single attribute on the rows that have a usable ground truth.
        gt_column = f"{attribute}_gt"
        labelled = model_stat.loc[model_stat[gt_column] != -1]
        return balanced_accuracy_score(labelled[gt_column].tolist(),
                                       labelled[f"{attribute}_pr"].tolist())

    return [f"Model-{model_id}"] + [_balanced_accuracy(attribute) for attribute in attributes_list]

In [17]:
# Balanced accuracy per attribute: one row for each single model, then one for the ensemble.
stats = pd.DataFrame(columns=['model', 'bag', 'backpack'])

score_rows = [CalcStats(model_stat, i) for i, model_stat in enumerate(models_stats)]
score_rows.append(CalcStats(results, "Ens"))

for score_row in score_rows:
    stats.loc[len(stats.index)] = score_row

stats.head()

Unnamed: 0,model,bag,backpack
0,Model-0,0.533489,0.758107
1,Model-1,0.474656,0.684778
2,Model-2,0.487699,0.745051
3,Model-3,0.509613,0.703199
4,Model-Ens,0.534183,0.771195


In [None]:
# to_csv does not create missing folders, so make sure the target directory exists first.
# NOTE(review): the file is called "majority_voting" although the ensemble row above comes
# from the Dawid-Skene soft labels - confirm the intended name.
os.makedirs("stats", exist_ok=True)
stats.to_csv("stats/majority_voting.csv")