In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Date    : Jun-26-21 00:07
# @Author  : Kan HUANG (kan.huang@connect.ust.hk)
# Ref: https://stackoverflow.com/questions/46009619/keras-weighted-binary-crossentropy
# Ref: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
import os
import json
import numpy as np
import pandas as pd
from sklearn.utils import class_weight


In [3]:
print("Load Config ...")
with open('./config/config_linux.json', 'r') as f:
    CONFIG = json.load(f)
ROOT_PATH = CONFIG["ROOT_PATH"]
print(f"ROOT_PATH: {ROOT_PATH}")
ROOT_PATH = os.path.expanduser(ROOT_PATH)
print(f"ROOT_PATH: {ROOT_PATH}")
TRAIN_DATA_DIR = os.path.join(ROOT_PATH, CONFIG["TRAIN_DATA_DIR"])
print(f"TRAIN_DATA_DIR: {TRAIN_DATA_DIR}")

Load Config ...


FileNotFoundError: [Errno 2] No such file or directory: './config/config_linux.json'

In [11]:
print("Prepare Data Frame...")
classes = ["good_0", "bad_1"]
categories = []
filenames = []
for c in classes:
    _filenames = os.listdir(os.path.join(TRAIN_DATA_DIR,c))
    for filename in _filenames:
        filenames.append(filename)
        category = c
        if category == "good_0":
            categories.append(0)
        elif category == "bad_1":
            categories.append(1)
df = pd.DataFrame({
    'filename': filenames,
    'category': categories
})
# df["category"] = df["category"].replace({0: 'cat', 1: 'dog'})
# random.shuffle(filenames)
df

Prepare Data Frame...


Unnamed: 0,filename,category
0,4136e69a063167a4ac6f72a4ce25510a.jpg,0
1,ac2bce8a4a6b7a538ffe97337920224e.jpg,0
2,6f31d57b24330a608e0d78fafb85e62f.jpg,0
3,52c87374140f738a466ee26d2995f017.jpg,0
4,8afa3f628db0cabeb75fb2fa228637c3.jpg,0
...,...,...
29995,65dd7bfbf99520c4d5a5864e0ee1af29.jpg,1
29996,5b58fb0c26ab97bbc8b0ebf8ccd18590.jpg,1
29997,afbc1b03096b2c3c54c2a5bd50b4197a.jpg,1
29998,516a90603731c0d0cbc49677d8775635.jpg,1


In [13]:
neg, pos = np.bincount(df['category'])
print(f"neg: {neg}")
print(f"pos: {pos}")

neg: 27000
pos: 3000


In [15]:
# Calculate the weights for each class so that we can balance the data
weights = class_weight.compute_class_weight('balanced',np.unique(df['category']),df['category'])
print(f"weights: {weights}.")
# n_samples / (n_classes * np.bincount(y))

weights: [0.55555556 5.        ].


In [19]:
# 跟 TensorFlow 的实现实际上是等价的
total = 30000
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(f"class_weight: {class_weight}.")

class_weight: {0: 0.5555555555555556, 1: 5.0}.


In [21]:
5.0/9

0.5555555555555556

In [23]:
150/(60/10)

25.0

In [24]:
150/(60/3)

7.5

In [25]:
class_weight = {0: 1.0/101, 1: 100.0/101}

In [26]:
class_weight

{0: 0.009900990099009901, 1: 0.9900990099009901}