# sub-n-classes, Tsallis Entropy labelling

In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
import time
from numpy.random import *

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action = 'ignore', category = FutureWarning)

In [2]:
def randints_nodup(k, upper=10):
    """Draw ``k`` distinct random integers from ``[0, upper)``.

    Parameters
    ----------
    k : int
        How many distinct integers to draw.
    upper : int, optional
        Exclusive upper bound of the sampling range. Defaults to 10, the
        range this function was previously hard-wired to.

    Returns
    -------
    list of int
        The drawn integers in the order they were first drawn (not sorted).

    Raises
    ------
    ValueError
        If ``k`` is larger than ``upper``: there are not enough distinct
        values, so the rejection loop below could never terminate.
    """
    if k > upper:
        raise ValueError(
            f"cannot draw {k} distinct integers from range(0, {upper})"
        )

    ns = []
    while len(ns) < k:
        # Rejection sampling: keep drawing until an unseen value comes up.
        n = randint(0, upper)
        if n not in ns:
            ns.append(n)
    return ns

In [3]:
# entropy labelling
def tsallis_ent_label(q, probas, s_cls):
    """Select the labels of one sample from its class probabilities.

    The class at position ``i`` is kept when ``probas[i] ** (q - 1)`` is at
    least ``sum(probas ** q)``.

    Parameters
    ----------
    q : float
        Exponent used in both the per-class value and the cut-off.
    probas : array-like
        Class probabilities of a single sample.
    s_cls : sequence
        Class label for each position of ``probas``.

    Returns
    -------
    numpy.ndarray
        The kept labels, in position order. May be empty, in which case
        "no label" is printed.
    """
    # Cut-off of the Tsallis entropy model: sum of p_i ** q over all classes.
    cutoff = np.power(probas, q).sum()
    per_class = np.power(probas, q - 1)

    # When only a subset of the classes is used, a position in ``probas`` is
    # not the class label itself, so the label is looked up in ``s_cls``.
    kept = []
    for pos, value in enumerate(per_class):
        if value >= cutoff:
            kept.append(s_cls[pos])
    labels = np.array(kept)

    if not len(labels):
        print("no label")

    return labels

#### ラベルづけからその正解率を評価まで行う関数

In [4]:
# labelling and evaluating them
def tsallis_ent_scls_eval(q, classes_num, orig_A, lim_A):
    """Label a random class subset with an LR annotator and score the labels.

    ``classes_num`` classes are drawn with ``randints_nodup``. Among the
    samples of the module-level ``train_imgs`` / ``train_labels`` that belong
    to those classes, the first ``orig_A`` fit a ``LogisticRegression``
    annotator and the next ``lim_A`` are annotated: their predicted class
    probabilities are turned into label sets by ``tsallis_ent_label``.

    NOTE: ``randints_nodup`` is called with its default range, so the classes
    are always drawn from 0..9 whatever dataset is currently loaded.

    Parameters
    ----------
    q : float
        Exponent forwarded to ``tsallis_ent_label``.
    classes_num : int
        Number of classes to select.
    orig_A : int
        Number of selected-class samples used to fit the annotator.
    lim_A : int
        Maximum number of subsequent samples to annotate.

    Returns
    -------
    tuple of float
        ``(labels_per_sample, label_quality, group_quality)``:
        average number of generated labels per annotated sample, percentage
        of generated labels equal to the true label (0.0 when no label was
        generated at all), and percentage of annotated samples for which a
        generated label equals the true label.

    Raises
    ------
    ValueError
        If no selected-class sample is left to annotate after the first
        ``orig_A`` ones.

    Side effects
    ------------
    Prints the sorted list of selected classes.
    """
    # The class list is sorted so that it is in the same order as the
    # predict_proba columns. list.sort() sorts in place and returns None.
    s_cls = randints_nodup(classes_num)
    s_cls.sort()
    print(s_cls)

    # Positions of the samples belonging to the selected classes. Only the
    # rows actually needed are gathered afterwards, instead of filtering the
    # whole dataset element by element in Python.
    all_labels = np.ravel(train_labels)
    all_imgs = np.asarray(train_imgs)
    picked = np.flatnonzero(np.isin(all_labels, s_cls))
    fit_idx = picked[:orig_A]
    ann_idx = picked[orig_A:orig_A + lim_A]
    if len(ann_idx) == 0:
        raise ValueError(
            "no samples left to annotate: fewer than orig_A + 1 samples "
            "belong to the selected classes"
        )
    true_labels = all_labels[ann_idx]

    # Original author's note: with 2 classes, most class combinations
    # reportedly end in a division by zero.
    # generate an annotator
    a1_model = LR().fit(all_imgs[fit_idx], all_labels[fit_idx])
    a1_proba = a1_model.predict_proba(all_imgs[ann_idx])

    # entropy labelling
    # predict_proba columns are ordered like a1_model.classes_. That equals
    # the sorted s_cls only when every selected class occurs in the fitting
    # samples, so classes_ is the safe lookup table for column -> label.
    mul_labels = [
        tsallis_ent_label(q, probas, a1_model.classes_) for probas in a1_proba
    ]

    # labels score evaluation: count generated labels that match the truth
    score = sum(
        int(np.count_nonzero(labels == t_label))
        for labels, t_label in zip(mul_labels, true_labels)
    )
    n_labels = sum(len(labels) for labels in mul_labels)
    # Use the number of samples really annotated; it can be below lim_A when
    # the selected classes do not provide enough samples.
    n_annotated = len(ann_idx)

    # Avoid ZeroDivisionError when not a single label was generated.
    label_quality = score * 100 / n_labels if n_labels else 0.0

    return (n_labels / n_annotated, label_quality, score * 100 / n_annotated)

## MNIST

In [5]:
from keras.datasets import mnist

# Load MNIST and flatten every image into a single feature row.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_imgs = train_images.reshape(len(train_images), -1)
test_imgs = test_images.reshape(len(test_images), -1)

# Number of features per flattened image.
img_SIZE = train_imgs.shape[1]

Using TensorFlow backend.


### labels evaluation

In [6]:
# Evaluation settings for the currently loaded dataset:
# number of classes to select, annotator fitting size and annotation size.
classes_num = 5
orig_A1 = 2000
lim_A1 = 2000
# Exponent passed to the Tsallis entropy labelling.
q = 1.5

ave_l_num, labels_qual, labels_qual2 = tsallis_ent_scls_eval(
    q, classes_num, orig_A1, lim_A1
)
print(f"quality of labels generated: {labels_qual}[%], {labels_qual2}[%](as groups)")
print(f"average number of labels per image: {ave_l_num}")

[3, 4, 5, 6, 8]
quality of labels generated: 88.80670611439842[%], 90.05[%](as groups)
average number of labels per image: 1.014


## CIFAR 10

In [10]:
from keras.datasets import cifar10

(train_images, train_labels), (test_images, test_labels) = cifar10.load_data()

# number of data samples
train_N = train_images.shape[0]
test_N = test_images.shape[0]

# vectorise: one flattened row per image, with pixel values divided by 255
train_imgs = train_images.reshape(train_N, -1) / 255
test_imgs = test_images.reshape(test_N, -1) / 255

# flatten the label arrays to 1-D
train_labels = train_labels.ravel()
test_labels = test_labels.ravel()

# number of features per flattened image
img_SIZE = train_imgs.shape[1]

### labels evaluation

In [7]:
# Evaluation settings for the currently loaded dataset:
# number of classes to select, annotator fitting size and annotation size.
classes_num = 3
orig_A2 = 2000
lim_A2 = 2000
# Exponent passed to the Tsallis entropy labelling.
q = 1.5

ave_l_num, labels_qual, labels_qual2 = tsallis_ent_scls_eval(
    q, classes_num, orig_A2, lim_A2
)
print(f"quality of labels generated: {labels_qual}[%], {labels_qual2}[%](as groups)")
print(f"average number of labels per image: {ave_l_num}")

[1, 6, 8]
quality of labels generated: 95.70214892553723[%], 95.75[%](as groups)
average number of labels per image: 1.0005


## CIFAR100

In [8]:
from keras.datasets import cifar100

(train_images, train_labels), (test_images, test_labels) = cifar100.load_data(label_mode = "fine")

# number of data samples
train_N = train_images.shape[0]
test_N = test_images.shape[0]

# vectorise: one flattened row per image, with pixel values divided by 255
train_imgs = train_images.reshape(train_N, -1) / 255
test_imgs = test_images.reshape(test_N, -1) / 255

# flatten the label arrays to 1-D
train_labels = train_labels.ravel()
test_labels = test_labels.ravel()

# number of features per flattened image
img_SIZE = train_imgs.shape[1]

### labels evaluation

In [9]:
# Evaluation settings for the currently loaded dataset:
# number of classes to select, annotator fitting size and annotation size.
classes_num = 3
orig_A3 = 500
lim_A3 = 500
# Exponent passed to the Tsallis entropy labelling.
q = 1.5

ave_l_num, labels_qual, labels_qual2 = tsallis_ent_scls_eval(
    q, classes_num, orig_A3, lim_A3
)
print(f"quality of labels generated: {labels_qual}[%], {labels_qual2}[%](as groups)")
print(f"average number of labels per image: {ave_l_num}")

[1, 2, 9]
quality of labels generated: 66.27450980392157[%], 67.6[%](as groups)
average number of labels per image: 1.02
