## Теория решеток для анализа данных

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]

car_data = pd.read_csv('car.data', header=None, names = column_names)

In [3]:
car_data[0:7]

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc


In [4]:
car_arr = np.array(car_data)

In [5]:
car_arr[:, -1]

array(['unacc', 'unacc', 'unacc', ..., 'unacc', 'good', 'vgood'],
      dtype=object)

### Способ 1: рассматривается бинарная классификация

unacc -> negative, else -> positive

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

#X_train, X_test, y_train, y_test = train_test_split(car_arr[:, :-1], car_arr[:, -1], test_size=0.3, random_state=42)

Добавляю кросс-валидацию:

In [7]:
def enumerate_attr(arr):
    res = set([str(i) + ':' + str(elem) for i, elem in zip(np.arange(1, 7), arr)])
    return res

In [329]:
k_fold = KFold(n_splits=7, shuffle=True, random_state=None)
ind = 1

for train_index, test_index in k_fold.split(car_data):
    X_train, X_test = pd.DataFrame(car_arr[train_index]), pd.DataFrame(car_arr[test_index])
    X_train.to_csv('car_train_' + str(ind)+'.csv', index=False, header=False)
    X_test.to_csv('car_test_' + str(ind)+'.csv', index=False, header=False)
    ind += 1

In [330]:
def make_sample(cur_ind):
    cur_name = "car_train_" + str(cur_ind) + ".csv"
    cur_data = pd.read_csv(cur_name, header=None, names = column_names)
    cur_arr = np.array(cur_data)
    
    positive = []
    negative = []
    
    for elem in cur_arr:
        if (elem[-1] == "unacc"):
            negative.append(enumerate_attr(elem))
        else:
            positive.append(enumerate_attr(elem))
            
    positive = np.array(positive)
    negative = np.array(negative)
        
    #return positive[:, :-1], negative[:, :-1]
    return positive, negative

In [331]:
positive, negative = make_sample(1)

In [332]:
def make_tests(cur_ind):
    cur_name = "car_test_" + str(cur_ind) + ".csv"
    cur_data = pd.read_csv(cur_name, header=None, names = column_names)
    cur_arr = np.array(cur_data)
    
    X_test_0 = np.array(cur_arr[:, :-1])
    y_test_0 = np.array(cur_arr[:, -1])
    y_test = []
    X_test = []
    
    for elem in y_test_0:
        if (elem == "unacc"):
            y_test.append(0)
        else:
            y_test.append(1)
            
    for elem in X_test_0:
        X_test.append(enumerate_attr(elem))
    
    y_test = np.array(y_test)
    X_test = np.array(X_test)
    
    return X_test, y_test

In [333]:
X_test, y_test = make_tests(1)

### Алгоритм 1

<br>

Рассматриваем пересечение описания классифицируемого объекта с описанием каждого объекта из плюс-контекста. 

Если оно не вкладывается в описание ни одного из минус-примеров (доп: может вкладываться максимум в num_misses примеров), то начисляем голос, равный мощности пересечения. 

Аналогично делаем и для минус-примеров. 

Рассматриваем также threshold (среди всех голосов максимальный сравнивается с этим значением threshold: если он больше, то присваиваем метку соответствующего класса, а если меньше, то присваиваем неопределенную метку).

In [13]:
def get_intersect(elem1, elem2):
    res = []
    for i in range(len(elem1)):
        if (elem1[i] == elem2[i]):
            res.append(elem1[i])
    return np.array(res)


def is_elem_superset(small_elem, big_elem):
    res = 1
    for i in small_elem:
        if i not in big_elem:
            res = 0
            return 0
    return res


def algorithm_1(positive, negative, x_test, threshold = 0, num_misses = 0):
    y_pred = []
    
    for i in range(len(x_test)):
        elem = np.array(x_test[i])
        
        pos_voices = 0
        neg_voices = 0
        
        for pos_elem in positive:
            pos_elem = np.array(pos_elem)
            cur_intersec = pos_elem & elem #get_intersect(pos_elem, elem)
            
            misses = 0
                        
            for neg_elem in negative:
                #if (neg_elem.issuperset(cur_intersec)):
                if (is_elem_superset(cur_intersec, neg_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                pos_voices += float(len(cur_intersec) / len(positive[0]))
                    
        for neg_elem in negative:
            cur_intersec = neg_elem & elem #get_intersect(neg_elem, elem)
            
            misses = 0
            
            for pos_elem in positive:
                if (is_elem_superset(cur_intersec, pos_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            if (misses <= num_misses):
                neg_voices += float(len(cur_intersec) / len(negative[0]))
    
        pos_voices /= len(positive)
        neg_voices /= len(negative)
        
        pos_part = 0
        neg_part = 0
        
        if (pos_voices + neg_voices > 0):
            pos_part = pos_voices / (pos_voices + neg_voices)
            neg_part = neg_voices / (pos_voices + neg_voices)
        
        if ((pos_voices >= neg_voices) and (pos_part >= threshold)):
            y_pred.append(1)
            
        else:
            if ((neg_voices >= pos_voices) and (neg_part >= threshold)):
                y_pred.append(0)
                
            else:
                # если не достигли значения threshold
                y_pred.append('unknown')
        
    y_pred = np.array(y_pred)
    return y_pred

Тестируем этот алгоритм

In [335]:
total_acc = 0
total_precision = 0
total_recall = 0
total_f1 = 0

for i in range(1, 8):
    cur_positive, cur_negative = make_sample(i)
    cur_x_test, cur_y_test = make_tests(i)
    cur_y_pred = algorithm_1(cur_positive, cur_negative, cur_x_test, threshold=0, num_misses=0)
    cur_acc = accuracy_score(cur_y_test, cur_y_pred)
    print(i, " :  accuracy = ", cur_acc)
    
    cur_prec = precision_score(cur_y_test, cur_y_pred)
    cur_rec = recall_score(cur_y_test, cur_y_pred)
    cur_f1 = f1_score(cur_y_test, cur_y_pred)
    
    total_acc += cur_acc
    total_precision += cur_prec
    total_recall += cur_rec
    total_f1 += cur_f1
    
print('-------------')
print('total accuracy = ', total_acc / 7)
print('total precision = ', total_precision / 7)
print('total recall = ', total_recall / 7)
print('total F1 = ', total_f1 / 7)

1  :  accuracy =  0.9919028340080972
2  :  accuracy =  0.9919028340080972
3  :  accuracy =  0.9878542510121457
4  :  accuracy =  0.9878542510121457
5  :  accuracy =  1.0
6  :  accuracy =  1.0
7  :  accuracy =  0.9959349593495935
-------------
total accuracy =  0.9936355899128685
total precision =  0.9862614393773954
total recall =  0.9921404374140692
total F1 =  0.9890364338731111


Чуть позже сделан анализ этого алгоритма в зависимости от значений двух параметров.

### Алгоритм 2

<br>

Этот алгоритм уже немного сложнее: тут рассматривается еще и минимальная поддержка (supp).

In [336]:
def count_supp(intersec, big_set):
    res = 0
    for one_elem in big_set:
        if one_elem.issuperset(intersec):
            res += 1
    return res


def algorithm_2(positive, negative, x_test, threshold = 0, num_misses = 0, min_supp_part = 0):
    y_pred = []
    
    for i in range(len(x_test)):
        elem = np.array(x_test[i])
        
        pos_voices = 0
        neg_voices = 0
        
        min_supp_pos = int(min_supp_part * len(positive))
        min_supp_neg = int(min_supp_part * len(negative))
        
        for pos_elem in positive:
            pos_elem = np.array(pos_elem)
            cur_intersec = pos_elem & elem
            
            misses = 0
                        
            for neg_elem in negative:
                #if (neg_elem.issuperset(cur_intersec)):
                if (is_elem_superset(cur_intersec, neg_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                if (count_supp(cur_intersec, positive) > min_supp_pos):
                    pos_voices += float(len(cur_intersec) / len(positive[0]))
                    
        for neg_elem in negative:
            cur_intersec = neg_elem & elem #get_intersect(neg_elem, elem)
            
            misses = 0
            
            for pos_elem in positive:
                #if (pos_elem.issuperset(cur_intersec)):
                if (is_elem_superset(cur_intersec, pos_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            if (misses <= num_misses):
                    if (count_supp(cur_intersec, negative) > min_supp_neg):
                        neg_voices += float(len(cur_intersec) / len(negative[0]))
    
        pos_voices /= len(positive)
        neg_voices /= len(negative)
        
        pos_part = 0
        neg_part = 0
        
        if (pos_voices + neg_voices > 0):
            pos_part = pos_voices / (pos_voices + neg_voices)
            neg_part = neg_voices / (pos_voices + neg_voices)
        
        if ((pos_voices >= neg_voices) and (pos_part >= threshold)):
            y_pred.append(1)
            
        else:
            if ((neg_voices >= pos_voices) and (neg_part >= threshold)):
                y_pred.append(0)
                
            else:
                # если не достигли значения threshold
                y_pred.append('unknown')
        
    y_pred = np.array(y_pred)
    return y_pred

In [337]:
total_acc = 0
total_precision = 0
total_recall = 0
total_f1 = 0

for i in range(1, 8):
    cur_positive, cur_negative = make_sample(i)
    cur_x_test, cur_y_test = make_tests(i)
    cur_y_pred = algorithm_2(cur_positive, cur_negative, cur_x_test, threshold=0, num_misses=0, min_supp_part = 0.001)
    cur_acc = accuracy_score(cur_y_test, cur_y_pred)
    print(i, " :  accuracy = ", cur_acc)
    
    cur_prec = precision_score(cur_y_test, cur_y_pred)
    cur_rec = recall_score(cur_y_test, cur_y_pred)
    cur_f1 = f1_score(cur_y_test, cur_y_pred)
    
    total_acc += cur_acc
    total_precision += cur_prec
    total_recall += cur_rec
    total_f1 += cur_f1
    
print('-------------')
print('total accuracy = ', total_acc / 7)
print('total precision = ', total_precision / 7)
print('total recall = ', total_recall / 7)
print('total F1 = ', total_f1 / 7)

1  :  accuracy =  0.9959514170040485
2  :  accuracy =  0.9919028340080972
3  :  accuracy =  0.9878542510121457
4  :  accuracy =  0.9878542510121457
5  :  accuracy =  1.0
6  :  accuracy =  1.0
7  :  accuracy =  1.0
-------------
total accuracy =  0.9947946790052054
total precision =  0.9862614393773954
total recall =  0.9960363020064512
total F1 =  0.9910141336278622


### Алгоритм 3

<br>

Этот алгоритм похож на предыдущий, мы также рассматриваем минимальную поддержку, но теперь уже в качестве голоса считаем не мощность пересечения, а саму поддержку.

In [338]:
def algorithm_3(positive, negative, x_test, threshold=0, num_misses = 0, min_supp_part = 0):
    y_pred = []
    
    for i in range(len(x_test)):
        elem = np.array(x_test[i])
        
        pos_voices = 0
        neg_voices = 0
        
        min_supp_pos = int(min_supp_part * len(positive))
        min_supp_neg = int(min_supp_part * len(negative))
        
        for pos_elem in positive:
            pos_elem = np.array(pos_elem)
            cur_intersec = pos_elem & elem
            
            misses = 0
                        
            for neg_elem in negative:
                if (is_elem_superset(cur_intersec, neg_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                cur_supp = count_supp(cur_intersec, positive)
                if (cur_supp > min_supp_pos):
                    pos_voices += cur_supp
                    
        for neg_elem in negative:
            cur_intersec = neg_elem & elem 
            
            misses = 0
            
            for pos_elem in positive:
                if (is_elem_superset(cur_intersec, pos_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            if (misses <= num_misses):
                cur_supp = count_supp(cur_intersec, negative)
                if (cur_supp > min_supp_neg):
                    neg_voices += cur_supp
    
        pos_voices /= len(positive)
        neg_voices /= len(negative)
        
        pos_part = 0
        neg_part = 0
        
        if (pos_voices + neg_voices > 0):
            pos_part = pos_voices / (pos_voices + neg_voices)
            neg_part = neg_voices / (pos_voices + neg_voices)
        
        if ((pos_voices >= neg_voices) and (pos_part >= threshold)):
            y_pred.append(1)
            
        else:
            if ((neg_voices >= pos_voices) and (neg_part >= threshold)):
                y_pred.append(0)
                
            else:
                # если не достигли значения threshold
                y_pred.append('unknown')
        
    y_pred = np.array(y_pred)
    return y_pred

In [339]:
total_acc = 0
total_precision = 0
total_recall = 0
total_f1 = 0

for i in range(1, 8):
    cur_positive, cur_negative = make_sample(i)
    cur_x_test, cur_y_test = make_tests(i)
    cur_y_pred = algorithm_3(cur_positive, cur_negative, cur_x_test, threshold=0, num_misses=0, min_supp_part = 0.001)
    cur_acc = accuracy_score(cur_y_test, cur_y_pred)
    print(i, " :  accuracy = ", cur_acc)
    
    cur_prec = precision_score(cur_y_test, cur_y_pred)
    cur_rec = recall_score(cur_y_test, cur_y_pred)
    cur_f1 = f1_score(cur_y_test, cur_y_pred)
    
    total_acc += cur_acc
    total_precision += cur_prec
    total_recall += cur_rec
    total_f1 += cur_f1
    
print('-------------')
print('total accuracy = ', total_acc / 7)
print('total precision = ', total_precision / 7)
print('total recall = ', total_recall / 7)
print('total F1 = ', total_f1 / 7)

1  :  accuracy =  0.9959514170040485
2  :  accuracy =  0.9919028340080972
3  :  accuracy =  0.9919028340080972
4  :  accuracy =  0.9878542510121457
5  :  accuracy =  1.0
6  :  accuracy =  1.0
7  :  accuracy =  1.0
-------------
total accuracy =  0.995373048004627
total precision =  0.988216024028157
total recall =  0.9960363020064512
total F1 =  0.9920269879406852


## Сравнение параметров

Буду считать 3 метрики: accuracy, precision, CO (classified objects).

In [15]:
def clear_objects(pred, test):
    y_pred = []
    y_test = []
    classified = 0
    
    for i in range(len(pred)):
        if (pred[i] != 'unknown'):
            y_pred.append(int(pred[i]))
            y_test.append(test[i])
            classified += 1
    
    y_pred = np.array(y_pred)
    y_test = np.array(y_test)
    
    return y_pred, y_test, float(classified / len(pred))

### Алгоритм 1

In [341]:
def compare_algo_1(cur_threshold, cur_num_misses):
    total_acc = 0
    total_precision = 0

    for i in range(1, 8):
        cur_positive, cur_negative = make_sample(i)
        cur_x_test, cur_y_test = make_tests(i)
        cur_y_pred = algorithm_1(cur_positive, cur_negative, cur_x_test, threshold=cur_threshold, num_misses=cur_num_misses)
        clear_y_pred, clear_y_test, classified = clear_objects(cur_y_pred, cur_y_test)
        cur_acc = accuracy_score(clear_y_test, clear_y_pred)
        cur_precision = precision_score(clear_y_test, clear_y_pred)
    
        total_acc += cur_acc
        total_precision += cur_precision
    
    print('threshold =', cur_threshold, 'num_misses =', cur_num_misses, ' :')
    print('total accuracy = ', total_acc / 7)
    print('total precision = ', total_precision / 7)
    print('classified objects = ', classified)
    print('-------------')

In [345]:
thresholds = [0.5, 0.55, 0.6, 0.7]
num_missess = [0, 1, 3, 5]

for tr in thresholds:
    for mis in num_missess:
        compare_algo_1(tr, mis)

threshold = 0.5 num_misses = 0  :
total accuracy =  0.994209256725303
total precision =  0.9881640268964212
classified objects =  1.0
-------------
threshold = 0.5 num_misses = 1  :
total accuracy =  0.8049838479876991
total precision =  0.6072851087322985
classified objects =  1.0
-------------
threshold = 0.5 num_misses = 3  :
total accuracy =  0.8049838479876991
total precision =  0.6072851087322985
classified objects =  1.0
-------------
threshold = 0.5 num_misses = 5  :
total accuracy =  0.8049838479876991
total precision =  0.6072851087322985
classified objects =  1.0
-------------
threshold = 0.55 num_misses = 0  :
total accuracy =  0.9941879234425374
total precision =  0.9881640268964212
classified objects =  0.991869918699187
-------------
threshold = 0.55 num_misses = 1  :
total accuracy =  0.9395540123477686
total precision =  0.8946447198294332
classified objects =  0.5447154471544715
-------------
threshold = 0.55 num_misses = 3  :
total accuracy =  0.9395540123477686
tota

### Алгоритм 2

In [346]:
def compare_algo_2(cur_threshold, cur_num_misses, cur_supp_part):
    total_acc = 0
    total_precision = 0

    for i in range(1, 8):
        cur_positive, cur_negative = make_sample(i)
        cur_x_test, cur_y_test = make_tests(i)
        cur_y_pred = algorithm_2(cur_positive, cur_negative, cur_x_test, threshold=cur_threshold, num_misses=cur_num_misses, min_supp_part = cur_supp_part)
        clear_y_pred, clear_y_test, classified = clear_objects(cur_y_pred, cur_y_test)
        cur_acc = accuracy_score(clear_y_test, clear_y_pred)
        cur_precision = precision_score(clear_y_test, clear_y_pred)
    
        total_acc += cur_acc
        total_precision += cur_precision
    
    print('threshold =', cur_threshold, 'num_misses =', cur_num_misses, 'min_supp_part =', cur_supp_part, ' :')
    print('total accuracy = ', total_acc / 7)
    print('total precision = ', total_precision / 7)
    print('classified objects = ', classified)
    print('-------------')

In [347]:
thresholds = [0.5, 0.55, 0.6]
num_missess = [0, 1, 3]
supports = [0, 0.001, 0.005]

for tr in thresholds:
    for mis in num_missess:
        for supp in supports:
            compare_algo_2(tr, mis, supp)

threshold = 0.5 num_misses = 0 min_supp_part = 0  :
total accuracy =  0.994209256725303
total precision =  0.9881640268964212
classified objects =  1.0
-------------
threshold = 0.5 num_misses = 0 min_supp_part = 0.001  :
total accuracy =  0.9953659947241462
total precision =  0.9881640268964212
classified objects =  0.9959349593495935
-------------
threshold = 0.5 num_misses = 0 min_supp_part = 0.005  :
total accuracy =  0.994093266383567
total precision =  0.9839202369994509
classified objects =  0.9349593495934959
-------------
threshold = 0.5 num_misses = 1 min_supp_part = 0  :
total accuracy =  0.9832155435493046
total precision =  0.9528913310870407
classified objects =  1.0
-------------
threshold = 0.5 num_misses = 1 min_supp_part = 0.001  :
total accuracy =  0.9832178946427982
total precision =  0.9513765612061234
classified objects =  1.0
-------------
threshold = 0.5 num_misses = 1 min_supp_part = 0.005  :
total accuracy =  0.9865452349336573
total precision =  0.96952026047

### Алгоритм 3

In [348]:
def compare_algo_3(cur_threshold, cur_num_misses, cur_supp_part):
    total_acc = 0
    total_precision = 0

    for i in range(1, 8):
        cur_positive, cur_negative = make_sample(i)
        cur_x_test, cur_y_test = make_tests(i)
        cur_y_pred = algorithm_3(cur_positive, cur_negative, cur_x_test, threshold=cur_threshold, num_misses=cur_num_misses, min_supp_part = cur_supp_part)
        clear_y_pred, clear_y_test, classified = clear_objects(cur_y_pred, cur_y_test)
        cur_acc = accuracy_score(clear_y_test, clear_y_pred)
        cur_precision = precision_score(clear_y_test, clear_y_pred)
    
        total_acc += cur_acc
        total_precision += cur_precision
    
    print('threshold =', cur_threshold, 'num_misses =', cur_num_misses, 'min_supp_part =', cur_supp_part, ' :')
    print('total accuracy = ', total_acc / 7)
    print('total precision = ', total_precision / 7)
    print('classified objects = ', classified)
    print('-------------')

In [349]:
thresholds = [0.5, 0.55, 0.6]
num_missess = [0, 1, 3]
supports = [0, 0.001, 0.005]

for tr in thresholds:
    for mis in num_missess:
        for supp in supports:
            compare_algo_3(tr, mis, supp)

threshold = 0.5 num_misses = 0 min_supp_part = 0  :
total accuracy =  0.9947876257247247
total precision =  0.9901186115471831
classified objects =  1.0
-------------
threshold = 0.5 num_misses = 0 min_supp_part = 0.001  :
total accuracy =  0.9959443637235678
total precision =  0.9901186115471831
classified objects =  0.9959349593495935
-------------
threshold = 0.5 num_misses = 0 min_supp_part = 0.005  :
total accuracy =  0.9964674460794195
total precision =  0.9919034419034419
classified objects =  0.9349593495934959
-------------
threshold = 0.5 num_misses = 1 min_supp_part = 0  :
total accuracy =  0.984372281548148
total precision =  0.9647001439840753
classified objects =  1.0
-------------
threshold = 0.5 num_misses = 1 min_supp_part = 0.001  :
total accuracy =  0.9832155435493046
total precision =  0.9611293175832308
classified objects =  1.0
-------------
threshold = 0.5 num_misses = 1 min_supp_part = 0.005  :
total accuracy =  0.9853742941748784
total precision =  0.9714016100

## Способ 2

<br>

Теперь рассматриваю не бинарную классификацию, а смотрю на все 4 класса. Однако и нецелевые признаки рассматриваю иначе: теперь каждый из них представляется в следующем виде, описанном ниже.

Логика такая: $-high$ означает, что значение признака меньше или равно $high$, а $+high$ означает, что значение больше или равно $high$.

Так, для каждого из изначальных значений признаков сопоставлю 2 новых (cначала везде делала 3, но это слишком много):

$low = \{-low, -med\}$ 

$med = \{-med, +med\}$ 

$high = \{+med, +high \}$ 

$vhigh = \{+high, +vhigh \}$ 

Это для 1 и 2 признаков.

<br>

Для 3-го аналогично (но уже 2 новых):

$2 = \{-2, -3\}$

$3 = \{-3, -4\}$

$4 = \{-4, -5more\}$

$5more = \{-5more +5more\}$

<br>

Для 4-го:

$2 = \{-2, -4\}$

$4 = \{-4, -more\}$

$more = \{-more +more\}$

<br>

Для 5-го (симметрично):

$small = \{-small, -med\}$

$med = \{-med, +med\}$

$big = \{+med, +big\}$

<br>

Для 6-го:

$low = \{-low, -med\}$

$med = \{-med, +med\}$

$high = \{+med, +high\}$

In [8]:
def change_attr_complex(arr):
    res_set = set()
    
    # buying
    if (arr[0] == 'low'):
        res_set |= {'1:-low', '1:-med'}
    else:
        if (arr[0] == 'med'):
            res_set |= {'1:-med', '1:+med'}
        else:
            if (arr[0] == 'high'):
                res_set |= {'1:+med', '1:+high'}
            else:
                res_set |= {'1:+high', '1:+vhigh'}
            
    # maint
    if (arr[1] == 'low'):
        res_set |= {'2:-low', '2:-med'}
    else:
        if (arr[1] == 'med'):
            res_set |= {'2:-med', '2:+med'}
        else:
            if (arr[1] == 'high'):
                res_set |= {'2:+med', '2:+high'}
            else:
                res_set |= {'2:+high', '2:+vhigh'}
    
    # doors
    if (arr[2] == '2'):
        res_set |= {'3:-2', '3:-3'}
    else:
        if (arr[2] == '3'):
            res_set |= {'3:-3', '3:-4'}
        else:
            if (arr[2] == '4'):
                res_set |= {'3:-4', '3:-5more'}
            else:
                res_set |= {'3:-5more', '3:+5more'}
    
    # persons
    if (arr[3] == '2'):
        res_set |= {'4:-2', '4:-4'}
    else:
        if (arr[3] == '4'):
            res_set |= {'4:-4', '4:-more'}
        else:
            res_set |= {'4:-more', '4:+more'}
    
    # lug_boot
    if (arr[4] == 'small'):
        res_set |= {'5:-small', '5:-med'}
    else:
        if (arr[4] == 'med'):
            res_set |= {'5:-med', '5:+med'}
        else:
            res_set |= {'5:+med', '5:+big'}  
            
    # safety
    if (arr[5] == 'low'):
        res_set |= {'6:-low', '6:-med'}
    else:
        if (arr[5] == 'med'):
            res_set |= {'6:-med', '6:+med'}
        else:
            res_set |= {'6:+med', '6:+high'}  
                    
    return res_set
    

# Преобразовываем признаки
def make_complex_sample(cur_ind):
    cur_name = "car_train_" + str(cur_ind) + ".csv"
    cur_data = pd.read_csv(cur_name, header=None, names = column_names)
    cur_arr = np.array(cur_data)
    
    unacc_data = []
    acc_data = []
    good_data = []
    vgood_data = []
    
    for elem in cur_arr:
        if (elem[-1] == "unacc"):
            unacc_data.append(change_attr_complex(elem))
        else:
            if (elem[-1] == "acc"):
                acc_data.append(change_attr_complex(elem))
            else:
                if (elem[-1] == "good"):
                    good_data.append(change_attr_complex(elem))
                else:
                    vgood_data.append(change_attr_complex(elem))
            
    unacc_data = np.array(unacc_data)
    acc_data = np.array(acc_data)
    good_data = np.array(good_data)
    vgood_data = np.array(vgood_data)
        
    return unacc_data, acc_data, good_data, vgood_data

In [9]:
def make_tests_complex(cur_ind):
    cur_name = "car_test_" + str(cur_ind) + ".csv"
    cur_data = pd.read_csv(cur_name, header=None, names = column_names)
    cur_arr = np.array(cur_data)
    
    X_test_0 = np.array(cur_arr[:, :-1])
    y_test_0 = np.array(cur_arr[:, -1])
    y_test = []
    X_test = []
    
    for elem in y_test_0:
        if (elem == "unacc"):
            y_test.append(0)
        else:
            if (elem == "acc"):
                y_test.append(1)
            else:
                if (elem == "good"):
                    y_test.append(2)
                else:
                    y_test.append(3)
            
    for elem in X_test_0:
        X_test.append(change_attr_complex(elem))
    
    y_test = np.array(y_test)
    X_test = np.array(X_test)
    
    return X_test, y_test

### Алгоритм 4

In [351]:
def algorithm_4(unacc_d, acc_d, good_d, vgood_d, x_test, threshold = 0, num_misses = 0):
    y_pred = []
    
    for i in range(len(x_test)):
        elem = np.array(x_test[i])
        
        voices_0 = 0
        voices_1 = 0
        voices_2 = 0
        voices_3 = 0
        
        for elem_0 in unacc_d:
            elem_0 = np.array(elem_0)
            cur_intersec = elem_0 & elem 
            
            misses = 0
                        
            for other_elem in acc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in good_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in vgood_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                voices_0 += float(len(cur_intersec) / len(unacc_d[0]))
        
        for elem_1 in acc_d:
            elem_1 = np.array(elem_1)
            cur_intersec = elem_1 & elem 
            
            misses = 0
                        
            for other_elem in unacc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in good_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in vgood_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                voices_1 += float(len(cur_intersec) / len(acc_d[0]))
        
        for elem_2 in good_d:
            elem_2 = np.array(elem_2)
            cur_intersec = elem_2 & elem 
            
            misses = 0
                        
            for other_elem in unacc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in acc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in vgood_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                voices_2 += float(len(cur_intersec) / len(good_d[0]))
        
        for elem_3 in vgood_d:
            elem_3 = np.array(elem_3)
            cur_intersec = elem_3 & elem 
            
            misses = 0
                        
            for other_elem in unacc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in acc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in good_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                voices_3 += float(len(cur_intersec) / len(vgood_d[0]))
                
        
        voices_0 /= len(unacc_d)
        voices_1 /= len(acc_d)
        voices_2 /= len(good_d)
        voices_3 /= len(vgood_d)
        
        voices = [voices_0, voices_1, voices_2, voices_3]
        
        res_voice = np.argmax(voices)
        w_sum = np.sum(voices)
        
        if (voices[res_voice] / w_sum >= threshold):
            y_pred.append(res_voice)
        else:
            y_pred.append('unknown')
        
    y_pred = np.array(y_pred)
    return y_pred

In [320]:
total_acc = 0

for i in range(1, 8):
    unacc, acc, good, vgood = make_complex_sample(i)
    cur_x_test, cur_y_test = make_tests_complex(i)
    cur_y_pred = algorithm_4(unacc, acc, good, vgood, cur_x_test, threshold=0, num_misses=0)
    cur_acc = accuracy_score(cur_y_test, cur_y_pred)
    print(i, " :  accuracy = ", cur_acc)
    
    total_acc += cur_acc
    
print('-------------')
print('total accuracy = ', total_acc / 7)

1  :  accuracy =  0.9838056680161943
2  :  accuracy =  0.9473684210526315
3  :  accuracy =  0.9878542510121457
4  :  accuracy =  0.9919028340080972
5  :  accuracy =  0.9757085020242915
6  :  accuracy =  0.979757085020243
7  :  accuracy =  0.967479674796748
-------------
total accuracy =  0.9762680622757643


### Алгоритм 5

In [352]:
def algorithm_5(unacc_d, acc_d, good_d, vgood_d, x_test, threshold = 0, num_misses = 0, min_supp = 0):
    y_pred = []
    
    for i in range(len(x_test)):
        elem = np.array(x_test[i])
        
        voices_0 = 0
        voices_1 = 0
        voices_2 = 0
        voices_3 = 0
        
        for elem_0 in unacc_d:
            elem_0 = np.array(elem_0)
            cur_intersec = elem_0 & elem 
            
            misses = 0
                        
            for other_elem in acc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in good_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in vgood_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                if (count_supp(cur_intersec, unacc_d) > min_supp):
                    voices_0 += float(len(cur_intersec) / len(unacc_d[0]))
        
        for elem_1 in acc_d:
            elem_1 = np.array(elem_1)
            cur_intersec = elem_1 & elem 
            
            misses = 0
                        
            for other_elem in unacc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in good_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in vgood_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                if (count_supp(cur_intersec, acc_d) > min_supp):
                    voices_1 += float(len(cur_intersec) / len(acc_d[0]))
        
        for elem_2 in good_d:
            elem_2 = np.array(elem_2)
            cur_intersec = elem_2 & elem 
            
            misses = 0
                        
            for other_elem in unacc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in acc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in vgood_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                if (count_supp(cur_intersec, good_d) > min_supp):
                    voices_2 += float(len(cur_intersec) / len(good_d[0]))
        
        for elem_3 in vgood_d:
            elem_3 = np.array(elem_3)
            cur_intersec = elem_3 & elem 
            
            misses = 0
                        
            for other_elem in unacc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in acc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in good_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                if (count_supp(cur_intersec, vgood_d) > min_supp):
                    voices_3 += float(len(cur_intersec) / len(vgood_d[0]))
                
        
        voices_0 /= len(unacc_d)
        voices_1 /= len(acc_d)
        voices_2 /= len(good_d)
        voices_3 /= len(vgood_d)
        
        voices = [voices_0, voices_1, voices_2, voices_3]
        
        res_voice = np.argmax(voices)
        w_sum = np.sum(voices)
        
        if (voices[res_voice] / w_sum >= threshold):
            y_pred.append(res_voice)
        else:
            y_pred.append('unknown')
        
    y_pred = np.array(y_pred)
    return y_pred

In [353]:
total_acc = 0

for i in range(1, 8):
    unacc, acc, good, vgood = make_complex_sample(i)
    cur_x_test, cur_y_test = make_tests_complex(i)
    cur_y_pred = algorithm_5(unacc, acc, good, vgood, cur_x_test, threshold=0, num_misses=0, min_supp=1)
    cur_acc = accuracy_score(cur_y_test, cur_y_pred)
    print(i, " :  accuracy = ", cur_acc)
    
    total_acc += cur_acc
    
print('-------------')
print('total accuracy = ', total_acc / 7)

1  :  accuracy =  0.979757085020243
2  :  accuracy =  0.979757085020243
3  :  accuracy =  0.9838056680161943
4  :  accuracy =  0.9676113360323887
5  :  accuracy =  0.9595141700404858
6  :  accuracy =  0.9959514170040485
7  :  accuracy =  0.9715447154471545
-------------
total accuracy =  0.9768487823686798


## Сравнение при разных значениях параметров

### Алгоритм 4

In [354]:
def compare_algo_4(cur_threshold, cur_num_misses):
    total_acc = 0

    for i in range(1, 8):
        unacc, acc, good, vgood = make_complex_sample(i)
        cur_x_test, cur_y_test = make_tests_complex(i)
        cur_y_pred = algorithm_4(unacc, acc, good, vgood, cur_x_test, threshold=cur_threshold, num_misses=cur_num_misses)   
        clear_y_pred, clear_y_test, classified = clear_objects(cur_y_pred, cur_y_test)
        cur_acc = accuracy_score(clear_y_test, clear_y_pred)
    
        total_acc += cur_acc
    
    print('threshold =', cur_threshold, 'num_misses =', cur_num_misses, ' :')
    print('total accuracy = ', total_acc / 7)
    print('classified objects = ', classified)
    print('-------------')

In [355]:
thresholds = [0.25, 0.3, 0.4, 0.5]
num_missess = [0, 1, 3]

for tr in thresholds:
    for mis in num_missess:
        compare_algo_4(tr, mis)

threshold = 0.25 num_misses = 0  :
total accuracy =  0.972219479279813
classified objects =  1.0
-------------
threshold = 0.25 num_misses = 1  :
total accuracy =  0.9322861562912912
classified objects =  1.0
-------------
threshold = 0.25 num_misses = 3  :
total accuracy =  0.893511922395106
classified objects =  1.0
-------------
threshold = 0.3 num_misses = 0  :
total accuracy =  0.972219479279813
classified objects =  1.0
-------------
threshold = 0.3 num_misses = 1  :
total accuracy =  0.9322861562912912
classified objects =  1.0
-------------
threshold = 0.3 num_misses = 3  :
total accuracy =  0.893511922395106
classified objects =  1.0
-------------
threshold = 0.4 num_misses = 0  :
total accuracy =  0.9727436195880826
classified objects =  0.9959349593495935
-------------
threshold = 0.4 num_misses = 1  :
total accuracy =  0.9368836354173203
classified objects =  0.991869918699187
-------------
threshold = 0.4 num_misses = 3  :
total accuracy =  0.9053800520961132
classified ob

### Алгоритм 5

In [356]:
def compare_algo_5(cur_threshold, cur_num_misses, cur_min_supp):
    total_acc = 0

    for i in range(1, 8):
        unacc, acc, good, vgood = make_complex_sample(i)
        cur_x_test, cur_y_test = make_tests_complex(i)
        cur_y_pred = algorithm_5(unacc, acc, good, vgood, cur_x_test, threshold=cur_threshold, num_misses=cur_num_misses, min_supp = cur_min_supp)
        clear_y_pred, clear_y_test, classified = clear_objects(cur_y_pred, cur_y_test)
        cur_acc = accuracy_score(clear_y_test, clear_y_pred)
        
        total_acc += cur_acc
    
    print('threshold =', cur_threshold, 'num_misses =', cur_num_misses, 'min_supp =', cur_min_supp, ' :')
    print('total accuracy = ', total_acc / 7)
    print('classified objects = ', classified)
    print('-------------')

In [357]:
thresholds = [0.25, 0.4, 0.5]
num_missess = [0, 1, 3]
supps = [0, 1, 3]

for tr in thresholds:
    for mis in num_missess:
        for supp in supps:
            compare_algo_5(tr, mis, supp)

threshold = 0.25 num_misses = 0 min_supp = 0  :
total accuracy =  0.972219479279813
classified objects =  1.0
-------------
threshold = 0.25 num_misses = 0 min_supp = 1  :
total accuracy =  0.9768487823686798
classified objects =  1.0
-------------
threshold = 0.25 num_misses = 0 min_supp = 3  :
total accuracy =  0.9785862404604382
classified objects =  1.0
-------------
threshold = 0.25 num_misses = 1 min_supp = 0  :
total accuracy =  0.9322861562912912
classified objects =  1.0
-------------
threshold = 0.25 num_misses = 1 min_supp = 1  :
total accuracy =  0.9317101383853629
classified objects =  1.0
-------------
threshold = 0.25 num_misses = 1 min_supp = 3  :
total accuracy =  0.9473284524632406
classified objects =  1.0
-------------
threshold = 0.25 num_misses = 3 min_supp = 0  :
total accuracy =  0.893511922395106
classified objects =  1.0
-------------
threshold = 0.25 num_misses = 3 min_supp = 1  :
total accuracy =  0.8958277494862862
classified objects =  1.0
-------------
th

### Итоговый алгоритм бинарной классификации

Это алгоритм № 2.

Для данного алгоритма посчитаю все остальные метрики.

In [359]:
from sklearn.metrics import confusion_matrix

In [364]:
total_acc = 0
total_precision = 0
total_recall = 0
total_f1 = 0
total_tpr = 0
total_fpr = 0
classified_total = 0

for i in range(1, 8):
    cur_positive, cur_negative = make_sample(i)
    cur_x_test, cur_y_test = make_tests(i)
    cur_y_pred = algorithm_2(cur_positive, cur_negative, cur_x_test, threshold=0.53, num_misses=0, min_supp_part = 0.0012)
    cur_y_pred, cur_y_test, classified = clear_objects(cur_y_pred, cur_y_test)
    cur_acc = accuracy_score(cur_y_test, cur_y_pred)
    
    cur_prec = precision_score(cur_y_test, cur_y_pred)
    cur_rec = recall_score(cur_y_test, cur_y_pred)
    cur_f1 = f1_score(cur_y_test, cur_y_pred)
    tn, fp, fn, tp = confusion_matrix(cur_y_test, cur_y_pred).ravel()
    
    total_acc += cur_acc
    total_precision += cur_prec
    total_recall += cur_rec
    total_f1 += cur_f1
    total_tpr += (tp / (tp + fn))
    total_fpr += (fp / (fp + tn))
    classified_total += classified
    
print('-------------')
print('total accuracy = ', total_acc / 7)
print('total precision = ', total_precision / 7)
print('total recall = ', total_recall / 7)
print('total F1 = ', total_f1 / 7)
print('total TPR = ', total_tpr / 7)
print('total FPR = ', total_fpr / 7)
print('classified = ', classified_total / 7)

-------------
total accuracy =  0.9959372720578872
total precision =  0.9881640268964212
total recall =  0.9978354978354977
total F1 =  0.9928925083507588
total TPR =  0.9978354978354977
total FPR =  0.004872215389742555
classified =  0.996525083816483


### Итоговый алгоритм многоклассовой классификации

Это алгоритм № 5.

In [376]:
total_acc = 0
classified_total = 0

for i in range(1, 8):
    unacc, acc, good, vgood = make_complex_sample(i)
    cur_x_test, cur_y_test = make_tests_complex(i)
    cur_y_pred = algorithm_5(unacc, acc, good, vgood, cur_x_test, threshold=0.55, num_misses=0, min_supp=3)
    clear_y_pred, clear_y_test, classified = clear_objects(cur_y_pred, cur_y_test)
    cur_acc = accuracy_score(clear_y_test, clear_y_pred)
    
    total_acc += cur_acc
    classified_total += classified
    
print('-------------')
print('total accuracy = ', total_acc / 7)
print('classified = ', classified_total / 7)

-------------
total accuracy =  0.9836441825469822
classified =  0.9907343405417859


### Стандартные алгоритмы

**1. Knn**


In [392]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

cars_preproc = car_data.apply(preprocessing.LabelEncoder().fit_transform)
cars_preproc = np.array(cars_preproc)
X_train, X_test, y_train_0, y_test_0 = train_test_split(cars_preproc[:, :-1], cars_preproc[:, -1], test_size=0.2, random_state=42)

In [411]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=8) 
knn_classifier.fit(np.array(X_train), np.array(y_train))
y_pred = knn_classifier.predict(X_test)

knn_acc = accuracy_score(y_test, y_pred)
#knn_precision = precision_score(y_test, y_pred, average='micro')
#knn_recall = recall_score(y_test, y_pred, average='micro')
#knn_f1 = f1_score(y_test, y_pred, average='micro')

print("KNN metrics:")
print('accuracy: ', knn_acc)

KNN metrics:
accuracy:  0.9335260115606936


**2. SVM**

In [412]:
from sklearn.svm import SVC

svm_1 = SVC(gamma = 0.45)
svm_1.fit(X_train, y_train)
y_pred_svm = svm_1.predict(X_test)

svm_acc = accuracy_score(y_test[0:10000], y_pred_svm)

print("SVM metrics:")
print('accuracy: ', svm_acc)

SVM metrics:
accuracy:  0.953757225433526


**3. Decision Tree**

In [413]:
from sklearn.tree import DecisionTreeClassifier

tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_train, y_train)
y_pred_tree = tree_classifier.predict(X_test)

tree_acc = accuracy_score(y_test, y_pred_tree)

print("Decision Tree metrics:")
print('accuracy: ', tree_acc)

Decision Tree metrics:
accuracy:  0.9682080924855492


**4. Random Forest**

In [417]:
from sklearn.ensemble import RandomForestClassifier

forest_classifier = RandomForestClassifier(n_estimators=100)
forest_classifier.fit(X_train, y_train)
y_pred_forest = forest_classifier.predict(X_test)

forest_acc = accuracy_score(y_test, y_pred_forest)
print("Random Forest metrics:")
print('accuracy: ', forest_acc)

Random Forest metrics:
accuracy:  0.9710982658959537


### Дополнение: вероятностное приближение

Следующий алгоритм представляет собой модификацию алгоритма 5. Здесь изменен подсчет поддержки (используется вероятностное приближение): поддержки подсчитываются на случайной подвыборке. 

Параметр prob -- это число (от $0$ до $1$), показывающее на какой части всей выборки мы будем искать поддержки.

In [40]:
def count_supp_prob(intersec, big_set, subsample_size):
    res = 0
    rand_part = np.random.choice(big_set, subsample_size, replace=False)
    for one_elem in rand_part:
        if one_elem.issuperset(intersec):
            res += 1
    return res


def algorithm_6(unacc_d, acc_d, good_d, vgood_d, x_test, threshold = 0, num_misses = 0, min_supp = 0, prob = 1):
    y_pred = []
    subsample_size_0 = int(prob * len(unacc_d))
    subsample_size_1 = int(prob * len(acc_d))
    subsample_size_2 = int(prob * len(good_d))
    subsample_size_3 = int(prob * len(vgood_d))
    
    for i in range(len(x_test)):
        elem = np.array(x_test[i])
              
        voices_0 = 0
        voices_1 = 0
        voices_2 = 0
        voices_3 = 0
        
        for elem_0 in unacc_d:
            elem_0 = np.array(elem_0)
            cur_intersec = elem_0 & elem 
            
            misses = 0
                        
            for other_elem in acc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in good_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in vgood_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            if (misses <= num_misses):
                if (count_supp_prob(cur_intersec, unacc_d, subsample_size_0) > min_supp):
                    voices_0 += float(len(cur_intersec) / len(unacc_d[0]))
        
        for elem_1 in acc_d:
            elem_1 = np.array(elem_1)
            cur_intersec = elem_1 & elem 
            
            misses = 0
                        
            for other_elem in unacc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in good_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in vgood_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                if (count_supp_prob(cur_intersec, acc_d, subsample_size_1) > min_supp):
                    voices_1 += float(len(cur_intersec) / len(acc_d[0]))
        
        for elem_2 in good_d:
            elem_2 = np.array(elem_2)
            cur_intersec = elem_2 & elem 
            
            misses = 0
                        
            for other_elem in unacc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in acc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in vgood_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                if (count_supp_prob(cur_intersec, good_d, subsample_size_2) > min_supp):
                    voices_2 += float(len(cur_intersec) / len(good_d[0]))
        
        for elem_3 in vgood_d:
            elem_3 = np.array(elem_3)
            cur_intersec = elem_3 & elem 
            
            misses = 0
                        
            for other_elem in unacc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
            
            for other_elem in acc_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break
                        
            for other_elem in good_d:
                if (is_elem_superset(cur_intersec, other_elem)):
                    misses += 1
                    if (misses > num_misses):
                        break

            if (misses <= num_misses):
                if (count_supp_prob(cur_intersec, vgood_d, subsample_size_3) > min_supp):
                    voices_3 += float(len(cur_intersec) / len(vgood_d[0]))
                
        
        voices_0 /= len(unacc_d)
        voices_1 /= len(acc_d)
        voices_2 /= len(good_d)
        voices_3 /= len(vgood_d)
        
        voices = [voices_0, voices_1, voices_2, voices_3]
        
        res_voice = np.argmax(voices)
        w_sum = np.sum(voices)
        
        if (voices[res_voice] / w_sum >= threshold):
            y_pred.append(res_voice)
        else:
            y_pred.append('unknown')
        
    y_pred = np.array(y_pred)
    return y_pred

In [41]:
total_acc = 0
classified_total = 0

for i in range(1, 8):
    unacc, acc, good, vgood = make_complex_sample(i)
    cur_x_test, cur_y_test = make_tests_complex(i)
    cur_y_pred = algorithm_6(unacc, acc, good, vgood, cur_x_test, threshold=0.55, num_misses=0, min_supp=3, prob=0.4)
    clear_y_pred, clear_y_test, classified = clear_objects(cur_y_pred, cur_y_test)
    cur_acc = accuracy_score(clear_y_test, clear_y_pred)
    
    total_acc += cur_acc
    classified_total += classified
    
print('-------------')
print('total accuracy = ', total_acc / 7)
print('classified = ', classified_total / 7)

-------------
total accuracy =  0.9840531329939871
classified =  0.9820588055504614


Уточнение параметров модели:

In [48]:
total_acc = 0
classified_total = 0

for i in range(1, 8):
    unacc, acc, good, vgood = make_complex_sample(i)
    cur_x_test, cur_y_test = make_tests_complex(i)
    cur_y_pred = algorithm_6(unacc, acc, good, vgood, cur_x_test, threshold=0.54, num_misses=0, min_supp=3, prob=0.4)
    clear_y_pred, clear_y_test, classified = clear_objects(cur_y_pred, cur_y_test)
    cur_acc = accuracy_score(clear_y_test, clear_y_pred)
    
    total_acc += cur_acc
    classified_total += classified
    
print('-------------')
print('total accuracy = ', total_acc / 7)
print('classified = ', classified_total / 7)

-------------
total accuracy =  0.98056712404861
classified =  0.984948299454076
