In [1]:
# Build the list of attribute names from attrs.txt.
# NOTE(review): presumably every line looks like "<number>. <name>." — confirm
# against the actual file, the slicing below depends on that layout.
attr_names = []
with open("attrs.txt", 'r', encoding='utf8') as attrs_names:
    attr_names.extend(
        # Drop the last two characters of the line, then keep the text between
        # the first and the second period, without surrounding whitespace.
        line[:-2].split('.')[1].strip()
        for line in attrs_names
    )

In [2]:
# Global confusion-matrix counters updated by write_result(); "undefined"
# counts samples for which both classes received the same vote share.
pred_result = dict.fromkeys(
    (
        "true_positive",
        "true_negative",
        "false_positive",
        "false_negative",
        "undefined",
    ),
    0,
)

In [3]:
def name_attrs(instance):
    """Return the set of "<attribute name>: <value>" strings for one record.

    Names are taken from the module-level ``attr_names``; pairing stops at the
    shorter of ``attr_names`` and ``instance``.
    """
    return {name + ': ' + str(value) for name, value in zip(attr_names, instance)}

In [4]:
def write_result(pos_votes, neg_votes, instance):
    """Record the outcome of one prediction in the global ``pred_result``.

    The predicted class is the one with the strictly larger vote value; the
    true class is read from the last element of ``instance`` ("1" or "0").
    Equal vote values are counted as "undefined". Returns None.
    """
    if pos_votes > neg_votes:
        # Predicted positive: correct only when the label is "1".
        outcome = "true_positive" if instance[-1] == "1" else "false_positive"
    elif pos_votes < neg_votes:
        # Predicted negative: correct only when the label is "0".
        outcome = "true_negative" if instance[-1] == "0" else "false_negative"
    else:
        outcome = "undefined"
    pred_result[outcome] += 1

In [5]:
def predict(param, train_pos, train_neg, instance, algorithm="First"):
    """Classify one instance and record the outcome via ``write_result``.

    Parameters:
        param: tuning parameter forwarded to the selected vote counter.
        train_pos, train_neg: training records of the positive / negative class.
        instance: record to classify; its last element is the true label.
        algorithm: "First", "Second" or "Third"; any other value selects the
            pattern-structure counter ``votes_counter_patt``.

    The raw votes of each class are divided by the size of that class so that
    unbalanced classes are comparable. An empty class gets a share of 0.0
    instead of raising ZeroDivisionError.
    """
    # The counter is looked up lazily, branch by branch, so that only the
    # selected counter function has to be defined when predict() is called.
    if algorithm == "First":
        counter = votes_counter_first
    elif algorithm == "Second":
        counter = votes_counter_second
    elif algorithm == "Third":
        counter = votes_counter_third
    else:
        counter = votes_counter_patt
    pos_votes, neg_votes = counter(param, train_pos, train_neg, instance)

    pos_total = len(train_pos)
    neg_total = len(train_neg)
    pos_share = pos_votes / float(pos_total) if pos_total else 0.0
    neg_share = neg_votes / float(neg_total) if neg_total else 0.0
    write_result(pos_share, neg_share, instance)

In [6]:
def votes_counter_first(param, train_pos, train_neg, instance):
    """Count training records that share enough attribute values with ``instance``.

    A record votes for its class when the number of "<name>: <value>" pairs it
    shares with the instance, divided by the number of pairs of the instance,
    is at least ``round(param, 3)``.

    Returns:
        (pos_votes, neg_votes) as integers.
    """
    target = name_attrs(instance)

    def tally(examples):
        # One vote per example whose overlap share reaches the threshold.
        return sum(
            1
            for example in examples
            if len(name_attrs(example).intersection(target)) / float(len(target))
            >= round(param, 3)
        )

    return tally(train_pos), tally(train_neg)

In [7]:
def votes_counter_second(param, train_pos, train_neg, instance):
    """Vote with hypotheses that are rarely contained in the opposite class.

    For every training record the hypothesis is the set of "<name>: <value>"
    pairs it shares with ``instance``. The record votes for its own class when
    the share of opposite-class records containing the whole hypothesis is at
    most ``param``.

    The named sets of all training records are built once up front instead of
    once per pair of records. An empty opposite class yields a share of 0.0
    (nothing can contradict the hypothesis) instead of a ZeroDivisionError.

    Returns:
        (pos_votes, neg_votes) as integers.
    """
    named_instance = name_attrs(instance)
    named_pos = [name_attrs(row) for row in train_pos]
    named_neg = [name_attrs(row) for row in train_neg]

    def count_votes(own, opposite):
        votes = 0
        for example in own:
            hypothesis = named_instance.intersection(example)
            contradicting = sum(1 for other in opposite if hypothesis.issubset(other))
            share = contradicting / float(len(opposite)) if opposite else 0.0
            if share <= param:
                votes += 1
        return votes

    return count_votes(named_pos, named_neg), count_votes(named_neg, named_pos)

In [8]:
def votes_counter_third(param, train_pos, train_neg, instance):
    """Compare the ``param`` largest overlaps of each class with ``instance``.

    For each class the sizes of the intersections between the instance and
    every training record are computed, and the ``param`` largest ones are kept
    in descending order (padded with zeros when the class has fewer records).
    The two lists are compared position by position; the class that is larger
    at the first differing position wins.

    Parameters:
        param: integer number of best overlaps to compare. A value of 0 or
            less yields a tie instead of raising ValueError.

    Returns:
        (1, 0) when the positive class wins, (0, 1) when the negative class
        wins and (0, 0) on a tie.
    """
    named_instance = name_attrs(instance)
    keep = max(param, 0)

    def top_overlaps(examples):
        sizes = sorted(
            (len(named_instance.intersection(name_attrs(row))) for row in examples),
            reverse=True,
        )
        best = sizes[:keep]
        # Pad so that both classes are always compared over ``keep`` slots.
        best.extend([0] * (keep - len(best)))
        return best

    best_positive = top_overlaps(train_pos)
    best_negative = top_overlaps(train_neg)

    # Both lists have the same length, so list comparison is exactly the
    # "first differing position decides" rule.
    if best_positive > best_negative:
        return 1, 0
    if best_negative > best_positive:
        return 0, 1
    return 0, 0

In [9]:
def split_file(file_name, blocks_num):
    """Yield (training, validation) splits for k-fold cross-validation.

    The file is read as ';'-separated text; the first line is treated as a
    header and skipped. Blank lines (for example a trailing empty line) are
    ignored so that they do not turn into bogus one-field records.

    Record ``i`` belongs to the validation part of fold ``i % blocks_num``.

    Parameters:
        file_name: path of the data file.
        blocks_num: number of folds to generate.

    Yields:
        ``blocks_num`` tuples ``(training, validation)`` of lists of records,
        each record being a list of strings.
    """
    with open(file_name, "r") as f:
        lines = f.readlines()
    data = [line.strip().split(";") for line in lines[1:] if line.strip()]

    for k in range(blocks_num):
        training = [row for i, row in enumerate(data) if i % blocks_num != k]
        validation = [row for i, row in enumerate(data) if i % blocks_num == k]
        yield training, validation

In [10]:
def cross_valid_check(param, file_name, blocks_num=10, algorithm="First"):
    """Run k-fold cross-validation for one parameter value and print a report.

    For every fold produced by ``split_file`` the training records are split
    by their last field ("1" / "0"), every validation record is classified
    with ``predict`` and the quality metrics of the fold are computed from the
    global ``pred_result`` counters. The metrics are averaged over the folds
    and printed.

    Parameters:
        param: parameter forwarded to ``predict``.
        file_name: data file forwarded to ``split_file``.
        blocks_num: number of folds; must be positive.
        algorithm: algorithm name forwarded to ``predict``.

    Returns:
        The accuracy averaged over the folds.
    """
    def reset_counters():
        for key in pred_result:
            pred_result[key] = 0

    acc = prec = FDR = FPR = NPV = TPR = TNR = rec = F1_score = 0
    for train, to_predict in split_file(file_name, blocks_num):
        train_pos = [row for row in train if row[-1] == "1"]
        train_neg = [row for row in train if row[-1] == "0"]

        # Start every fold from zero so that counts left behind by earlier
        # predict() calls cannot leak into the metrics of this fold.
        reset_counters()
        for sample in to_predict:
            predict(param, train_pos, train_neg, sample, algorithm)

        tp = pred_result["true_positive"]
        tn = pred_result["true_negative"]
        fp = pred_result["false_positive"]
        fn = pred_result["false_negative"]
        # "undefined" predictions lower the accuracy: it is measured against
        # the full size of the validation part.
        acc += my_division(tp + tn, len(to_predict))
        prec += my_division(tp, tp + fp)
        FDR += my_division(fp, fp + tp)
        FPR += my_division(fp, tn + fp)
        NPV += my_division(tn, tn + fn)
        TPR += my_division(tp, tp + fn)
        TNR += my_division(tn, fp + tn)
        rec += my_division(tp, tp + fn)
        F1_score += my_division(2 * tp, 2 * tp + fp + fn)
        reset_counters()

    folds = float(blocks_num)
    print("True Positive Rate: {}\nTrue Negative Rate: {}\n\
Negative Predictive Value: {}\nFalse Positive Rate: {}\nFalse Discovery Rate: {}\nAccuracy: {}\n\
Recall: {}\nPrecision: {}\nF1 score: {}".format(TPR / folds, TNR / folds, NPV / folds,
                                  FPR / folds, FDR / folds, acc / folds,
                                  rec / folds, prec / folds, F1_score / folds))
    print("Parametr: {}".format(param))
    print("//--------------------------------------")
    return acc / folds

In [11]:
def cross_valid_executor(file_name, blocks_num=10, algorithm="First"):
    """Try ten parameter values and return the best cross-validated accuracy.

    The parameter grid depends on the algorithm:
        "First":  0.18, 0.16, ..., 0.0
        "Second": 0.45, 0.40, ..., 0.0
        otherwise: 2, 4, ..., 20

    Each grid value is computed directly from the step index and rounded, so
    it does not pick up the floating-point error that repeated subtraction
    would accumulate (e.g. 0.18000000000000002).

    Returns:
        (best_accuracy, param) for the first grid value reaching the best
        accuracy. ``param`` stays at 0.1 when no run scores above 0.
    """
    param = 0.1
    best_accuracy = 0
    for i in range(10):
        step = i + 1
        if algorithm == "Second":
            coeff = round(0.5 - 0.05 * step, 10)
        elif algorithm == 'First':
            coeff = round(0.2 - 0.02 * step, 10)
        else:
            coeff = 2 * step
        accuracy = cross_valid_check(coeff, file_name, blocks_num=blocks_num, algorithm=algorithm)
        if accuracy > best_accuracy:
            param = coeff
            best_accuracy = accuracy
    return best_accuracy, param

In [12]:
def my_division(x, y):
    """Return ``x / y`` as a float, or 0 when ``y`` equals zero."""
    return 0 if y == 0 else x / float(y)

In [13]:
def get_result(input_length):
    """Print the global ``pred_result`` counters and the metrics derived from them.

    Parameters:
        input_length: number of classified samples, used as the denominator
            of the accuracy.

    The accuracy line is labelled "Accuracy" (it used to be printed as
    "AccuracyPrecision"). Returns None.
    """
    print(pred_result)
    tp = pred_result["true_positive"]
    tn = pred_result["true_negative"]
    fp = pred_result["false_positive"]
    fn = pred_result["false_negative"]

    acc = my_division(tp + tn, input_length)
    prec = my_division(tp, tp + fp)
    FDR = my_division(fp, fp + tp)
    FPR = my_division(fp, tn + fp)
    NPV = my_division(tn, tn + fn)
    TPR = my_division(tp, tp + fn)
    TNR = my_division(tn, fp + tn)
    rec = my_division(tp, tp + fn)
    print("True Positive Rate: {}\nTrue Negative Rate: {}\n\
Negative Predictive Value: {}\nFalse Positive Rate: {}\nFalse Discovery Rate: {}\nAccuracy: {}\n\
Recall: {}\nPrecision: {}".format(TPR, TNR, NPV, FPR, FDR, acc, rec, prec))

In [14]:
# Search the parameter grid of the "First" voting algorithm with 3-fold
# cross-validation on divorce.csv and print the best accuracy together with
# the parameter value that reached it.
data_name = "divorce.csv"
best_accuracy, param = cross_valid_executor(data_name, blocks_num=3, algorithm='First')
print("Accuracy = ", best_accuracy)
print("Param = ", param)

True Positive Rate: 0.9523809523809524
True Negative Rate: 1.0
Negative Predictive Value: 0.9558892596712396
False Positive Rate: 0.0
False Discovery Rate: 0.0
AccuracyPrecision: 0.9765037593984962
Recall: 0.9523809523809524
Precision: 1.0
F1 score: 0.9755331088664422
Parametr: 0.18000000000000002
//--------------------------------------
True Positive Rate: 0.9523809523809524
True Negative Rate: 1.0
Negative Predictive Value: 0.9558892596712396
False Positive Rate: 0.0
False Discovery Rate: 0.0
AccuracyPrecision: 0.9765037593984962
Recall: 0.9523809523809524
Precision: 1.0
F1 score: 0.9755331088664422
Parametr: 0.16000000000000003
//--------------------------------------
True Positive Rate: 0.9523809523809524
True Negative Rate: 1.0
Negative Predictive Value: 0.9558892596712396
False Positive Rate: 0.0
False Discovery Rate: 0.0
AccuracyPrecision: 0.9765037593984962
Recall: 0.9523809523809524
Precision: 1.0
F1 score: 0.9755331088664422
Parametr: 0.14000000000000004
//-------------------

In [15]:
# Same search as for the "First" algorithm, now for the "Second" voting
# algorithm (3-fold cross-validation on divorce.csv).
data_name = "divorce.csv"
best_accuracy, param = cross_valid_executor(data_name, blocks_num=3, algorithm='Second')
print("Accuracy = ", best_accuracy)
print("Param = ", param)

True Positive Rate: 0.7538992408557625
True Negative Rate: 1.0
Negative Predictive Value: 0.8196649029982362
False Positive Rate: 0.0
False Discovery Rate: 0.0
AccuracyPrecision: 0.6999791144527986
Recall: 0.7538992408557625
Precision: 1.0
F1 score: 0.8576998050682262
Parametr: 0.45
//--------------------------------------
True Positive Rate: 0.7538992408557625
True Negative Rate: 1.0
Negative Predictive Value: 0.8214285714285715
False Positive Rate: 0.0
False Discovery Rate: 0.0
AccuracyPrecision: 0.705827067669173
Recall: 0.7538992408557625
Precision: 1.0
F1 score: 0.8576998050682262
Parametr: 0.4
//--------------------------------------
True Positive Rate: 0.8826190476190477
True Negative Rate: 1.0
Negative Predictive Value: 0.902010582010582
False Positive Rate: 0.0
False Discovery Rate: 0.0
AccuracyPrecision: 0.7885338345864662
Recall: 0.8826190476190477
Precision: 1.0
F1 score: 0.9365434863755132
Parametr: 0.35000000000000003
//--------------------------------------
True Positive

In [16]:
# Same search for the "Third" algorithm; here the tried parameter values are
# the integers 2, 4, ..., 20 (see cross_valid_executor).
data_name = "divorce.csv"
best_accuracy, param = cross_valid_executor(data_name, blocks_num=3, algorithm='Third')
print("Accuracy = ", best_accuracy)
print("Param = ", param)

True Positive Rate: 0.9523809523809524
True Negative Rate: 1.0
Negative Predictive Value: 0.9558892596712396
False Positive Rate: 0.0
False Discovery Rate: 0.0
AccuracyPrecision: 0.9765037593984962
Recall: 0.9523809523809524
Precision: 1.0
F1 score: 0.9755331088664422
Parametr: 2
//--------------------------------------
True Positive Rate: 0.9523809523809524
True Negative Rate: 1.0
Negative Predictive Value: 0.9558892596712396
False Positive Rate: 0.0
False Discovery Rate: 0.0
AccuracyPrecision: 0.9765037593984962
Recall: 0.9523809523809524
Precision: 1.0
F1 score: 0.9755331088664422
Parametr: 4
//--------------------------------------
True Positive Rate: 0.9523809523809524
True Negative Rate: 1.0
Negative Predictive Value: 0.9558892596712396
False Positive Rate: 0.0
False Discovery Rate: 0.0
AccuracyPrecision: 0.9765037593984962
Recall: 0.9523809523809524
Precision: 1.0
F1 score: 0.9755331088664422
Parametr: 6
//--------------------------------------
True Positive Rate: 0.952380952380

## Узорные структуры

In [19]:
def make_intervals(a):
    """Turn every value of ``a`` into the degenerate interval ``(value, value)``."""
    return [(value, value) for value in a]

In [20]:
def similarity(a, b):
    """Return, position by position, the smallest interval covering both inputs.

    Only the first ``len(a)`` positions of ``b`` are used, so ``b`` may carry
    extra trailing elements.
    """
    return [
        (min(left[0], b[idx][0]), max(left[1], b[idx][1]))
        for idx, left in enumerate(a)
    ]

In [21]:
# Here `a` is an element of the training set and `b` is the similarity of the
# new object with some training object.
def subsumption(a, b):  # a >= b (b is subsumed by a)
    """Return True when every interval of ``a`` lies inside the matching interval of ``b``.

    Only the first ``len(a)`` positions are compared.
    """
    return all(
        not (a[i][0] < b[i][0] or a[i][1] > b[i][1])
        for i in range(len(a))
    )

In [22]:
# Here `a` is a similarity (a list of intervals).
def prime_for_attr(a, train_pos, train_neg):
    """Count the training objects of each class that pass ``subsumption(obj, a)``.

    Returns:
        A dict with the keys 'Positive objects' and 'Negative objects'.
    """
    def support(objects):
        return sum(1 for obj in objects if subsumption(obj, a))

    return {
        'Positive objects': support(train_pos),
        'Negative objects': support(train_neg),
    }

In [23]:
def dist(d, attr_count=54, value_range=4):
    """Return the total width of the intervals in ``d``, normalised to [0, 1].

    Parameters:
        d: list of ``(low, high)`` intervals.
        attr_count: number of attributes used for normalisation (default 54).
        value_range: largest possible width of a single interval (default 4).

    The defaults reproduce the previously hard-coded ``54 * 4`` divisor.
    NOTE(review): presumably 54 attributes with values 0..4 — confirm against
    the data set.
    """
    total_width = sum(high_low[1] - high_low[0] for high_low in d)
    return total_width / (attr_count * value_range)

In [24]:
def cross_valid_executor_patt(file_name, blocks_num=10):
    """Try ten distance thresholds for the pattern-structure classifier.

    The thresholds 0.65, 0.60, ..., 0.20 are computed directly from the step
    index and rounded, so they do not pick up the floating-point error that
    repeated subtraction would accumulate (e.g. 0.6499999999999999).

    Returns:
        (best_accuracy, param) for the first threshold reaching the best
        accuracy. ``param`` stays at 0.1 when no run scores above 0.
    """
    param = 0.1
    best_accuracy = 0
    for i in range(10):
        coeff = round(0.7 - 0.05 * (i + 1), 10)
        accuracy = cross_valid_check_patt(coeff, file_name, blocks_num)
        if accuracy > best_accuracy:
            param = coeff
            best_accuracy = accuracy
    return best_accuracy, param

In [25]:
def votes_counter_patt(param, train_pos, train_neg, new_record):
    """Count the votes of the interval pattern-structure classifier.

    For every training object the similarity with ``new_record`` is built. The
    object votes for its own class when the similarity is narrower than
    ``param`` (according to ``dist``) and no object of the opposite class
    falls inside it (according to ``prime_for_attr``).

    The cheap width check runs first, so the scan over the whole training set
    done by ``prime_for_attr`` is skipped for similarities that are too wide.

    Returns:
        (pos_votes, neg_votes) as integers.
    """
    def count_votes(own, opposite_key):
        votes = 0
        for example in own:
            d = similarity(example, new_record)
            if not dist(d) < param:
                continue
            if prime_for_attr(d, train_pos, train_neg)[opposite_key] == 0:
                votes += 1
        return votes

    pos_votes = count_votes(train_pos, 'Negative objects')
    neg_votes = count_votes(train_neg, 'Positive objects')
    return pos_votes, neg_votes

In [26]:
def cross_valid_check_patt(param, data_name, blocks_num):
    """Cross-validate the pattern-structure classifier for one threshold.

    Every record is converted to integers and then to degenerate intervals via
    ``make_intervals``; validation records keep their label as a trailing
    string. Each validation record is classified with ``predict`` (algorithm
    'Pattern Structure'), the per-fold metrics are derived from the global
    ``pred_result`` counters, averaged over the folds and printed.

    Returns:
        The accuracy averaged over the folds.
    """
    metric_order = ("TPR", "TNR", "NPV", "FPR", "FDR", "acc", "rec", "prec", "F1")
    totals = dict.fromkeys(metric_order, 0)

    folds = split_file(data_name, blocks_num)
    for _ in range(blocks_num):
        train, held_out = next(folds)

        # Training records: drop the label, keep one list per class.
        train_pos = []
        train_neg = []
        for row in train:
            if row[-1] == "1":
                train_pos.append(make_intervals([int(v) for v in row[:-1]]))
            elif row[-1] == "0":
                train_neg.append(make_intervals([int(v) for v in row[:-1]]))

        # Validation records: intervals followed by the label as a string.
        samples = []
        for row in held_out:
            numbers = [int(v) for v in row]
            sample = make_intervals(numbers[:-1])
            sample.append(str(numbers[-1]))
            samples.append(sample)

        for sample in samples:
            predict(param, train_pos, train_neg, sample, 'Pattern Structure')

        tp = pred_result["true_positive"]
        tn = pred_result["true_negative"]
        fp = pred_result["false_positive"]
        fn = pred_result["false_negative"]
        totals["acc"] += my_division(tp + tn, len(samples))
        totals["prec"] += my_division(tp, tp + fp)
        totals["FDR"] += my_division(fp, fp + tp)
        totals["FPR"] += my_division(fp, tn + fp)
        totals["NPV"] += my_division(tn, tn + fn)
        totals["TPR"] += my_division(tp, tp + fn)
        totals["TNR"] += my_division(tn, fp + tn)
        totals["rec"] += my_division(tp, tp + fn)
        totals["F1"] += my_division(2 * tp, 2 * tp + fp + fn)

        # Clear the shared counters for the next fold.
        for key in pred_result:
            pred_result[key] = 0

    fold_count = float(blocks_num)
    report = (
        "True Positive Rate: {}\nTrue Negative Rate: {}\n"
        "Negative Predictive Value: {}\nFalse Positive Rate: {}\nFalse Discovery Rate: {}\nAccuracy: {}\n"
        "Recall: {}\nPrecision: {}\nF1 score: {}"
    )
    print(report.format(*(totals[name] / fold_count for name in metric_order)))
    print("Parametr: {}".format(param))
    print("//--------------------------------------")
    return totals["acc"] / fold_count

In [27]:
# Search the distance threshold of the pattern-structure classifier with
# 3-fold cross-validation on divorce.csv and print the best accuracy together
# with the threshold that reached it.
data_name = "divorce.csv"
best_accuracy, param = cross_valid_executor_patt(data_name, blocks_num=3)
print("Accuracy = ", best_accuracy)
print("Param = ", param)

True Positive Rate: 0.958888888888889
True Negative Rate: 1.0
Negative Predictive Value: 0.9662835249042145
False Positive Rate: 0.0
False Discovery Rate: 0.0
Accuracy: 0.9176065162907268
Recall: 0.958888888888889
Precision: 1.0
F1 score: 0.979012881748444
Parametr: 0.6499999999999999
//--------------------------------------
True Positive Rate: 0.9485992402659069
True Negative Rate: 1.0
Negative Predictive Value: 0.9558892596712396
False Positive Rate: 0.0
False Discovery Rate: 0.0
Accuracy: 0.9351503759398496
Recall: 0.9485992402659069
Precision: 1.0
F1 score: 0.9735513408855087
Parametr: 0.5999999999999999
//--------------------------------------
True Positive Rate: 0.9510242843576178
True Negative Rate: 1.0
Negative Predictive Value: 0.9558892596712396
False Positive Rate: 0.0
False Discovery Rate: 0.0
Accuracy: 0.95875104427736
Recall: 0.9510242843576178
Precision: 1.0
F1 score: 0.9748290650991355
Parametr: 0.5499999999999998
//--------------------------------------
True Positive R