In [57]:
%matplotlib inline

import pandas as pd
pd.set_option("display.max_rows", 100, "display.max_columns", 100) 
import numpy as np
import plotly as pt
import seaborn as sns
import matplotlib.pyplot as plt


train = pd.read_csv("./train.csv")
test =  pd.read_csv("./test.csv")

Сначала приведем датасет в порядок. Отбросим ненужные колонки и избавимся от пропусков.

In [58]:
train = train.drop(columns=['Cabin','Ticket', 'Name', 'PassengerId'])
test = test.drop(columns=['Cabin','Ticket', 'PassengerId'])
train = train.dropna()
test = test.dropna()

In [59]:
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
889,1,1,male,26.0,0,0,30.0000,C


In [60]:
test

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0000,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...,...
409,3,"Peacock, Miss. Treasteall",female,3.0,1,1,13.7750,S
411,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,90.0000,Q
412,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,7.7750,S
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,108.9000,C


Теперь займемся бинаризацией данных. Для этого разберемся, какие значения присетствуют в датасете.

In [61]:
train.describe()

# Survival - Survival (0 = No; 1 = Yes). Not included in test.csv file.
# Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
# Name - Name
# Sex - Sex
# Age - Age
# Sibsp - Number of Siblings/Spouses Aboard
# Parch - Number of Parents/Children Aboard
# Ticket - Ticket Number
# Fare - Passenger Fare
# Cabin - Cabin
# Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,712.0,712.0,712.0,712.0,712.0,712.0
mean,0.404494,2.240169,29.642093,0.514045,0.432584,34.567251
std,0.491139,0.836854,14.492933,0.930692,0.854181,52.938648
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,20.0,0.0,0.0,8.05
50%,0.0,2.0,28.0,0.0,0.0,15.64585
75%,1.0,3.0,38.0,1.0,1.0,33.0
max,1.0,3.0,80.0,5.0,6.0,512.3292


In [62]:
import plotly.express as px

fares = sorted(list(train['Fare']))
fig = px.bar(fares)
fig.show()

In [63]:
ages = sorted(list(train['Age']))
fig2 = px.bar(ages)
fig2.show()

In [64]:
sibSp = sorted(list(train['SibSp']))
fig3 = px.bar(sibSp)
fig3.show()

In [65]:
parch = sorted(list(train['Parch']))
fig4 = px.bar(parch)
fig4.show()

In [66]:
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
889,1,1,male,26.0,0,0,30.0000,C


In [67]:
train['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

Визуализовав данные, стало примерно понятно, как их кодировать. Займемся этим.

In [68]:
bin_train = train.copy()

bin_train['Pclass_1'] = np.where(bin_train['Pclass'] == 1, 1, 0)
bin_train['Pclass_2'] = np.where(bin_train['Pclass'] == 2, 1, 0)
bin_train['Pclass_3'] = np.where(bin_train['Pclass'] == 3, 1, 0)
bin_train = bin_train.drop(columns=['Pclass'])

bin_train['Sex_M'] = np.where(bin_train['Sex'] == 'male', 1, 0)
bin_train['Sex_F'] = np.where(bin_train['Sex'] == 'female', 1, 0)
bin_train = bin_train.drop(columns=['Sex'])

bin_train['Age_Baby'] = np.where(bin_train['Age'] < 10., 1, 0)
bin_train['Age_Young'] = np.where((bin_train['Age'] >= 10.) & (bin_train['Age'] < 18.), 1, 0)
bin_train['Age_Young_Adult'] = np.where((bin_train['Age'] >= 18.) & (bin_train['Age'] < 27.), 1, 0)
bin_train['Age_Adult'] = np.where((bin_train['Age'] > 27.) & (bin_train['Age'] < 40.), 1, 0)
bin_train['Age_Elder_Adult'] = np.where((bin_train['Age'] >= 40.) & (bin_train['Age'] < 50.), 1, 0)
bin_train['Age_Old'] = np.where(bin_train['Age'] >=50., 1, 0)
bin_train = bin_train.drop(columns=['Age'])

bin_train['SibSp_1'] = np.where(bin_train['SibSp'] == 1, 1, 0)
bin_train['SibSp_2'] = np.where(bin_train['SibSp'] == 2, 1, 0)
bin_train['SibSp_>2'] = np.where(bin_train['SibSp'] > 2, 1, 0)
bin_train = bin_train.drop(columns=['SibSp'])

bin_train['Parch_1'] = np.where(bin_train['Parch'] == 1, 1, 0)
bin_train['Parch_2'] = np.where(bin_train['Parch'] == 2, 1, 0)
bin_train['Parch_>2'] = np.where(bin_train['Parch'] > 2, 1, 0)
bin_train = bin_train.drop(columns=['Parch'])

bin_train['Fare_super_small'] = np.where(bin_train['Fare'] < 11., 1, 0)
bin_train['Fare_small'] = np.where((bin_train['Fare'] >= 11.) & (bin_train['Fare'] < 18.), 1, 0)
bin_train['Fare_medium'] = np.where((bin_train['Fare'] >= 18.) & (bin_train['Fare'] < 34.), 1, 0)
bin_train['Fare_large'] = np.where((bin_train['Fare'] >= 34.) & (bin_train['Fare'] < 95.), 1, 0)
bin_train['Fare_ultra_large'] = np.where(bin_train['Fare'] >=95., 1, 0)
bin_train = bin_train.drop(columns=['Fare'])

bin_train['Embarked_S'] = np.where(bin_train['Embarked'] == 'S', 1, 0)
bin_train['Embarked_C'] = np.where(bin_train['Embarked'] == 'C', 1, 0)
bin_train['Embarked_Q'] = np.where(bin_train['Embarked'] == 'Q', 1, 0)
bin_train = bin_train.drop(columns=['Embarked'])

bin_train

Unnamed: 0,Survived,Pclass_1,Pclass_2,Pclass_3,Sex_M,Sex_F,Age_Baby,Age_Young,Age_Young_Adult,Age_Adult,Age_Elder_Adult,Age_Old,SibSp_1,SibSp_2,SibSp_>2,Parch_1,Parch_2,Parch_>2,Fare_super_small,Fare_small,Fare_medium,Fare_large,Fare_ultra_large,Embarked_S,Embarked_C,Embarked_Q
0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0
1,1,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
2,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
3,1,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
4,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1
886,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
887,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
889,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [69]:
bin_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Survived          712 non-null    int64
 1   Pclass_1          712 non-null    int32
 2   Pclass_2          712 non-null    int32
 3   Pclass_3          712 non-null    int32
 4   Sex_M             712 non-null    int32
 5   Sex_F             712 non-null    int32
 6   Age_Baby          712 non-null    int32
 7   Age_Young         712 non-null    int32
 8   Age_Young_Adult   712 non-null    int32
 9   Age_Adult         712 non-null    int32
 10  Age_Elder_Adult   712 non-null    int32
 11  Age_Old           712 non-null    int32
 12  SibSp_1           712 non-null    int32
 13  SibSp_2           712 non-null    int32
 14  SibSp_>2          712 non-null    int32
 15  Parch_1           712 non-null    int32
 16  Parch_2           712 non-null    int32
 17  Parch_>2          712 non-null    i

In [70]:
bin_test = test.copy()

bin_test['Pclass_1'] = np.where(bin_test['Pclass'] == 1, 1, 0)
bin_test['Pclass_2'] = np.where(bin_test['Pclass'] == 2, 1, 0)
bin_test['Pclass_3'] = np.where(bin_test['Pclass'] == 3, 1, 0)
bin_test = bin_test.drop(columns=['Pclass'])

bin_test['Sex_M'] = np.where(bin_test['Sex'] == 'male', 1, 0)
bin_test['Sex_F'] = np.where(bin_test['Sex'] == 'female', 1, 0)
bin_test = bin_test.drop(columns=['Sex'])

bin_test['Age_Baby'] = np.where(bin_test['Age'] < 10., 1, 0)
bin_test['Age_Young'] = np.where((bin_test['Age'] >= 10.) & (bin_test['Age'] < 18.), 1, 0)
bin_test['Age_Young_Adult'] = np.where((bin_test['Age'] >= 18.) & (bin_test['Age'] < 27.), 1, 0)
bin_test['Age_Adult'] = np.where((bin_test['Age'] > 27.) & (bin_test['Age'] < 40.), 1, 0)
bin_test['Age_Elder_Adult'] = np.where((bin_test['Age'] >= 40.) & (bin_test['Age'] < 50.), 1, 0)
bin_test['Age_Old'] = np.where(bin_test['Age'] >=50., 1, 0)
bin_test = bin_test.drop(columns=['Age'])

bin_test['SibSp_1'] = np.where(bin_test['SibSp'] == 1, 1, 0)
bin_test['SibSp_2'] = np.where(bin_test['SibSp'] == 2, 1, 0)
bin_test['SibSp_>2'] = np.where(bin_test['SibSp'] > 2, 1, 0)
bin_test = bin_test.drop(columns=['SibSp'])

bin_test['Parch_1'] = np.where(bin_test['Parch'] == 1, 1, 0)
bin_test['Parch_2'] = np.where(bin_test['Parch'] == 2, 1, 0)
bin_test['Parch_>2'] = np.where(bin_test['Parch'] > 2, 1, 0)
bin_test = bin_test.drop(columns=['Parch'])

bin_test['Fare_super_small'] = np.where(bin_test['Fare'] < 11., 1, 0)
bin_test['Fare_small'] = np.where((bin_test['Fare'] >= 11.) & (bin_test['Fare'] < 18.), 1, 0)
bin_test['Fare_medium'] = np.where((bin_test['Fare'] >= 18.) & (bin_test['Fare'] < 34.), 1, 0)
bin_test['Fare_large'] = np.where((bin_test['Fare'] >= 34.) & (bin_test['Fare'] < 95.), 1, 0)
bin_test['Fare_ultra_large'] = np.where(bin_test['Fare'] >=95., 1, 0)
bin_test = bin_test.drop(columns=['Fare'])

bin_test['Embarked_S'] = np.where(bin_test['Embarked'] == 'S', 1, 0)
bin_test['Embarked_C'] = np.where(bin_test['Embarked'] == 'C', 1, 0)
bin_test['Embarked_Q'] = np.where(bin_test['Embarked'] == 'Q', 1, 0)
bin_test = bin_test.drop(columns=['Embarked'])

bin_test

Unnamed: 0,Name,Pclass_1,Pclass_2,Pclass_3,Sex_M,Sex_F,Age_Baby,Age_Young,Age_Young_Adult,Age_Adult,Age_Elder_Adult,Age_Old,SibSp_1,SibSp_2,SibSp_>2,Parch_1,Parch_2,Parch_>2,Fare_super_small,Fare_small,Fare_medium,Fare_large,Fare_ultra_large,Embarked_S,Embarked_C,Embarked_Q
0,"Kelly, Mr. James",0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,"Wilkes, Mrs. James (Ellen Needs)",0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0
2,"Myles, Mr. Thomas Francis",0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,"Wirz, Mr. Albert",0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
4,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409,"Peacock, Miss. Treasteall",0,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0
411,"Minahan, Mrs. William Edward (Lillian E Thorpe)",1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
412,"Henriksson, Miss. Jenny Lovisa",0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
414,"Oliva y Ocana, Dona. Fermina",1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0


Разделим на + и - контексты и закончим бинаризацию, преобразовав dataframe в строки

In [71]:
pos_context = bin_train[bin_train['Survived'] == 1]
neg_context = bin_train[bin_train['Survived'] == 0]
pos_context = pos_context.drop(columns=['Survived'])
neg_context = neg_context.drop(columns=['Survived'])

print(len(pos_context), len(neg_context))

288 424


In [72]:
cols_with_no_bin_code = pos_context.columns
for context in [pos_context, neg_context]:
    for col in context.columns:
        context[col] = context[col].astype(str)
    context['bin_code'] = ''
    
for col in cols_with_no_bin_code:
    pos_context['bin_code'] += pos_context[col] 
    
for col in cols_with_no_bin_code:
    neg_context['bin_code'] += neg_context[col]

pos_context = pos_context['bin_code']
neg_context = neg_context['bin_code']

In [73]:
pos_context, neg_context

(1      1000100010010000000010010
 2      0010100100000000010000100
 3      1000100010010000000010100
 8      0010100000000001001000100
 9      0100101000010000000100010
                  ...            
 875    0010101000000000010000010
 879    1000100000100010000010010
 880    0100100100000010000100100
 887    1000100100000000000100100
 889    1001000100000000000100010
 Name: bin_code, Length: 288, dtype: object,
 0      0011000100010000010000100
 4      0011000010000000010000100
 6      1001000000100000000010100
 7      0011010000000110000100100
 12     0011000100000000010000100
                  ...            
 883    0101000010000000010000100
 884    0011000100000000010000100
 885    0010100010000000100100001
 886    0101000000000000001000100
 890    0011000010000000010000001
 Name: bin_code, Length: 424, dtype: object)

In [74]:
train_pos = pos_context[:60]
train_neg = neg_context[:60]

Бинаризацию мы закончили, пришло время реализовать сами алгоритмы

In [75]:
def binarize_input(Name):
    person_df = bin_test[bin_test['Name'] == Name]
    context = person_df.copy()
    
    context = context.drop(columns='Name')
    
    cols_with_no_bin_code = context.columns
    for col in context.columns:
        context[col] = context[col].astype(str)
    context['bin_code'] = ''
    for col in cols_with_no_bin_code:
        context['bin_code'] += context[col]
        
    return context['bin_code'].values[0]

In [76]:
binarize_input('Jones, Mr. Charles Cresson'), len(binarize_input('Jones, Mr. Charles Cresson'))

('1001000001000000000100100', 25)

In [77]:
def algo1(Name = '', bin_inpt = '0'):
    if Name != '':
        inpt = binarize_input(Name)
    else:
        inpt = bin_inpt
    Splus = 0
    Sminus = 0
    
    for context in pos_context:
        a = "{0:b}".format(int(inpt, base=2)&int(context, base=2)).count('1') * 1.
        Splus += a  / inpt.count('1')
    
    for context in neg_context:
        a = "{0:b}".format(int(inpt, base=2)&int(context, base=2)).count('1')* 1.
        Sminus += a / inpt.count('1')
        
#     print(Splus, Sminus)
    return Splus > Sminus

Протестируем alg1

In [78]:
res_for_pos= []
res_for_neg= []

train_pos = pos_context[:100]
train_neg = neg_context[:100]

for i in train_pos:
    res_for_pos.append(algo1(bin_inpt = i))

for i in train_neg:
    res_for_neg.append(algo1(bin_inpt = i))
results = {
        "tp": 0,
        "tn": 0,
        "fp": 0,
        "fn": 0
}
results['tp'] = res_for_pos.count(True)
results['tn'] = res_for_neg.count(False)
results['fp'] = res_for_neg.count(True)
results['fn'] = res_for_pos.count(False)
results

{'tp': 27, 'tn': 99, 'fp': 1, 'fn': 73}

In [79]:
def dz(a, b):
    if b == 0:
        return 0
    else:
        return a / b
    
def metrics(results):
    metr = {}
    metr['Accuracy'] = dz(results["tp"] + results["tn"],sum(results.values()))*100
    metr['Precision'] = dz(results["tp"],results["tp"] + results["fp"])*100
    metr['Recall'] = dz(results["tp"],results["tp"] + results["tn"])*100
    metr['TNR'] = dz(results["tn"],results["fp"]+ results["tn"])*100
    metr['NPV'] = dz(results["tn"],results["tn"] + results["fn"])*100
    metr['FPR'] = dz(results["fp"],results["tn"] + results["fp"])*100
    metr['FDR'] = dz(results["fp"],results["tp"] + results["fp"])*100
    return metr
metrics(results)

{'Accuracy': 63.0,
 'Precision': 96.42857142857143,
 'Recall': 21.428571428571427,
 'TNR': 99.0,
 'NPV': 57.55813953488372,
 'FPR': 1.0,
 'FDR': 3.571428571428571}

Применим метод скользящего окна

In [85]:
train_pos = pos_context[:50]
train_neg = neg_context[:50]

for i in train_pos:
    res_for_pos.append(algo1(bin_inpt = i))

for i in train_neg:
    res_for_neg.append(algo1(bin_inpt = i))
results = {
        "tp": 0,
        "tn": 0,
        "fp": 0,
        "fn": 0
}
results['tp'] = res_for_pos.count(True)
results['tn'] = res_for_neg.count(False)
results['fp'] = res_for_neg.count(True)
results['fn'] = res_for_pos.count(False)
metrics(results)

{'Accuracy': 60.0,
 'Precision': 97.61904761904762,
 'Recall': 17.083333333333332,
 'TNR': 99.5,
 'NPV': 55.58659217877096,
 'FPR': 0.5,
 'FDR': 2.380952380952381}

In [86]:
train_pos = pos_context[50:100]
train_neg = neg_context[50:100]

for i in train_pos:
    res_for_pos.append(algo1(bin_inpt = i))

for i in train_neg:
    res_for_neg.append(algo1(bin_inpt = i))
results = {
        "tp": 0,
        "tn": 0,
        "fp": 0,
        "fn": 0
}
results['tp'] = res_for_pos.count(True)
results['tn'] = res_for_neg.count(False)
results['fp'] = res_for_neg.count(True)
results['fn'] = res_for_pos.count(False)
metrics(results)

{'Accuracy': 61.8,
 'Precision': 96.82539682539682,
 'Recall': 19.741100323624593,
 'TNR': 99.2,
 'NPV': 56.75057208237986,
 'FPR': 0.8,
 'FDR': 3.1746031746031744}

In [103]:
# Игтого, методом скользящего окна:
r1 = {'Accuracy': 61.8,
 'Precision': 96.82539682539682,
 'Recall': 19.741100323624593,
 'TNR': 99.2,
 'NPV': 56.75057208237986,
 'FPR': 0.8,
 'FDR': 3.1746031746031744}
r2 = {'Accuracy': 60.0,
 'Precision': 97.61904761904762,
 'Recall': 17.083333333333332,
 'TNR': 99.5,
 'NPV': 55.58659217877096,
 'FPR': 0.5,
 'FDR': 2.380952380952381}
r = {}
r['Accuracy'] = (r1['Accuracy'] + r2['Accuracy']) / 2
r['Precision'] = (r1['Precision'] + r2['Precision']) / 2
r['Recall'] = (r1['Recall'] + r2['Recall']) / 2
r['TNR'] = (r1['TNR'] + r2['TNR']) / 2
r['NPV'] = (r1['NPV'] + r2['NPV']) / 2
r['FPR'] = (r1['FPR'] + r2['FPR']) / 2
r['FDR'] = (r1['FDR'] + r2['FDR']) / 2
r

{'Accuracy': 60.9,
 'Precision': 97.22222222222223,
 'Recall': 18.41221682847896,
 'TNR': 99.35,
 'NPV': 56.16858213057541,
 'FPR': 0.65,
 'FDR': 2.7777777777777777}

В модифицированном алгоритме будем сравнивать по убыванию мощности пересечения негативный и позитивный контексты. На какой больше всего похож ввод - к тому и причислим его.

In [87]:
def algo2(Name = '', bin_inpt = '0'):
    if Name != '':
        inpt = binarize_input(Name)
    else:
        inpt = bin_inpt
    Splus = []
    Sminus = []
    
    for context in pos_context:
        a = "{0:b}".format(int(inpt, base=2)&int(context, base=2)).count('1') * 1.
        Splus.append(a  / inpt.count('1'))
    
    for context in neg_context:
        a = "{0:b}".format(int(inpt, base=2)&int(context, base=2)).count('1') * 1.
        Sminus.append(a  / inpt.count('1'))
    
    for i in range(len(Splus)):
        if Splus[i] > Sminus[i]:
            return True
        elif Splus[i] < Sminus[i]:
            return False

In [88]:
res_for_pos= []
res_for_neg= []

train_pos = pos_context[:100]
train_neg = neg_context[:100]

for i in train_pos:
    res_for_pos.append(algo2(bin_inpt = i))

for i in train_neg:
    res_for_neg.append(algo2(bin_inpt = i))
results = {
        "tp": 0,
        "tn": 0,
        "fp": 0,
        "fn": 0
}

results['tp'] = res_for_pos.count(True)
results['tn'] = res_for_neg.count(False)
results['fp'] = res_for_neg.count(True)
results['fn'] = res_for_pos.count(False)
results
metrics(results)

{'Accuracy': 69.0,
 'Precision': 76.38888888888889,
 'Recall': 39.85507246376812,
 'TNR': 83.0,
 'NPV': 64.84375,
 'FPR': 17.0,
 'FDR': 23.61111111111111}

Метод скользящего окна:

In [90]:
res_for_pos= []
res_for_neg= []

train_pos = pos_context[:50]
train_neg = neg_context[:50]

for i in train_pos:
    res_for_pos.append(algo2(bin_inpt = i))

for i in train_neg:
    res_for_neg.append(algo2(bin_inpt = i))
results = {
        "tp": 0,
        "tn": 0,
        "fp": 0,
        "fn": 0
}

results['tp'] = res_for_pos.count(True)
results['tn'] = res_for_neg.count(False)
results['fp'] = res_for_neg.count(True)
results['fn'] = res_for_pos.count(False)
results
metrics(results)

{'Accuracy': 66.0,
 'Precision': 75.0,
 'Recall': 36.36363636363637,
 'TNR': 84.0,
 'NPV': 61.76470588235294,
 'FPR': 16.0,
 'FDR': 25.0}

In [91]:
res_for_pos= []
res_for_neg= []

train_pos = pos_context[50:100]
train_neg = neg_context[50:100]

for i in train_pos:
    res_for_pos.append(algo2(bin_inpt = i))

for i in train_neg:
    res_for_neg.append(algo2(bin_inpt = i))
results = {
        "tp": 0,
        "tn": 0,
        "fp": 0,
        "fn": 0
}

results['tp'] = res_for_pos.count(True)
results['tn'] = res_for_neg.count(False)
results['fp'] = res_for_neg.count(True)
results['fn'] = res_for_pos.count(False)
results
metrics(results)

{'Accuracy': 72.0,
 'Precision': 77.5,
 'Recall': 43.05555555555556,
 'TNR': 82.0,
 'NPV': 68.33333333333333,
 'FPR': 18.0,
 'FDR': 22.5}

In [104]:
# Игтого, методом скользящего окна:
r1 = {'Accuracy': 66.0,
 'Precision': 75.0,
 'Recall': 36.36363636363637,
 'TNR': 84.0,
 'NPV': 61.76470588235294,
 'FPR': 16.0,
 'FDR': 25.0}
r2 = {'Accuracy': 72.0,
 'Precision': 77.5,
 'Recall': 43.05555555555556,
 'TNR': 82.0,
 'NPV': 68.33333333333333,
 'FPR': 18.0,
 'FDR': 22.5}
r = {}
r['Accuracy'] = (r1['Accuracy'] + r2['Accuracy']) / 2
r['Precision'] = (r1['Precision'] + r2['Precision']) / 2
r['Recall'] = (r1['Recall'] + r2['Recall']) / 2
r['TNR'] = (r1['TNR'] + r2['TNR']) / 2
r['NPV'] = (r1['NPV'] + r2['NPV']) / 2
r['FPR'] = (r1['FPR'] + r2['FPR']) / 2
r['FDR'] = (r1['FDR'] + r2['FDR']) / 2
r

{'Accuracy': 69.0,
 'Precision': 76.25,
 'Recall': 39.70959595959596,
 'TNR': 83.0,
 'NPV': 65.04901960784314,
 'FPR': 17.0,
 'FDR': 23.75}

Для сравнения применим стандартный классификатор - решающее дерево

In [56]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(bin_train.drop(columns=['Survived']).loc[100:], bin_train['Survived'].loc[100:])
pred = clf.predict(bin_train.drop(columns=['Survived']).loc[:100])

results = {
        "tp": 0,
        "tn": 0,
        "fp": 0,
        "fn": 0
}

results['tp'] = np.sum(bin_train['Survived'].loc[:100] & pred)
results['tn'] = np.sum(bin_train['Survived'].loc[:100] + pred == 0)
results['fp'] = np.sum((bin_train['Survived'].loc[:100] == 0) & (pred == 1))
results['fn'] = np.sum((bin_train['Survived'].loc[:100] == 1) & (pred == 0))
metrics(results)

{'Accuracy': 73.07692307692307,
 'Precision': 66.66666666666666,
 'Recall': 31.57894736842105,
 'TNR': 81.25,
 'NPV': 76.47058823529412,
 'FPR': 18.75,
 'FDR': 33.33333333333333}