In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [72]:
# загружаем множество объектов
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

# делим множество на train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

# покажем train
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
69,5.6,2.5,3.9,1.1
144,6.7,3.3,5.7,2.5
22,4.6,3.6,1.0,0.2
149,5.9,3.0,5.1,1.8
61,5.9,3.0,4.2,1.5


In [23]:
# простой классификатор
class SimpleClassifier:
    def __init__(self, X_train, y_train, metric_func):
        self.X_train = X_train
        self.y_train = y_train
        self.metric_func = metric_func
    
    def predict(self, sample):
        idx = self.X_train.apply(lambda row: self.metric_func(row, sample), axis=1).to_frame().sort_values(0).head(1).index[0]
        return self.y_train[idx]
    
    def predict_test_set(self, X_test):
        return X_test.apply(lambda row: self.predict(row), axis=1)

In [66]:
# средние служат для преобразования float чисел в 0/1 (нужно для метрик hamming, jacquard)
X_train_mean = X_train.mean()
y_train_mean = y_train.mean()

# метрики
metrics_list = [
    ('euclid', lambda x, y: ((x - y) ** 2).sum()),
    ('hamming', lambda x, y: ((x > X_train_mean) != (y > y_train_mean)).sum()),
    ('manhattan', lambda x, y: ((x - y).abs()).sum()),
    ('jacquard', lambda x, y: 1.0 - float(((x > X_train_mean) & (y > y_train_mean)).sum()) / (((x > X_train_mean) | (y > y_train_mean)).sum())),
    ('cosine', lambda x, y: 1.0 - x.dot(y) / np.sqrt((x ** 2).sum() * (y ** 2).sum()))
]

In [73]:
classifier = SimpleClassifier(X_train, y_train, None)

metrics_predicted = {}
for metric in metrics_list:
    classifier.metric_func = metric[1]
    metrics_predicted[metric[0]] = classifier.predict_test_set(X_test)

for name, predicted in metrics_predicted.items():
    X_test[f'metric_{name}'] = predicted
X_test['y'] = y_test
X_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[f'metric_{name}'] = predicted
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['y'] = y_test


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),metric_euclid,metric_hamming,metric_manhattan,metric_jacquard,metric_cosine,y
70,5.9,3.2,4.8,1.8,2,2,2,2,1,1
138,6.0,3.0,4.8,1.8,2,2,2,2,2,2
109,7.2,3.6,6.1,2.5,2,2,2,2,2,2
99,5.7,2.8,4.1,1.3,1,2,1,2,1,1
79,5.7,2.6,3.5,1.0,1,2,1,2,1,1
34,4.9,3.1,1.5,0.2,0,2,0,2,0,0
112,6.8,3.0,5.5,2.1,2,2,2,2,2,2
41,4.5,2.3,1.3,0.3,0,2,0,2,0,0
104,6.5,3.0,5.8,2.2,2,2,2,2,2,2
126,6.2,2.8,4.8,1.8,2,2,2,2,2,2


In [74]:
print('Точность разных метрик на данном множестве:\n')
for name, predicted in metrics_predicted.items():
    accuracy = (y_test == predicted).mean()
    print(f'metric_{name} accuracy = {accuracy}')
    # conf_matrix = sklearn.metrics.confusion_matrix(y_test, predicted)
    # sns.heatmap(conf_matrix, annot=True,annot_kws={"size": 8})

Точность разных метрик на данном множестве:

metric_euclid accuracy = 0.8666666666666667
metric_hamming accuracy = 0.5333333333333333
metric_manhattan accuracy = 0.8666666666666667
metric_jacquard accuracy = 0.5333333333333333
metric_cosine accuracy = 0.9333333333333333
