In [32]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split

n_features = 5
n_samples = 168
n_classes = 2
# Class encoding: label 'A' -> 0, any other label (expected: 'B') -> 1.
# X holds the raw feature values, Y the encoded class labels.
X = np.empty((n_samples, n_features))
Y = np.empty(n_samples)

with open("ex2.data", newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # csv.reader yields an empty list for a blank line; drop those so they
    # neither break the row assignment nor shift the record index.
    records = [row for row in reader if row]

# X and Y come from np.empty, so any row that is not overwritten would keep
# uninitialised memory and silently end up in the training data.
if len(records) != n_samples:
    raise ValueError(
        "ex2.data: expected {} records, found {}".format(n_samples, len(records)))

for index, row in enumerate(records):
    # Every column but the last is a numeric feature; the last is the label.
    X[index] = [float(cell) for cell in row[:-1]]
    Y[index] = 0 if row[-1] == 'A' else 1


# Mean of every feature (column-wise).
attribute_means = X.mean(axis=0)
# Sanity check: one mean per declared feature.
assert attribute_means.shape == (n_features,)
# Binarise the features: 1 where the value is at or above the feature mean, else 0.
X_d = np.array(X >= attribute_means, dtype='int')
random_state = 14  # fixed seed so that the split is reproducible
# Split into training and testing subsets.
X_train, X_test, y_train, y_test = train_test_split(X_d, Y, random_state=random_state)
print("There are {} training samples".format(y_train.shape))
print("There are {} testing samples".format(y_test.shape))


from collections import defaultdict
from operator import itemgetter

def train(X, y_true, feature):
    """Build the one-feature rule for column ``feature``.

    For every distinct value found in ``X[:, feature]`` the majority class
    among the samples carrying that value is looked up via
    ``train_feature_value``.

    Returns a pair ``(predictors, total_error)``: ``predictors`` maps each
    observed feature value to its majority class, and ``total_error`` is the
    sum of the per-value error counts reported by ``train_feature_value``.
    """
    # Guard against an out-of-range feature index.
    _, feature_count = X.shape
    assert 0 <= feature < feature_count

    predictors = {}
    total_error = 0
    # Distinct values of the chosen feature (0 and 1 once the data is binarised).
    for observed in set(X[:, feature]):
        winner, misses = train_feature_value(X, y_true, feature, observed)
        predictors[observed] = winner
        total_error += misses
    return predictors, total_error

# Compute what our predictors say each sample is based on its value
#y_predicted = np.array([predictors[sample[feature]] for sample in X])
    

def train_feature_value(X, y_true, feature, value):
    # Create a simple dictionary to count how frequency they give certain predictions
    class_counts = defaultdict(int)
    # Iterate through each sample and count the frequency of each class/value pair
    # для каждого значения признака считаются появляющиеся классы
    for sample, y in zip(X, y_true):
        if sample[feature] == value:
            class_counts[y] += 1       
    # Now get the best one by sorting (highest first) and choosing the first item
    # выбор наиболее частовстречающегося класса - считаем, что данное значение данного признака 
    # соотеветствует этому классу
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]
    # The error is the number of samples that do not classify as the most frequent class
    # *and* have the feature value.
    n_samples = X.shape[1] #выгляди как ненужная строка
    # ошибка - количесвто примеров, где класс другой
    error = sum([class_count for class_value, class_count in class_counts.items()
                 if class_value != most_frequent_class])
    return most_frequent_class, error

# Learn one rule per feature: feature index -> (value-to-class mapping, total error).
all_predictors = {
    index: train(X_train, y_train, index)
    for index in range(X_train.shape[1])
}
# Keep only the error of each rule so the rules can be ranked.
errors = {index: rule[1] for index, rule in all_predictors.items()}
# The rule with the smallest error becomes the model.
best_variable, best_error = sorted(errors.items(), key=lambda pair: pair[1])[0]
print("The best model is based on variable {0} and has error {1:.2f}".format(best_variable, best_error))

# The model is the chosen feature index together with its value-to-class mapping.
model = dict(variable=best_variable,
             predictor=all_predictors[best_variable][0])
print(model)


There are (126,) training samples
There are (42,) testing samples
The best model is based on variable 0 and has error 20.00
{'variable': 0, 'predictor': {0: 1.0, 1: 0.0}}


In [15]:
def predict(X_test, model):
    """Classify every sample of ``X_test`` with a one-feature rule.

    ``model['variable']`` is the index of the feature the rule looks at and
    ``model['predictor']`` maps that feature's (integer) value to a class.
    Returns a numpy array with one predicted class per sample.
    """
    column = model['variable']
    lookup = model['predictor']
    labels = []
    for row in X_test:
        # The feature value is cast to int before it is used as the lookup key.
        labels.append(lookup[int(row[column])])
    return np.array(labels)
# Apply the selected rule to the held-out samples.
y_predicted = predict(X_test, model)

# Accuracy in percent: the share of test samples whose prediction equals the label.
accuracy = 100 * np.mean(y_test == y_predicted)
print("The test accuracy is {:.1f}%".format(accuracy))

The test accuracy is 95.2%


In [16]:
from sklearn.metrics import classification_report
# Print scikit-learn's classification report comparing the test labels with the predictions.
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95        21
         1.0       1.00      0.90      0.95        21

   micro avg       0.95      0.95      0.95        42
   macro avg       0.96      0.95      0.95        42
weighted avg       0.96      0.95      0.95        42

