In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the wine dataset from its comma-separated file and dump the frame
# for a first look at the columns and value ranges.
data = pd.read_csv('/content/WineDataset.csv', sep=',')
print(data)

     Alcohol  Malic Acid   Ash  Alcalinity of ash  Magnesium  Total phenols  \
0      14.23        1.71  2.43               15.6      127.0           2.80   
1      13.20        1.78  2.14               11.2      100.0           2.65   
2      13.16        2.36  2.67               18.6      101.0           2.80   
3      14.37        1.95  2.50               16.8      113.0           3.85   
4      13.24        2.59  2.87               21.0      118.0           2.80   
..       ...         ...   ...                ...        ...            ...   
173    13.71        5.65  2.45               20.5       95.0           1.68   
174    13.40        3.91  2.48               23.0      102.0           1.80   
175    13.27        4.28  2.26               20.0      120.0           1.59   
176    13.17        2.59  2.37               20.0      120.0           1.65   
177    14.13        4.10  2.74               24.5       96.0           2.05   

     Flavanoids  Nonflavanoid phenols  Proanthocyan

In [None]:
# Count the missing entries column by column (axis=0) and print the totals.
missing_values = data.isnull().sum(axis=0)
print(missing_values)



1.   **Отсутствующих значений нет :)**
2. ***Категориальных признаков нет :)***






In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the features from the target: "Wine" is the last column of `data`.
X = data.iloc[:, :-1]
y = data['Wine']

# Split into training and test sets BEFORE scaling, so that the scaler never
# sees the test rows (fitting it on the full dataset leaks test statistics
# into the training features).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: learn mean/std on the training split only, then apply the
# same transformation to both splits. The results are wrapped back into
# DataFrames so that later cells can still select features by column name.
# `data` itself is left unmodified, which keeps this cell safe to re-run.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)
print(X_train)

      Alcohol  Malic Acid       Ash  Alcalinity of ash  Magnesium  \
0    1.518613   -0.562250  0.232053          -1.169593   1.913905   
1    0.246290   -0.499413 -0.827996          -2.490847   0.018145   
2    0.196879    0.021231  1.109334          -0.268738   0.088358   
3    1.691550   -0.346811  0.487926          -0.809251   0.930918   
4    0.295700    0.227694  1.840403           0.451946   1.281985   
..        ...         ...       ...                ...        ...   
173  0.876275    2.974543  0.305159           0.301803  -0.332922   
174  0.493343    1.412609  0.414820           1.052516   0.158572   
175  0.332758    1.744744 -0.389355           0.151661   1.422412   
176  0.209232    0.227694  0.012732           0.151661   1.422412   
177  1.395086    1.583165  1.365208           1.502943  -0.262708   

     Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0         0.808997    1.034819             -0.659563         1.224884   
1         0.568648    0.7

In [None]:


def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The element-wise difference is squared, all squares are summed, and the
    square root of that sum is returned.
    """
    delta = x1 - x2
    squared_sum = np.sum(np.square(delta))
    return np.sqrt(squared_sum)

# KNN class implementing the k-nearest-neighbours method.
class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on the predicted label.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``.

        Both inputs are converted to NumPy arrays so that rows are always
        addressed by position. This makes pandas DataFrames/Series (whose
        index may be shuffled after a train/test split) behave the same as
        plain arrays.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, x):
        """Return the most common label among the ``k`` samples nearest to ``x``.

        Ties between labels are resolved in favour of the smallest label,
        because ``np.unique`` returns labels sorted and ``argmax`` picks the
        first maximum.
        """
        diffs = self.X_train - np.asarray(x)
        # Flatten every sample to one row so the sum runs over all of its
        # components, whatever the per-sample shape is.
        diffs = diffs.reshape(len(diffs), -1)
        distances = np.sqrt(np.sum(diffs ** 2, axis=1))
        k_indices = np.argsort(distances)[:self.k]
        # np.unique (unlike np.bincount) also handles string, negative and
        # other non-integer labels.
        labels, counts = np.unique(self.y_train[k_indices], return_counts=True)
        return labels[np.argmax(counts)]

    def predict_batch(self, X):
        """Predict a label for every sample (row) of ``X``; returns a NumPy array."""
        predictions = [self.predict(x) for x in np.asarray(X)]
        return np.array(predictions)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score
from prettytable import PrettyTable
import random

def _score_predictions(y_true, y_pred):
    """Return ``(accuracy, weighted F1 score, confusion matrix)`` for the predictions."""
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='weighted'),
        confusion_matrix(y_true, y_pred),
    )


result = PrettyTable()
result.field_names = ["Model","K", "Accuracy", "F1 Score", "Confusion Matrix"]

# Model 1: a random set of 5 features.
# A dedicated, seeded generator keeps the selection reproducible across
# kernel restarts without touching the global `random` state.
random_features = random.Random(42).sample(X_train.columns.tolist(), 5)
print(f"Random Features: {random_features}")

# Model 2: the first 5 features.
fixed_features = X_train.columns.tolist()[:5]
print(f"Fixid Features: {fixed_features}")

# Values of k to evaluate.
k_values = [2, 3, 5, 10, 12, 15, 20,  25]

# The feature subsets do not depend on k, so slice them once outside the loop.
X_train_random = X_train[random_features]
X_test_random = X_test[random_features]
X_train_fixed = X_train[fixed_features]
X_test_fixed = X_test[fixed_features]

for k in k_values:
    # Each model gets its own KNN instance, so fitting one can never
    # overwrite the training data of the other.
    knn_random = KNN(k=k)
    knn_random.fit(X_train_random.to_numpy(), y_train.to_numpy())
    predictions_random = knn_random.predict_batch(X_test_random.to_numpy())
    # Evaluate model performance and build the confusion matrix.
    accuracy, f1, cm = _score_predictions(y_test, predictions_random)
    result.add_row(["Random Features",k, round(accuracy,4), round(f1,4), cm])

    knn_fixed = KNN(k=k)
    knn_fixed.fit(X_train_fixed.to_numpy(), y_train.to_numpy())
    predictions_fixed = knn_fixed.predict_batch(X_test_fixed.to_numpy())
    # Evaluate model performance and build the confusion matrix.
    accuracy, f1, cm = _score_predictions(y_test, predictions_fixed)
    # divider=True draws a separator line after each pair of rows for one k.
    result.add_row(["Fixed Features",k, round(accuracy,4), round(f1,4), cm],divider=True)


print(result)




Random Features: ['Total phenols', 'OD280/OD315 of diluted wines', 'Proline', 'Magnesium', 'Alcalinity of ash']
Fixid Features: ['Alcohol', 'Malic Acid', 'Ash', 'Alcalinity of ash', 'Magnesium']
+-----------------+----+----------+----------+------------------+
|      Model      | K  | Accuracy | F1 Score | Confusion Matrix |
+-----------------+----+----------+----------+------------------+
| Random Features | 2  |  0.9444  |  0.9423  |   [[14  0  0]    |
|                 |    |          |          |    [ 0 14  0]    |
|                 |    |          |          |    [ 0  2  6]]   |
|  Fixed Features | 2  |  0.8056  |  0.7974  |   [[12  0  2]    |
|                 |    |          |          |    [ 1 13  0]    |
|                 |    |          |          |    [ 1  3  4]]   |
+-----------------+----+----------+----------+------------------+
| Random Features | 3  |  0.9444  |  0.9423  |   [[14  0  0]    |
|                 |    |          |          |    [ 0 14  0]    |
|            