In [None]:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr  3 20:08:17 2025

In [None]:
@author: kahra
"""
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Load the Wine dataset

In [None]:
# Location of the Wine data file. The WINE_DATA_PATH environment variable
# overrides the default, so the notebook can run on another machine without
# editing this cell; when it is unset the original path is used unchanged.
file_path = os.environ.get("WINE_DATA_PATH", "C:/Users/kahra/Desktop/wine/wine.data")

# Column names applied when the file is read (it is loaded with header=None):
# the class label first, followed by the 13 measured attributes.
columns = [
    'Class', 'Alcohol', 'Malic_acid', 'Ash', 'Alcalinity_of_ash', 'Magnesium',
    'Total_phenols', 'Flavanoids', 'Nonflavanoid_phenols', 'Proanthocyanins',
    'Color_intensity', 'Hue', 'OD280/OD315_of_diluted_wines', 'Proline'
]

In [None]:
# The file has no header row, so the column labels are supplied from `columns`.
df = pd.read_csv(file_path, names=columns, header=None)

Separate features and target labels

In [None]:
# Column 0 holds the class label; every remaining column is a feature.
y = df.iloc[:, 0].to_numpy()    # Class labels
X = df.iloc[:, 1:].to_numpy()   # Feature matrix

Standardize the features

In [None]:
# Fit the scaler on the feature matrix, then replace X with its standardized
# version (StandardScaler centers each column and scales it to unit variance).
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

Split the dataset into training and testing sets

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# X was standardized with statistics computed over *all* samples, so the
# held-out rows influenced the scaling (data leakage). Re-standardize using
# statistics from the training split only. Standardization is an affine map
# per feature, so this is mathematically the same (up to rounding) as fitting
# a scaler on the raw training rows.
# NOTE: `train_scaler` expects input already transformed by `scaler`, not raw data.
train_scaler = StandardScaler().fit(X_train)
X_train = train_scaler.transform(X_train)
X_test = train_scaler.transform(X_test)

Define Euclidean and Manhattan distance functions

In [None]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like
        Coordinates of the two points (lists, tuples or NumPy arrays of
        broadcast-compatible shape).

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Converting to float accepts plain Python sequences and prevents
    # wraparound when subtracting unsigned-integer arrays.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

In [None]:
def manhattan_distance(x1, x2):
    """Return the Manhattan (L1) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like
        Coordinates of the two points (lists, tuples or NumPy arrays of
        broadcast-compatible shape).

    Returns
    -------
    numpy.float64
        Sum of the absolute coordinate differences.
    """
    # Converting to float accepts plain Python sequences and prevents
    # wraparound when subtracting unsigned-integer arrays.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sum(np.abs(diff))

Implement k-NN classifier

In [None]:
def knn(X_train, y_train, X_test, k, distance_metric='euclidean'):
    """Classify each test point by majority vote among its k nearest training points.

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Training feature vectors.
    y_train : array-like, shape (n_train,)
        Class label of each training vector.
    X_test : array-like, shape (n_test, n_features)
        Feature vectors to classify.
    k : int
        Number of neighbors that vote; must be at least 1. A value larger
        than n_train lets every training point vote.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank the training points.

    Returns
    -------
    numpy.ndarray, shape (n_test,)
        Predicted label for each test point. When several labels share the
        top vote count, the one met first among the distance-sorted
        neighbors is returned.

    Raises
    ------
    ValueError
        If `distance_metric` is not recognized, `k` is below 1, the training
        set is empty, or `X_train` and `y_train` differ in length.
    """
    # Reject unknown metrics explicitly instead of silently using Manhattan.
    if distance_metric not in ('euclidean', 'manhattan'):
        raise ValueError(
            f"Unknown distance_metric {distance_metric!r}; "
            "expected 'euclidean' or 'manhattan'."
        )
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}.")

    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train)  # allows fancy indexing even for list input
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample.")
    if len(X_train) != len(y_train):
        raise ValueError("X_train and y_train must have the same length.")
    # One row per training sample, so distances reduce along axis 1.
    X_train = X_train.reshape(len(X_train), -1)

    predictions = []
    for test_point in np.asarray(X_test, dtype=float):
        # Distances from this test point to every training point at once.
        diff = X_train - np.ravel(test_point)
        if distance_metric == 'euclidean':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)

        # Stable sort: equidistant training points keep their original order,
        # which makes the choice of neighbors deterministic.
        nearest_indices = np.argsort(distances, kind='stable')[:k]
        nearest_labels = y_train[nearest_indices]

        # Majority vote among the selected neighbors.
        predicted_label = Counter(nearest_labels).most_common(1)[0][0]
        predictions.append(predicted_label)

    return np.array(predictions)

Evaluate the model for different k values

In [None]:
# Candidate neighbor counts: the odd numbers from 1 through 29.
k_values = list(range(1, 30, 2))
# Per-metric accuracy lists, filled by the evaluation loop in the same order as k_values.
accuracies_euclidean, accuracies_manhattan = [], []

In [None]:
# Per-metric [k, accuracy] rows, later turned into the results tables.
results_euclidean, results_manhattan = [], []

In [None]:
# Start from empty containers so that re-running this cell replaces earlier
# results instead of appending duplicates (which would make the accuracy
# lists longer than `k_values` and break the accuracy-vs-k plots).
accuracies_euclidean, accuracies_manhattan = [], []
results_euclidean, results_manhattan = [], []

for k in k_values:
    # Predict the test labels with each distance metric for this k.
    y_pred_euc = knn(X_train, y_train, X_test, k, distance_metric='euclidean')
    y_pred_man = knn(X_train, y_train, X_test, k, distance_metric='manhattan')

    # Fraction of test samples classified correctly.
    acc_euc = accuracy_score(y_test, y_pred_euc)
    acc_man = accuracy_score(y_test, y_pred_man)

    # Flat lists feed the plots; [k, accuracy] rows feed the tables.
    accuracies_euclidean.append(acc_euc)
    accuracies_manhattan.append(acc_man)

    results_euclidean.append([k, acc_euc])
    results_manhattan.append([k, acc_man])

Display results in tabular format

In [None]:
# Tabulate the [k, accuracy] rows collected for the Euclidean metric.
print("\nEuclidean Distance Results:")
euclidean_table = pd.DataFrame(data=results_euclidean, columns=["K Value", "Accuracy"])
print(euclidean_table)

In [None]:
# Tabulate the [k, accuracy] rows collected for the Manhattan metric.
print("\nManhattan Distance Results:")
manhattan_table = pd.DataFrame(data=results_manhattan, columns=["K Value", "Accuracy"])
print(manhattan_table)

Plot accuracy vs. k values

In [None]:
# Accuracy as a function of k for the Euclidean metric, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, accuracies_euclidean, marker='o', label='Euclidean')
ax.set_title("Euclidean Distance: Model Performance vs. K Value")
ax.set_xlabel("K Value")
ax.set_ylabel("Accuracy")
ax.legend()
ax.grid()
plt.show()

In [None]:
# Accuracy as a function of k for the Manhattan metric, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, accuracies_manhattan, marker='s', label='Manhattan')
ax.set_title("Manhattan Distance: Model Performance vs. K Value")
ax.set_xlabel("K Value")
ax.set_ylabel("Accuracy")
ax.legend()
ax.grid()
plt.show()

Determine the best k value

In [None]:
# Pick the k with the highest Euclidean accuracy. np.argmax returns the first
# maximum, so ties go to the smallest k (k_values is ascending).
# NOTE(review): only the Euclidean results are considered, and the accuracies
# were measured on the test split, so the reported score is optimistic.
best_k = k_values[int(np.argmax(accuracies_euclidean))]

Generate confusion matrix and classification report for the best k

In [None]:
# Re-run the classifier with the selected k and summarize its test-set performance.
final_predictions = knn(X_train, y_train, X_test, k=best_k, distance_metric='euclidean')
print(f"Best K value: {best_k}")
print(confusion_matrix(y_true=y_test, y_pred=final_predictions))
print(classification_report(y_true=y_test, y_pred=final_predictions))

In [None]:
# Labels for the data file's columns: the class label followed by the
# 13 measured attributes.
column_names = ["Class"] + [
    "Alcohol",
    "Malic_acid",
    "Ash",
    "Alcalinity_of_ash",
    "Magnesium",
    "Total_phenols",
    "Flavanoids",
    "Nonflavanoid_phenols",
    "Proanthocyanins",
    "Color_intensity",
    "Hue",
    "OD280/OD315_of_diluted_wines",
    "Proline",
]

In [None]:
# Re-read the same data file (unscaled values) for the exploratory plots;
# there is no header row, so labels come from `column_names`.
df = pd.read_csv(file_path, names=column_names, header=None)

In [None]:
# Open a wide 15x5-inch figure; the three 1x3 subplot panels that follow are
# meant to be drawn on it as pyplot's current figure.
plt.figure(figsize=(15, 5))

In [None]:
# Left panel: histogram with a KDE overlay for the Alcohol column.
ax = plt.subplot(1, 3, 1)
sns.histplot(df["Alcohol"], bins=20, kde=True, color="blue", ax=ax)
ax.set_title("Alcohol Distribution")

In [None]:
# Middle panel: histogram with a KDE overlay for the Color_intensity column.
ax = plt.subplot(1, 3, 2)
sns.histplot(df["Color_intensity"], bins=20, kde=True, color="red", ax=ax)
ax.set_title("Color Intensity Distribution")

In [None]:
# Right panel: histogram with a KDE overlay for the Proline column.
ax = plt.subplot(1, 3, 3)
sns.histplot(df["Proline"], bins=20, kde=True, color="green", ax=ax)
ax.set_title("Proline Distribution")

In [None]:
# Tighten the spacing of the current figure's panels, then render it.
plt.gcf().tight_layout()
plt.show()

Scatter plots of feature pairs, colored by class

In [None]:
# Alcohol against color intensity, one point per sample, colored by class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x="Alcohol", y="Color_intensity", hue="Class", palette="viridis", ax=ax)
ax.set_title("Alcohol and Color Intensity Relation")
plt.show()

In [None]:
# Alcohol against proline, one point per sample, colored by class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x="Alcohol", y="Proline", hue="Class", palette="coolwarm", ax=ax)
ax.set_title("Alcohol ve Proline Relation")
plt.show()

In [None]:
# Color intensity against proline, one point per sample, colored by class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x="Color_intensity", y="Proline", hue="Class", palette="coolwarm", ax=ax)
ax.set_title("Color Intensity ve Proline Relation")
plt.show()