In [None]:
from random import seed, randrange
from csv import reader
from math import sqrt

# Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV record; each entry is the
        list of string fields produced by ``csv.reader``.

    Raises:
        OSError: If the file cannot be opened.
    """
    dataset = list()
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are rewritten by the text layer before the reader sees them.
    with open(filename, 'r', newline='') as csv_file:
        for row in reader(csv_file):
            # Blank lines are parsed as empty lists; they carry no data.
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    """Replace, in place, the string at ``column`` of every row with a float.

    Args:
        dataset: List of mutable rows whose ``column`` entry is a string.
        column: Index of the entry to convert in each row.

    Surrounding whitespace is stripped before the conversion. Nothing is
    returned; the rows themselves are modified.
    """
    for record in dataset:
        raw_text = record[column]
        record[column] = float(raw_text.strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
    """Replace, in place, the values in ``column`` with integer codes.

    Codes are assigned in order of first appearance in the dataset (the
    first distinct value gets 0, the next new value gets 1, and so on).
    This makes the mapping reproducible; iterating over a ``set`` of the
    values would yield an arbitrary order that can change between runs.

    Args:
        dataset: List of mutable rows.
        column: Index of the entry to encode in each row. The values must
            be hashable.

    Returns:
        A dict mapping each original value to the integer code it received.
    """
    lookup = dict()
    for row in dataset:
        value = row[column]
        if value not in lookup:
            # The next free code equals the number of values seen so far.
            lookup[value] = len(lookup)
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Find the min and max values for each column
def dataset_min_max(dataset):
    """Compute the minimum and maximum of every column.

    The number of columns is taken from the first row.

    Args:
        dataset: List of rows; every row must have at least as many
            entries as the first row.

    Returns:
        A list with one ``[min, max]`` pair per column, or an empty list
        when ``dataset`` has no rows.
    """
    # No rows means no columns to summarise; avoid indexing dataset[0].
    if len(dataset) == 0:
        return []
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        minmax.append([min(col_values), max(col_values)])
    return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Rescale every column of ``dataset`` in place using min-max scaling.

    Each value becomes ``(value - min) / (max - min)`` with the bounds taken
    from ``minmax``. A column whose min equals its max has no spread, so its
    values are set to 0.0 rather than dividing by zero.

    Args:
        dataset: List of mutable rows of numbers.
        minmax: One ``[min, max]`` pair per column, indexed like the rows.
    """
    for row in dataset:
        for i in range(len(row)):
            lowest = minmax[i][0]
            spread = minmax[i][1] - lowest
            # Guard the constant-column case (spread == 0).
            row[i] = (row[i] - lowest) / spread if spread else 0.0

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition rows of ``dataset`` into ``n_folds`` equal folds.

    Each fold receives ``int(len(dataset) / n_folds)`` rows, drawn without
    replacement via ``randrange``. Rows left over after that division are
    not placed in any fold. The input list itself is not modified; the
    folds reference the same row objects.

    Args:
        dataset: List of rows.
        n_folds: Number of folds to create.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # One random draw (and removal) per slot in the fold.
        folds.append(
            [remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)]
        )
    return folds

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where prediction equals truth.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed like ``actual``.

    Returns:
        A float between 0.0 and 100.0.
    """
    total = len(actual)
    hits = len([pos for pos in range(total) if actual[pos] == predicted[pos]])
    return hits / float(total) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross-validation.

    The dataset is split with ``cross_validation_split``. Each fold in turn
    is held out: the rows of all other folds form the training set, and
    copies of the held-out rows with their last entry set to ``None`` form
    the test set. The predictions are scored with ``accuracy_metric``.

    Args:
        dataset: List of rows whose last entry is the class label.
        algorithm: Callable invoked as ``algorithm(train, test, *args)``
            that returns one prediction per test row.
        n_folds: Number of folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        # Training rows: everything outside the held-out fold.
        train_set = []
        for other in folds:
            if other is held_out:
                continue
            train_set.extend(other)
        # Test rows are copies so the originals keep their labels.
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores

# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows.

    The last entry of ``row1`` is skipped: only indices
    ``0 .. len(row1) - 2`` take part in the computation.

    Args:
        row1: First row; its length determines how many entries are used.
        row2: Second row, indexed in parallel with ``row1``.

    Returns:
        The distance as a float.
    """
    total = 0
    for idx in range(len(row1) - 1):
        gap = row1[idx] - row2[idx]
        total += gap ** 2
    return sqrt(total)


In [6]:
from random import seed, randrange
from csv import reader
from math import sqrt

def load_csv(filename):
    """Load the non-empty records of a CSV file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of string fields as produced by
        ``csv.reader``. Records that parse to an empty list (blank lines)
        are omitted.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The csv documentation asks for newline='' so the reader, not the text
    # layer, interprets line endings (this matters for newlines embedded in
    # quoted fields).
    with open(filename, 'r', newline='') as csv_file:
        return [row for row in reader(csv_file) if row]

# Convert string columns to float and adjust class label position
def preprocess_dataset(dataset):
    """Convert every row to floats and rotate its first entry to the end.

    Works in place on each row. The first entry is treated as the class
    label and is moved to the last position. Calling this twice on the same
    data rotates the rows a second time.

    Args:
        dataset: List of mutable rows whose entries are accepted by
            ``float()``.
    """
    for row in dataset:
        converted = list(map(float, row))
        # Rotate: the leading value goes to the back.
        leading = converted.pop(0)
        converted.append(leading)
        row[:] = converted

# Find the min and max values for each column
def dataset_minmax(dataset):
    """Compute ``[min, max]`` for every column of ``dataset``.

    Columns are obtained by zipping the rows together, so an empty dataset
    yields an empty list.

    Args:
        dataset: List of rows of comparable values.

    Returns:
        A list with one ``[min, max]`` pair per column.
    """
    bounds = []
    for column in zip(*dataset):
        bounds.append([min(column), max(column)])
    return bounds

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Min-max scale the feature columns of ``dataset`` in place.

    Every entry except the last one in each row (the class label) becomes
    ``(value - min) / (max - min)`` with the bounds taken from ``minmax``.
    A column whose min equals its max has no spread, so its values are set
    to 0.0 rather than dividing by zero.

    Args:
        dataset: List of mutable rows; the last entry of each row is left
            untouched.
        minmax: One ``[min, max]`` pair per column, indexed like the rows.
    """
    for row in dataset:
        for i in range(len(row) - 1):  # Exclude class label
            lowest = minmax[i][0]
            spread = minmax[i][1] - lowest
            # Guard the constant-column case (spread == 0).
            row[i] = (row[i] - lowest) / spread if spread else 0.0

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly deal rows of ``dataset`` into ``n_folds`` folds.

    Each fold is filled with up to ``int(len(dataset) / n_folds)`` rows,
    drawn without replacement via ``randrange``; filling also stops if the
    pool of unassigned rows runs out. Rows left over after that division
    are not placed in any fold. The input list is not modified; the folds
    reference the same row objects.

    Args:
        dataset: List of rows.
        n_folds: Number of folds to create.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    pool = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        current = []
        folds.append(current)
        while pool and len(current) < rows_per_fold:
            picked = randrange(len(pool))
            current.append(pool.pop(picked))
    return folds

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of predictions that equal the true label.

    Labels are compared pairwise with ``zip``, so pairing stops at the
    shorter sequence; the percentage is always taken over ``len(actual)``.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels.

    Returns:
        A float between 0.0 and 100.0.
    """
    hits = 0
    for truth, guess in zip(actual, predicted):
        if truth == guess:
            hits += 1
    return hits / len(actual) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross-validation.

    The dataset is split with ``cross_validation_split``. Each fold in turn
    is held out: rows from all other folds form the training set, and
    copies of the held-out rows with their last entry replaced by ``None``
    form the test set. Predictions are scored with ``accuracy_metric``.

    Args:
        dataset: List of rows whose last entry is the class label.
        algorithm: Callable invoked as ``algorithm(train, test, *args)``
            that returns one prediction per test row.
        n_folds: Number of folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        # Gather training rows from every fold except the held-out one.
        train_set = []
        for candidate in folds:
            if candidate is not held_out:
                train_set.extend(candidate)
        # Hide the labels on copies so the fold keeps the true values.
        test_set = []
        for original in held_out:
            unlabeled = list(original)
            unlabeled[-1] = None
            test_set.append(unlabeled)
        predicted = algorithm(train_set, test_set, *args)
        actual = [original[-1] for original in held_out]
        fold_score = accuracy_metric(actual, predicted)
        scores.append(fold_score)
    return scores

# Calculate Euclidean distance
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows.

    The last entry of ``row1`` is excluded: only indices
    ``0 .. len(row1) - 2`` are compared.

    Args:
        row1: First row; its length determines how many entries are used.
        row2: Second row, indexed in parallel with ``row1``.

    Returns:
        The distance as a float.
    """
    feature_count = len(row1) - 1
    squared_gaps = [(row1[k] - row2[k]) ** 2 for k in range(feature_count)]
    return sqrt(sum(squared_gaps))

# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    """Return the training rows closest to ``test_row``.

    Distances come from ``euclidean_distance``. Rows are ordered nearest
    first, and rows at equal distance keep their order from ``train``
    because the sort is stable.

    Args:
        train: List of training rows.
        test_row: Row to find neighbours for.
        num_neighbors: Maximum number of rows to return. If ``train`` holds
            fewer rows, all of them are returned instead of raising
            ``IndexError``. A non-positive value yields an empty list.

    Returns:
        A list of at most ``num_neighbors`` rows taken from ``train``.
    """
    scored = [(train_row, euclidean_distance(test_row, train_row)) for train_row in train]
    scored.sort(key=lambda pair: pair[1])
    # Clamp at zero so a negative count cannot be read as "all but the last k".
    limit = max(num_neighbors, 0)
    # Slicing tolerates a limit larger than the number of available rows.
    return [train_row for train_row, _ in scored[:limit]]

# Make a prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
    """Predict a label for ``test_row`` by majority vote of its neighbours.

    The neighbours come from ``get_neighbors``; each one votes with its last
    entry. The label with the highest vote count is returned. On a tie, the
    winner is whichever tied label comes first when iterating the set of
    distinct labels.

    Args:
        train: List of training rows whose last entry is the class label.
        test_row: Row to classify.
        num_neighbors: Number of neighbours to consult.

    Returns:
        The winning class label.
    """
    votes = []
    for neighbor in get_neighbors(train, test_row, num_neighbors):
        votes.append(neighbor[-1])
    candidates = set(votes)
    return max(candidates, key=votes.count)

# kNN Algorithm
def k_nearest_neighbors(train, test, num_neighbors):
    """Classify every row of ``test`` with k-nearest-neighbours.

    Args:
        train: List of labelled training rows.
        test: List of rows to classify.
        num_neighbors: Number of neighbours consulted per prediction.

    Returns:
        A list with one predicted label per row of ``test``, in order.
    """
    predictions = []
    for test_row in test:
        label = predict_classification(train, test_row, num_neighbors)
        predictions.append(label)
    return predictions

# Load and prepare data
# Seed the random module first so the randrange-based fold assignment in
# cross_validation_split is repeatable from run to run.
seed(1)
filename = './hayuci13a/yXT_wine.csv'  # Corrected dataset name
dataset = load_csv(filename)
preprocess_dataset(dataset)  # Convert the data: all values to float, first column moved to the end
# Bounds are computed for every column, but normalize_dataset only rescales
# the columns before the last one (the class label stays as is).
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

# Evaluate algorithm
n_folds = 5  # Number of cross-validation folds
num_neighbors = 5  # Forwarded by evaluate_algorithm to k_nearest_neighbors
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
# One accuracy percentage per fold, followed by their arithmetic mean.
print(f'Scores: {scores}')
print(f'Mean Accuracy: {sum(scores)/len(scores):.3f}%')


Scores: [97.14285714285714, 91.42857142857143, 94.28571428571428, 94.28571428571428, 100.0]
Mean Accuracy: 95.429%
