<a href="https://colab.research.google.com/github/AbhishekKurra/Machine-Learning-Assignment-1/blob/main/Abhishek%20Kurra%20Programing%20Assignment%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy import stats
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import warnings
import requests
import os

warnings.filterwarnings('ignore')

class KNN:
    def __init__(self, k=3, distance_metric='euclidean', weights='uniform', kernel=None):
        self.k = k
        self.distance_metric = distance_metric
        self.weights = weights
        self.kernel = kernel

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):

        if self.distance_metric == 'euclidean':
            distances = [self._euclidean_distance(x, x_train) for x_train in self.X_train]
        elif self.distance_metric == 'manhattan':
            distances = [self._manhattan_distance(x, x_train) for x_train in self.X_train]
        elif self.distance_metric == 'minkowski':
            distances = [self._minkowski_distance(x, x_train, 3) for x_train in self.X_train]
        else:
            raise ValueError("Distance metric not supported")


        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        k_distances = [distances[i] for i in k_indices]


        if self.weights == 'uniform':
            most_common = Counter(k_nearest_labels).most_common(1)
            return most_common[0][0]
        else:
            if self.weights != 'distance':
                raise ValueError("Weighting method not supported")



            k_weights = [1/(d + 1e-10) for d in k_distances]


            if self.kernel == 'gaussian':
                k_weights = [np.exp(-d**2) for d in k_distances]
            elif self.kernel == 'epanechnikov':
                k_weights = [max(0, 1 - d**2) for d in k_distances]


            weighted_votes = {}
            for i, label in enumerate(k_nearest_labels):
                if label not in weighted_votes:
                    weighted_votes[label] = 0
                weighted_votes[label] += k_weights[i]

            return max(weighted_votes.items(), key=lambda x: x[1])[0]

    def _euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2)**2))

    def _manhattan_distance(self, x1, x2):
        return np.sum(np.abs(x1 - x2))

    def _minkowski_distance(self, x1, x2, p):
        return np.sum(np.abs(x1 - x2)**p)**(1/p)

def knn_cross_validate(X, y, k=5, k_neighbors=3, distance_metric='euclidean', weights='uniform', kernel=None, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]


        knn = KNN(k=k_neighbors, distance_metric=distance_metric, weights=weights, kernel=kernel)
        knn.fit(X_train, y_train)


        predictions = knn.predict(X_test)


        accuracy = np.sum(predictions == y_test) / len(y_test)
        accuracies.append(accuracy)

    return np.mean(accuracies), accuracies

def compare_with_sklearn(X, y, k_neighbors=3, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    our_accuracies = []
    sklearn_accuracies = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]


        knn = KNN(k=k_neighbors)
        knn.fit(X_train, y_train)
        our_predictions = knn.predict(X_test)
        our_accuracy = np.sum(our_predictions == y_test) / len(y_test)
        our_accuracies.append(our_accuracy)


        sklearn_knn = KNeighborsClassifier(n_neighbors=k_neighbors)
        sklearn_knn.fit(X_train, y_train)
        sklearn_predictions = sklearn_knn.predict(X_test)
        sklearn_accuracy = accuracy_score(y_test, sklearn_predictions)
        sklearn_accuracies.append(sklearn_accuracy)


    t_stat, p_value = stats.ttest_rel(our_accuracies, sklearn_accuracies)

    return our_accuracies, sklearn_accuracies, t_stat, p_value


def preprocess_hayes_roth(df):

    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values


    le = LabelEncoder()
    for i in range(X.shape[1]):
        X[:, i] = le.fit_transform(X[:, i])

    y = le.fit_transform(y)

    return X, y

def preprocess_car_evaluation(df):

    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values


    le = LabelEncoder()
    for i in range(X.shape[1]):
        X[:, i] = le.fit_transform(X[:, i])

    y = le.fit_transform(y)

    return X, y

def preprocess_breast_cancer(df):


    df = df.drop(df.columns[0], axis=1)


    X = df.iloc[:, 1:-1].values
    y = df.iloc[:, -1].values


    y = np.where(y == 'M', 1, 0)


    X = np.where(X == '?', np.nan, X).astype(float)
    col_means = np.nanmean(X, axis=0)
    inds = np.where(np.isnan(X))
    X[inds] = np.take(col_means, inds[1])


    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y


def download_file(url, filename):
    response = requests.get(url)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)


    if not os.path.exists('hayes-roth.data'):
        print("Downloading hayes-roth.data...")
        download_file(hayes_roth_url, 'hayes-roth.data')
    if not os.path.exists('car.data'):
        print("Downloading car.data...")
        download_file(car_url, 'car.data')
    if not os.path.exists('wdbc.data'):
        print("Downloading wdbc.data...")
        download_file(bc_url, 'wdbc.data')



    hayes_roth_columns = ['name', 'hobby', 'age', 'educational level', 'marital status', 'class']
    hayes_roth_df = pd.read_csv('hayes-roth.data', names=hayes_roth_columns)
    X_hr, y_hr = preprocess_hayes_roth(hayes_roth_df)


    car_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
    car_df = pd.read_csv('car.data', names=car_columns)
    X_car, y_car = preprocess_car_evaluation(car_df)


    bc_columns = ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
                 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean',
                 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se',
                 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se',
                 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst',
                 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst',
                 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
    bc_df = pd.read_csv('wdbc.data', names=bc_columns)
    X_bc, y_bc = preprocess_breast_cancer(bc_df)

    return {
        'Hayes-Roth': (X_hr, y_hr),
        'Car Evaluation': (X_car, y_car),
        'Breast Cancer': (X_bc, y_bc)
    }


if __name__ == "__main__":

    datasets = load_and_process_datasets()

    results = {}

    for name, (X, y) in datasets.items():
        print(f"Processing {name} dataset...")


        our_acc, sklearn_acc, t_stat, p_value = compare_with_sklearn(X, y, k_neighbors=5)


        results[name] = {
            'our_mean_accuracy': np.mean(our_acc),
            'sklearn_mean_accuracy': np.mean(sklearn_acc),
            't_statistic': t_stat,
            'p_value': p_value,
            'our_accuracies': our_acc,
            'sklearn_accuracies': sklearn_acc
        }

        print(f"{name} - Our KNN: {np.mean(our_acc):.4f}, Sklearn KNN: {np.mean(sklearn_acc):.4f}")
        print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
        print("="*50)


    enhanced_results = {}

    for name, (X, y) in datasets.items():
        print(f"Testing enhanced KNN on {name} dataset...")




        enhanced_accuracies = []
        for config in configs:
            mean_acc, accuracies = knn_cross_validate(
                X, y,
                k=config['k'],
                k_neighbors=config['k'],
                distance_metric=config['distance_metric'],
                weights=config['weights'],
                kernel=config['kernel']
            )
            enhanced_accuracies.append((config, mean_acc))
            print(f"Config: {config} -> Accuracy: {mean_acc:.4f}")



        enhanced_results[name] = enhanced_accuracies
        print("="*50)

Processing Hayes-Roth dataset...
Hayes-Roth - Our KNN: 0.4247, Sklearn KNN: 0.3934
T-statistic: 1.1056, P-value: 0.2976
Processing Car Evaluation dataset...
Car Evaluation - Our KNN: 0.9259, Sklearn KNN: 0.8663
T-statistic: 5.9304, P-value: 0.0002
Processing Breast Cancer dataset...
Breast Cancer - Our KNN: 1.0000, Sklearn KNN: 1.0000
T-statistic: nan, P-value: nan
Testing enhanced KNN on Hayes-Roth dataset...
Config: {'k': 3, 'distance_metric': 'euclidean', 'weights': 'uniform', 'kernel': None} -> Accuracy: 0.4319
Config: {'k': 5, 'distance_metric': 'manhattan', 'weights': 'distance', 'kernel': None} -> Accuracy: 0.4390
Config: {'k': 7, 'distance_metric': 'minkowski', 'weights': 'distance', 'kernel': 'gaussian'} -> Accuracy: 0.3929
Config: {'k': 5, 'distance_metric': 'euclidean', 'weights': 'distance', 'kernel': 'epanechnikov'} -> Accuracy: 0.3863
Testing enhanced KNN on Car Evaluation dataset...
Config: {'k': 3, 'distance_metric': 'euclidean', 'weights': 'uniform', 'kernel': None} ->