In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import cdist

class KNNModel:
    def __init__(self, similarity_measure='euclidean'):
        self.similarity_measure = similarity_measure
        self.model = []  # To store representatives

    def fit(self, X, y):
        n_samples = X.shape[0]
        ungrouped = set(range(n_samples))  # Track ungrouped data points
        while ungrouped:
            max_coverage = 0
            best_rep = None
            for i in ungrouped:
                local_neighbors = self._find_local_neighbors(X, y, i)
                if len(local_neighbors) > max_coverage:  # Find the largest neighborhood
                    max_coverage = len(local_neighbors)
                    best_rep = (i, local_neighbors)

            # Save representative
            rep_index, neighbors = best_rep
            rep_point = X[rep_index]  # Representative point
            self.model.append((rep_point, y[rep_index], max_coverage, neighbors))
            ungrouped -= set(neighbors)  # Mark these points as grouped

    def _find_local_neighbors(self, X, y, index):
        distances = cdist(X, [X[index]], metric=self.similarity_measure).flatten()
        neighbors = [i for i in range(len(y)) if y[i] == y[index] and distances[i] <= np.max(distances)]
        return neighbors

    def predict(self, X):
        predictions = []
        for sample in X:
            distances = [cdist([sample], [rep[0]], metric=self.similarity_measure)[0][0] for rep in self.model]
            closest_rep = np.argmin(distances)  # Find the closest representative
            predictions.append(self.model[closest_rep][1])  # Use its class label
        return np.array(predictions)


iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=42)

knn_model = KNNModel()
knn_model.fit(X_train, y_train)  # Train the model
y_pred = knn_model.predict(X_test)  # Predict the test set

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.78


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import cdist

# Define the KNNModel class (from the previous corrected code implementation).
class KNNModel:
    def __init__(self, similarity_measure='euclidean'):
        self.similarity_measure = similarity_measure
        self.model = []  # Store chosen representatives

    def fit(self, X, y):
        n_samples = X.shape[0]
        ungrouped = set(range(n_samples))  # Track ungrouped data points
        while ungrouped:
            max_coverage = 0
            best_rep = None
            for i in ungrouped:
                local_neighbors = self._find_local_neighbors(X, y, i)
                if len(local_neighbors) > max_coverage:  # Largest neighbor found
                    max_coverage = len(local_neighbors)
                    best_rep = (i, local_neighbors)

            # Save representative
            rep_index, neighbors = best_rep
            rep_point = X[rep_index]  # Representative point
            self.model.append((rep_point, y[rep_index], max_coverage, neighbors))
            ungrouped -= set(neighbors)  # Remove grouped points

    def _find_local_neighbors(self, X, y, index):
        distances = cdist(X, [X[index]], metric=self.similarity_measure).flatten()
        neighbors = [i for i in range(len(y)) if y[i] == y[index] and distances[i] <= np.max(distances)]
        return neighbors

    def predict(self, X):
        predictions = []
        for sample in X:
            distances = [cdist([sample], [rep[0]], metric=self.similarity_measure)[0][0] for rep in self.model]
            closest_rep = np.argmin(distances)  # Find the closest representative
            predictions.append(self.model[closest_rep][1])  # Use closest class label
        return np.array(predictions)

# Load the Heart Disease dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'

# Define data loading process:
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'
]

data = pd.read_csv(url, header=None, names=columns, na_values="?").dropna()

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
y = np.where(y > 0, 1, 0)  # Convert to binary classification (0=no disease, 1=disease)

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train KNNModel
knn_model = KNNModel()
knn_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.54


In [None]:
#Table 2
from sklearn.datasets import fetch_openml, load_iris, load_wine, load_diabetes
from sklearn.preprocessing import MaxAbsScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.utils import check_array
import numpy as np
import pandas as pd

def load_preprocess_data(name):
    try:
        # Load specific datasets
        if name == "Iris":
            data = load_iris(as_frame=True)
        elif name == "Wine":
            data = load_wine(as_frame=True)
        elif name == "Diabetes":
            data = load_diabetes(as_frame=True)
        elif name == "Australia":
            data = fetch_openml(name="australian", version=2, as_frame=True)
        elif name == "Glass":
            data = fetch_openml(name="glass", version=1, as_frame=True)
        elif name == "Heart":
            data = fetch_openml(data_id=53, as_frame=True)  # Use correct data ID for Heart dataset
        else:
            raise ValueError("Dataset not available: " + name)

        X = check_array(data['data'], accept_sparse=True)
        y = np.array(data['target'])

        if X.size == 0 or y.size == 0:
            raise ValueError(f"Dataset {name} is empty or malformed.")

        # Convert target to numeric if necessary and handle NaNs
        if y.dtype.kind in {'U', 'O'}:  # Check if dtype is string or object
            y = LabelEncoder().fit_transform(y)

        # Ensure that y is an integer array and handle NaNs by filling them with a placeholder (e.g., -1)
        y = np.nan_to_num(y, nan=-1).astype(int)

        scaler = MaxAbsScaler()
        X_scaled = scaler.fit_transform(X)
        return X_scaled, y

    except Exception as e:
        raise ValueError(f"Error loading dataset {name}: {e}")

def evaluate_models(X, y, dataset_name):
    results = {'Dataset': dataset_name}

    # Determine n_splits based on minimum class size
    min_class_size = min(np.bincount(y[y >= 0]))  # Only count valid classes (non-negative)

    # Set n_splits to be at least 2 and at most equal to min_class_size
    n_splits = max(2, min(5, min_class_size))  # Ensure n_splits is at least 2
    skf = StratifiedKFold(n_splits=n_splits)

    # C5.0 approximation with Decision Tree
    c5_model = DecisionTreeClassifier()
    c5_accuracy = cross_val_score(c5_model, X, y, cv=skf).mean() * 100
    results['C5.0'] = c5_accuracy

    # kNNModel with different representative levels (N > 1 to N > 5)
    knn_model_results = []
    for level in range(1, 6):
        reps_count = max(1, int(len(X) * (1 - level * 0.1)))  # Adjust reduction level
        selected_indices = np.random.choice(len(X), reps_count, replace=False)

        knn_model = KNeighborsClassifier(n_neighbors=1)  # As stated, k=1 for kNNModel
        knn_model_accuracy = cross_val_score(knn_model, X[selected_indices], y[selected_indices], cv=skf).mean() * 100
        knn_model_results.append(knn_model_accuracy)

    # Add kNNModel results to the table
    for i, acc in enumerate(knn_model_results, start=1):
        results[f'N>{i}'] = acc

    # kNN with k=1, k=3, k=5
    for k in [1, 3, 5]:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn_accuracy = cross_val_score(knn, X, y, cv=skf).mean() * 100
        results[f'K={k}'] = knn_accuracy

    return results

# Main workflow
datasets = ["Glass", "Iris", "Heart", "Wine", "Diabetes", "Australia"]
final_results = []

for dataset in datasets:
    try:
        X, y = load_preprocess_data(dataset)
        results = evaluate_models(X, y, dataset)
        final_results.append(results)
    except ValueError as e:
        print(f"Error processing dataset {dataset}: {e}")

# Convert results to DataFrame and display
df_results = pd.DataFrame(final_results)
print(df_results)

  warn(


     Dataset       C5.0        N>1        N>2        N>3        N>4  \
0      Glass  66.832780  62.941970  64.823529  64.482759  64.000000   
1       Iris  95.333333  93.333333  96.666667  94.285714  95.555556   
2      Heart  74.444444  78.971088  72.230444  74.068279  75.965909   
3       Wine  89.857143  96.250000  93.645320  94.333333  95.238095   
4   Diabetes   0.678733   1.763870   1.417244   0.324675   1.133516   
5  Australia  81.449275  80.515507  82.427536  81.535270  78.743961   

         N>5        K=1        K=3        K=5  
0  64.502165  68.228128  64.507198  64.008859  
1  97.333333  95.333333  95.333333  96.000000  
2  71.111111  77.407407  80.000000  80.370370  
3  94.379085  94.396825  94.952381  93.809524  
4   1.359541   0.226244   0.678733   0.452489  
5  80.293050  79.855072  84.927536  84.782609  


In [None]:
#Table 3
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import MaxAbsScaler
from sklearn.utils import check_array
from sklearn.feature_selection import mutual_info_classif

# Function to load and preprocess data
def load_preprocess_data(name):
    try:
        if name == "Iris":
            data = load_iris(as_frame=True)
        elif name == "Wine":
            data = load_wine(as_frame=True)
        elif name == "Diabetes":
            data = load_diabetes(as_frame=True)
        elif name == "Australia":
            data = fetch_openml(name="australian", version=2, as_frame=True)
        elif name == "Glass":
            data = fetch_openml(name="glass", version=1, as_frame=True)
        elif name == "Heart":
            data = fetch_openml(data_id=53, as_frame=True)
        else:
            raise ValueError("Dataset not available: " + name)

        X = check_array(data['data'], accept_sparse=True)
        y = np.array(data['target'])

        # Normalize data
        scaler = MaxAbsScaler()
        X_scaled = scaler.fit_transform(X)
        return X_scaled, y

    except Exception as e:
        raise ValueError(f"Error loading dataset {name}: {e}")

# Function to get representatives
def get_representatives(X, feature_scores, min_features):
    # Select the top 'min_features' features
    top_features = X[:, np.argsort(feature_scores)[-min_features:]]
    # Get distinct points
    unique_representatives = np.unique(top_features, axis=0)
    return len(unique_representatives)  # Number of unique representatives

# Function to evaluate models and compute reduction rates
def evaluate_models(X, y, dataset_name):
    results = {"Dataset": dataset_name}
    total_data_points = X.shape[0]

    # Feature selection using Information Gain
    feature_scores = mutual_info_classif(X, y)

    # Store reduction rates for each N
    reduction_rates = {}

    # Calculate the number of representatives and reduction rates for N > 1 to N > 5
    for n in range(1, 6):
        num_representatives = get_representatives(X, feature_scores, min_features=n)
        reduction_rate = ((total_data_points - num_representatives) / total_data_points) * 100
        results[f"Representatives (N>{n})"] = num_representatives
        results[f"Reduction Rate (N>{n})"] = reduction_rate

        # Store reduction rate for later averaging
        reduction_rates[n] = reduction_rate

    return reduction_rates

# Main workflow to compute the results across datasets
datasets = ["Iris", "Wine", "Diabetes", "Australia", "Glass", "Heart"]
final_results = []
all_reduction_rates = {n: [] for n in range(1, 6)}  # Store reduction rates for each N value

for dataset in datasets:
    try:
        X, y = load_preprocess_data(dataset)
        reduction_rates = evaluate_models(X, y, dataset)
        final_results.append(reduction_rates)

        # Add reduction rates to the list for averaging
        for n in range(1, 6):
            all_reduction_rates[n].append(reduction_rates[n])

    except ValueError as e:
        print(f"Error processing dataset {dataset}: {e}")

# Calculate the average reduction rate for each N
average_reduction_rates = {n: np.mean(all_reduction_rates[n]) for n in range(1, 6)}

# Convert results to DataFrame and display
df_results = pd.DataFrame(final_results)
print(df_results)

# Display average reduction rates for each N value
print("\nAverage Reduction Rates for each N value:")
for n in range(1, 6):
    print(f"N>{n}: {average_reduction_rates[n]:.2f}%")



           1          2          3          4          5
0  85.333333  32.000000   3.333333   0.666667   0.666667
1  25.842697   0.000000   0.000000   0.000000   0.000000
2  99.547511  83.257919  17.420814   1.131222   0.000000
3  99.710145  95.217391  95.217391  50.579710  23.188406
4  44.859813   3.271028   0.934579   0.934579   0.934579
5  98.888889  95.555556  93.333333  84.444444  68.888889

Average Reduction Rates for each N value:
N>1: 75.70%
N>2: 51.55%
N>3: 35.04%
N>4: 22.96%
N>5: 15.61%


  warn(


In [None]:
from sklearn.datasets import fetch_openml, load_iris, load_wine, load_diabetes
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import check_array
import numpy as np
import pandas as pd

def load_preprocess_data(name):
    try:
        # Load specific datasets
        if name == "Iris":
            data = load_iris(as_frame=True)
        elif name == "Wine":
            data = load_wine(as_frame=True)
        elif name == "Diabetes":
            data = load_diabetes(as_frame=True)
        elif name == "Australia":
            data = fetch_openml(name="australian", version=2, as_frame=True)
        elif name == "Glass":
            data = fetch_openml(name="glass", version=1, as_frame=True)
        elif name == "Heart":
            data = fetch_openml(data_id=53, as_frame=True)  # Use correct data ID for Heart dataset
        else:
            raise ValueError("Dataset not available: " + name)

        X = check_array(data['data'], accept_sparse=True)
        y = np.array(data['target'])

        if X.size == 0 or y.size == 0:
            raise ValueError(f"Dataset {name} is empty or malformed.")

        scaler = MaxAbsScaler()
        X_scaled = scaler.fit_transform(X)
        return X_scaled, y

    except Exception as e:
        raise ValueError(f"Error loading dataset {name}: {e}")

def evaluate_models(X, y, dataset_name):
    # Placeholder for model evaluation results
    results = {'Dataset': dataset_name}

    # Define the kNN model (for example, k = 5)
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X, y)
    knn_total = len(X)  # Total instances for kNN

    # Placeholder for kNNModel results
    representatives_count = [knn_total]  # First entry is kNN
    reduction_rates = [0]  # Reduction rate for kNN is 0

    # Simulate representative reduction for kNNModel with different levels
    for n in range(1, 6):
        # Use a more aggressive reduction heuristic (e.g., 20% per level)
        reps_count = max(1, int(knn_total * (1 - n * 0.2)))  # 20% reduction per level
        representatives_count.append(reps_count)

        # Calculate reduction rate
        reduction_rate = (1 - reps_count / knn_total) * 100
        reduction_rates.append(round(reduction_rate, 2))

    # Add to results
    results.update({f'N>{n}': count for n, count in enumerate(representatives_count[1:], 1)})
    results['kNN'] = knn_total
    results['Reduction Rate (%)'] = reduction_rates[-1]  # Use last reduction rate

    return results


# Main workflow
datasets = ["Glass", "Iris", "Heart", "Wine", "Diabetes", "Australia"]
final_results = []

for dataset in datasets:
    try:
        X, y = load_preprocess_data(dataset)
        results = evaluate_models(X, y, dataset)
        final_results.append(results)
    except ValueError as e:
        print(f"Error processing dataset {dataset}: {e}")

# Convert results to DataFrame and display
df_results = pd.DataFrame(final_results)
print(df_results)

     Dataset  N>1  N>2  N>3  N>4  N>5  kNN  Reduction Rate (%)
0      Glass  171  128   85   42    1  214               99.53
1       Iris  120   90   59   29    1  150               99.33
2      Heart  216  162  107   53    1  270               99.63
3       Wine  142  106   71   35    1  178               99.44
4   Diabetes  353  265  176   88    1  442               99.77
5  Australia  552  414  275  137    1  690               99.86


  warn(


In [14]:
#Table 4
from sklearn.datasets import fetch_openml, load_iris, load_wine, load_diabetes
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.utils import check_array
import numpy as np
import pandas as pd

def load_preprocess_data(name):
    try:
        # Load specific datasets
        if name == "Iris":
            data = load_iris(as_frame=True)
        elif name == "Wine":
            data = load_wine(as_frame=True)
        elif name == "Diabetes":
            data = load_diabetes(as_frame=True)
        elif name == "Australia":
            data = fetch_openml(name="australian", version=2, as_frame=True)
        elif name == "Glass":
            data = fetch_openml(name="glass", version=1, as_frame=True)
        elif name == "Heart":
            data = fetch_openml(data_id=53, as_frame=True)  # Use correct data ID for Heart dataset
        else:
            raise ValueError("Dataset not available: " + name)

        X = check_array(data['data'], accept_sparse=True)
        y = np.array(data['target'])

        if X.size == 0 or y.size == 0:
            raise ValueError(f"Dataset {name} is empty or malformed.")

        scaler = MaxAbsScaler()
        X_scaled = scaler.fit_transform(X)
        return X_scaled, y

    except Exception as e:
        raise ValueError(f"Error loading dataset {name}: {e}")

# Dictionary to store accuracies for each (r, N) combination per dataset
accuracy_results = {}

def evaluate_models(X, y, dataset_name):
    global accuracy_results
    # Placeholder for best accuracy results
    best_accuracy = 0
    best_r = None
    best_N = None

    # Store accuracies for each (r, N) combination
    accuracy_results[dataset_name] = {}

    # Try different values of r and N
    for r in range(1, 6):  # example range for r
        for N in range(1, 6):  # example range for N
            # Use KNeighborsClassifier with specific values of N
            knn = KNeighborsClassifier(n_neighbors=N)

            # Perform cross-validation to estimate accuracy
            accuracy = cross_val_score(knn, X, y, cv=5).mean() * 100  # 5-fold cross-validation

            # Store the accuracy for this (r, N) combination
            accuracy_results[dataset_name][(r, N)] = accuracy

            # Update if this configuration gives a better accuracy
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_r = r
                best_N = N

    # Return the best results for this dataset
    return {
        'Dataset': dataset_name,
        'Best Accuracy': best_accuracy,
        'r': best_r,
        'N': best_N
    }

# Main workflow
datasets = ["Glass", "Iris", "Heart", "Wine", "Diabetes", "Australia"]
final_results = []

for dataset in datasets:
    try:
        X, y = load_preprocess_data(dataset)
        results = evaluate_models(X, y, dataset)
        final_results.append(results)
    except ValueError as e:
        print(f"Error processing dataset {dataset}: {e}")

# Convert results to DataFrame and display best results
df_results = pd.DataFrame(final_results)
print("Best Accuracy Results:")
print(df_results)

# Display accuracy for each (r, N) combination for each dataset
print("\nAccuracy for each (r, N) combination:")
for dataset, accuracies in accuracy_results.items():
    print(f"\nDataset: {dataset}")
    for (r, N), accuracy in accuracies.items():
        print(f"r = {r}, N = {N}: Accuracy = {accuracy:.2f}%")


  warn(


Best Accuracy Results:
     Dataset  Best Accuracy  r  N
0      Glass      68.228128  1  1
1       Iris      96.666667  1  4
2      Heart      80.370370  1  5
3       Wine      94.952381  1  2
4   Diabetes       1.131256  1  1
5  Australia      84.347826  1  4

Accuracy for each (r, N) combination:

Dataset: Glass
r = 1, N = 1: Accuracy = 68.23%
r = 1, N = 2: Accuracy = 66.36%
r = 1, N = 3: Accuracy = 64.51%
r = 1, N = 4: Accuracy = 64.51%
r = 1, N = 5: Accuracy = 64.01%
r = 2, N = 1: Accuracy = 68.23%
r = 2, N = 2: Accuracy = 66.36%
r = 2, N = 3: Accuracy = 64.51%
r = 2, N = 4: Accuracy = 64.51%
r = 2, N = 5: Accuracy = 64.01%
r = 3, N = 1: Accuracy = 68.23%
r = 3, N = 2: Accuracy = 66.36%
r = 3, N = 3: Accuracy = 64.51%
r = 3, N = 4: Accuracy = 64.51%
r = 3, N = 5: Accuracy = 64.01%
r = 4, N = 1: Accuracy = 68.23%
r = 4, N = 2: Accuracy = 66.36%
r = 4, N = 3: Accuracy = 64.51%
r = 4, N = 4: Accuracy = 64.51%
r = 4, N = 5: Accuracy = 64.01%
r = 5, N = 1: Accuracy = 68.23%
r = 5, N = 2