## 1. Load Required Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from collections import defaultdict
import logging
import matplotlib.pyplot as plt


## 2. Define Distance Calculation Function
This function computes the distance matrix between the test and training sets using either cosine distance or Euclidean distance.

In [2]:
# Configure the root logger: emit INFO-level records and above, each formatted
# as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def calculate_distance_matrix(X_train, X_test, metric="cosine"):
    """
    Build the pairwise test-to-train distance matrix for the chosen metric.

    Args:
        X_train (np.ndarray): Training samples.
        X_test (np.ndarray): Test samples.
        metric (str): Name of the distance to use, "cosine" or "euclidean".

    Returns:
        np.ndarray: Distance matrix of shape (len(X_test), len(X_train)).

    Raises:
        ValueError: If ``metric`` is neither "cosine" nor "euclidean".
    """
    # Reject unknown metrics first so the supported cases read as plain returns.
    if metric not in ("cosine", "euclidean"):
        raise ValueError("Unsupported metric. Use 'cosine' or 'euclidean'.")

    if metric == "euclidean":
        return euclidean_distances(X_test, X_train)
    return cosine_distances(X_test, X_train)


## 3. k-NN Prediction
This function predicts labels for the test set by finding the k-nearest neighbors and performing majority voting.

In [3]:
def knn_predict(distance_matrix, y_train, k):
    """
    Predict labels for the test set using k-NN with majority voting.

    Args:
        distance_matrix (np.ndarray): Distances with one row per test sample and
            one column per training sample.
        y_train (np.ndarray): Training labels, aligned with the columns of
            ``distance_matrix``. Any sortable label type is accepted
            (non-negative or negative integers, floats, strings).
        k (int): Number of nearest neighbors; must be at least 1.

    Returns:
        np.ndarray: Predicted label for each row of ``distance_matrix``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # A non-positive k would either select no neighbours or, for negative
    # values, silently slice "all but the last |k|" neighbours.
    if k < 1:
        raise ValueError("k must be a positive integer.")

    y_train = np.asarray(y_train)
    predictions = []
    for distances in distance_matrix:
        neighbors_idx = np.argsort(distances)[:k]
        # np.unique returns the labels sorted, so argmax over the counts breaks
        # ties in favour of the smallest label (the same rule np.bincount gave)
        # while also working for negative, float and string labels.
        labels, counts = np.unique(y_train[neighbors_idx], return_counts=True)
        predictions.append(labels[counts.argmax()])
    return np.array(predictions)


## 4. Performance Metrics
This function manually calculates precision, recall, F1-score, and the true-positive, false-positive, and false-negative counts for each class, along with overall accuracy and the macro-averaged F1-score.


In [4]:
def calculate_class_metrics(y_true, y_pred, classes):
    """
    Calculate per-class precision/recall/F1 plus macro, micro and accuracy summaries.

    Args:
        y_true (np.ndarray): Ground-truth labels.
        y_pred (np.ndarray): Predicted labels, aligned with ``y_true``.
        classes (np.ndarray): Class labels to report on.

    Returns:
        dict: One entry per class holding "Precision", "Recall", "F1", "TP",
        "FP" and "FN", plus:
            - "Macro Average": {"F1": unweighted mean of the per-class F1 scores}
            - "Micro Average": {"Precision", "Recall", "F1"} computed from the
              TP/FP/FN counts summed over ``classes``
            - "Accuracy": sum of per-class true positives divided by
              ``len(y_true)`` (0.0 when ``y_true`` is empty)
    """

    def ratio(numerator, denominator):
        # Undefined ratios (nothing predicted / nothing to find) are reported as 0.0.
        return numerator / denominator if denominator > 0 else 0.0

    def harmonic_mean(precision, recall):
        return ratio(2 * precision * recall, precision + recall)

    # Accept plain sequences as well as arrays so the element-wise masks work.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    metrics = {}
    total_tp, total_fp, total_fn = 0, 0, 0

    for cls in classes:
        tp = int(np.count_nonzero((y_true == cls) & (y_pred == cls)))
        fp = int(np.count_nonzero((y_true != cls) & (y_pred == cls)))
        fn = int(np.count_nonzero((y_true == cls) & (y_pred != cls)))

        precision = ratio(tp, tp + fp)
        recall = ratio(tp, tp + fn)

        metrics[cls] = {
            "Precision": precision,
            "Recall": recall,
            "F1": harmonic_mean(precision, recall),
            "TP": tp,
            "FP": fp,
            "FN": fn,
        }

        total_tp += tp
        total_fp += fp
        total_fn += fn

    per_class_f1 = [metrics[cls]["F1"] for cls in classes]
    micro_precision = ratio(total_tp, total_tp + total_fp)
    micro_recall = ratio(total_tp, total_tp + total_fn)

    metrics["Macro Average"] = {"F1": float(np.mean(per_class_f1)) if per_class_f1 else 0.0}
    metrics["Micro Average"] = {
        "Precision": micro_precision,
        "Recall": micro_recall,
        "F1": harmonic_mean(micro_precision, micro_recall),
    }
    # Every correct prediction is a true positive of exactly one class.
    metrics["Accuracy"] = ratio(total_tp, len(y_true))

    return metrics


## 5. k-NN with Cross-Validation
Perform stratified 10-fold cross-validation and evaluate k-NN.

In [5]:
def knn_with_cross_validation(X, y, k_values, metric="cosine", n_splits=10):
    """
    Perform k-NN with stratified cross-validation and evaluate performance.

    The folds come from a ``StratifiedKFold`` with a fixed seed, so every k is
    scored on the same splits. Each fold's distance matrix is computed once and
    reused for all k values instead of being rebuilt per k.

    Args:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Labels.
        k_values (list): List of k values to evaluate.
        metric (str): Distance metric ("cosine" or "euclidean").
        n_splits (int): Number of cross-validation folds.

    Returns:
        dict: Maps each k to ``{"Accuracy": mean accuracy over the folds}``,
        in the order the k values were given.
    """
    # Keep the caller's ordering while collapsing repeated k values.
    unique_ks = list(dict.fromkeys(k_values))
    if not unique_ks:
        return {}

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    classes = np.unique(y)
    fold_accuracies = {k: [] for k in unique_ks}

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # The distances do not depend on k, so compute them once per fold.
        distance_matrix = calculate_distance_matrix(X_train, X_test, metric)

        for k in unique_ks:
            y_pred = knn_predict(distance_matrix, y_train, k)
            metrics = calculate_class_metrics(y_test, y_pred, classes)
            fold_accuracies[k].append(metrics["Accuracy"])

    return {k: {"Accuracy": np.mean(accuracies)} for k, accuracies in fold_accuracies.items()}

## 6. Run k-NN Classification
Set up data paths, load the dataset, and evaluate performance.

In [6]:
def plot_knn_results(results, k_values):
    """
    Draw a line chart of accuracy against the number of neighbors.

    Args:
        results (dict): Mapping from k to a dict that holds an "Accuracy" entry.
        k_values (list): The k values to plot, in x-axis order.
    """
    accuracy_by_k = []
    for k in k_values:
        accuracy_by_k.append(results[k]["Accuracy"])

    plt.figure(figsize=(10, 6))
    plt.plot(k_values, accuracy_by_k, marker="o", label="Accuracy")

    # Titles, axis labels and decorations.
    plt.title("k-NN Performance Across k Values")
    plt.xlabel("k (Number of Neighbors)")
    plt.ylabel("Accuracy")
    plt.grid()
    plt.legend()

    plt.show()


In [9]:
def cosine_similarity(vector1, vector2):
    """
    Return the cosine of the angle between two vectors.

    Args:
        vector1 (np.ndarray): First vector.
        vector2 (np.ndarray): Second vector.

    Returns:
        float: Dot product divided by the product of the two Euclidean norms,
        or 0.0 when either vector has zero norm.
    """
    numerator = np.dot(vector1, vector2)
    length_a, length_b = np.linalg.norm(vector1), np.linalg.norm(vector2)

    # A zero-norm vector would make the denominator zero; report 0.0 instead.
    if not (length_a and length_b):
        return 0.0

    return numerator / (length_a * length_b)


def calculate_performance_metrics(actual_labels, predicted_labels):
    """
    Calculate precision, recall, and F1-Score for each class.

    Classes are taken from the distinct values of ``actual_labels``; a label
    that only ever appears in ``predicted_labels`` gets no entry of its own.

    Args:
        actual_labels (np.ndarray): Ground-truth labels.
        predicted_labels (np.ndarray): Predicted labels, aligned with ``actual_labels``.

    Returns:
        dict: Maps each class to a dict with 'True Positives', 'False Positives',
        'False Negatives', 'Precision', 'Recall' and 'F1-Score'. A ratio whose
        denominator is zero is reported as 0.0.
    """
    # Accept plain sequences as well as arrays so the element-wise masks work.
    actual_labels = np.asarray(actual_labels)
    predicted_labels = np.asarray(predicted_labels)

    classes = np.unique(actual_labels)
    metrics = {}

    for document_class in classes:
        tp = np.sum((actual_labels == document_class) & (predicted_labels == document_class))
        fp = np.sum((actual_labels != document_class) & (predicted_labels == document_class))
        fn = np.sum((actual_labels == document_class) & (predicted_labels != document_class))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        # Precision and the false-negative count are reported alongside the
        # other values so every quantity computed above reaches the caller.
        metrics[document_class] = {
            'True Positives': tp,
            'False Positives': fp,
            'False Negatives': fn,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1_score
        }

    return metrics


def k_fold(X, y, n_splits=10):
    """
    Produce the stratified, shuffled train/test index pairs for cross-validation.

    Args:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Labels used for stratification.
        n_splits (int): Number of folds.

    Returns:
        list: One ``(train_idx, test_idx)`` tuple per fold.
    """
    # Fixed seed so repeated calls yield the same folds.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    folds = []
    for train_idx, test_idx in splitter.split(X, y):
        folds.append((train_idx, test_idx))
    return folds


def predict(train_data, train_labels, test_point, k, metric="cosine"):
    """
    Predict the class of a single test point using k-NN.

    Neighbours are ranked by descending cosine similarity or ascending
    Euclidean distance. Equal scores keep their training-set order, and a tied
    vote goes to the class whose first tied member is nearest, so the result
    does not depend on label hashing or label ordering.

    Args:
        train_data (np.ndarray): Training data, one row per sample.
        train_labels (np.ndarray): Training labels, aligned with ``train_data`` rows.
        test_point (np.ndarray): Test data point.
        k (int): Number of neighbors; must be at least 1.
        metric (str): Distance metric ("cosine" or "euclidean").

    Returns:
        Predicted class label (an element of ``train_labels``).

    Raises:
        ValueError: If ``metric`` is unsupported, ``k`` is smaller than 1, or
            ``train_data`` has no samples.
    """
    if metric not in ("cosine", "euclidean"):
        raise ValueError("Unsupported metric. Use 'cosine' or 'euclidean'.")
    if k < 1:
        raise ValueError("k must be a positive integer.")

    train_data = np.asarray(train_data)
    train_labels = np.asarray(train_labels)
    test_point = np.asarray(test_point)
    if train_data.shape[0] == 0:
        raise ValueError("train_data must contain at least one sample.")

    if metric == "cosine":
        train_norms = np.linalg.norm(train_data, axis=1)
        test_norm = np.linalg.norm(test_point)
        # Zero-norm vectors have no direction; their similarity stays at 0.0.
        similarities = np.zeros(train_data.shape[0], dtype=float)
        np.divide(
            train_data @ test_point,
            train_norms * test_norm,
            out=similarities,
            where=(train_norms > 0) & (test_norm > 0),
        )
        # Negate so that an ascending sort puts the most similar sample first.
        ranking = -similarities
    else:
        ranking = np.linalg.norm(train_data - test_point, axis=1)

    # A stable sort keeps equal scores in training-set order.
    nearest = np.argsort(ranking, kind="stable")[:k]

    # Majority voting: dicts preserve insertion order and max() returns the
    # first maximum, so ties resolve to the class encountered nearest first.
    votes = {}
    for label in train_labels[nearest]:
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)


def train_and_evaluate(X, y, k_values, metric="cosine", n_splits=10):
    """
    Score k-NN for several k values on one shared set of stratified folds.

    Args:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Labels.
        k_values (list): The k values to evaluate.
        metric (str): Distance metric ("cosine" or "euclidean").
        n_splits (int): Number of folds.

    Returns:
        dict: Maps each k to ``{"avg_accuracy": mean fold accuracy,
        "fold_metrics": list of per-fold class-metric dicts}``.
    """
    # The folds are built once and reused, so every k sees the same splits.
    folds = k_fold(X, y, n_splits)
    results = {}

    for k in k_values:
        logging.info(f"Evaluating k={k} with metric={metric}")

        accuracies, per_fold_metrics = [], []
        for train_idx, test_idx in folds:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Classify the held-out points one at a time.
            predicted = np.array([predict(X_train, y_train, row, k, metric) for row in X_test])

            accuracies.append(np.mean(predicted == y_test))
            per_fold_metrics.append(calculate_performance_metrics(y_test, predicted))

        results[k] = {
            "avg_accuracy": np.mean(accuracies),
            "fold_metrics": per_fold_metrics
        }

    return results


def get_best_k(results):
    """
    Pick the k with the highest average accuracy.

    Args:
        results (dict): Mapping from k to a dict with "avg_accuracy" and
            "fold_metrics" entries, as returned by train_and_evaluate.

    Returns:
        dict: "best_k", "best_accuracy" and "detailed_metrics" for the winning k.
    """
    # max() returns the first maximal entry, so ties go to the k listed first.
    winner, summary = max(results.items(), key=lambda entry: entry[1]["avg_accuracy"])

    return dict(
        best_k=winner,
        best_accuracy=summary["avg_accuracy"],
        detailed_metrics=summary["fold_metrics"],
    )


## 7. View Results

In [None]:
# Load dataset
# The TF-IDF table is read from a CSV given as a relative path.
tfidf_path = "../reports/tfidf_values.csv"
tfidf_data = pd.read_csv(tfidf_path)

# Every column except the last is used as a feature; the last column is the label.
X = tfidf_data.iloc[:, :-1].values
y = tfidf_data.iloc[:, -1].values

# Sanity check that features and labels line up row for row.
assert X.shape[0] == len(y), "Feature matrix and labels must have the same number of samples!"

# Define parameters
# Candidate neighbour counts and the distance metrics to compare.
k_values = [1, 3, 5, 7, 9]
metrics = ["cosine", "euclidean"]

# Evaluate k-NN for each metric
for metric in metrics:
    logging.info(f"Evaluating k-NN with {metric} metric")
    results = train_and_evaluate(X, y, k_values, metric)

    # Get best k
    # The reported accuracy is formatted to two decimal places.
    best_result = get_best_k(results)
    logging.info(f"Best k for {metric}: {best_result['best_k']} with accuracy: {best_result['best_accuracy']:.2f}")

    # Print detailed metrics for best k
    # "detailed_metrics" is the list of per-fold metric dicts for the winning k.
    logging.info(f"Detailed metrics for k={best_result['best_k']}:\n{best_result['detailed_metrics']}")



IndentationError: unexpected indent (2795141044.py, line 2)