In [82]:
from scipy.stats import zscore
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors
import json
from functions import *
import os

In [83]:
def detect_outliers_in_bead(bead_points, threshold=2):
    """Detect outliers within a bead using Z-score method."""
    # Convert bead_points to a NumPy array to handle numeric calculations properly
    bead_points = np.array(bead_points)

    # Compute Z-scores for all points in the bead
    z_scores = np.abs(zscore(bead_points, axis=0))

    # Identify points where any feature has a Z-score greater than the threshold
    outlier_mask = np.any(z_scores > threshold, axis=1)

    # Separate outliers and inliers
    inliers = bead_points[~outlier_mask]
    outliers = bead_points[outlier_mask]

    return inliers, outliers

In [84]:
import numpy as np
import json


def convert_ndarray_to_list(obj):
    """Recursively convert NumPy arrays to lists."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_ndarray_to_list(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_ndarray_to_list(item) for item in obj]
    else:
        return obj


def implement_kmeans(file_path, k, num_beads, output_path, X):
    features = X[:, :-1]
    labels = X[:, -1]
    y_kmeans, centers = apply_kmeans(features, k)
    cluster_points = store_cluster(features, y_kmeans, k)
    all_beads = store_and_print_beads(cluster_points, num_beads)

    output_data = []
    data_dimension = features.shape[1]
    output_data.append({"data_dimension": data_dimension})

    for i, (cluster_beads, b_center) in enumerate(all_beads):
        bead_analysis_results = analyze_beads([(cluster_beads, b_center)])
        cluster_center = centers[i]
        cluster_data = {
            "cluster_number": i + 1,
            "cluster_center": cluster_center.tolist(),
            "beads": [],
        }
        print(f"Cluster {i + 1} Beads:")

        bead_number = 1  # Initialize bead number counter

        for j, result in enumerate(bead_analysis_results[0]):
            best_p, best_norm = result
            bead_center = np.mean(cluster_beads[j], axis=0)

            # Detect outliers within the bead
            inliers, outliers = detect_outliers_in_bead(cluster_beads[j])

            # Handle the inliers as part of the main bead
            bead_info = {
                "bead_number": bead_number,
                "best_p": best_p,
                "lp_norm": best_norm,
                "bead_center": bead_center.tolist(),
                "data_points": [],
            }

            for point in inliers:
                point_index = np.where((features == point).all(axis=1))[0][0]
                point_label = labels[point_index]
                bead_info["data_points"].append(
                    {
                        "coordinates": point.tolist(),
                        "label": int(point_label),
                    }
                )

            cluster_data["beads"].append(bead_info)
            bead_number += 1  # Increment bead number for next bead

            # Treat outliers as a separate bead but calculate best_p, lp_norm, and bead center
            if len(outliers) > 0:
                # Calculate p-norm and bead center for outlier bead
                outlier_bead_center = np.mean(outliers, axis=0)
                outlier_best_p, outlier_best_norm = calculate_and_find_best_p(outliers)

                outlier_bead_info = {
                    "bead_number": bead_number,
                    "best_p": outlier_best_p,
                    "lp_norm": outlier_best_norm,
                    "bead_center": outlier_bead_center.tolist(),
                    "data_points": [],
                }

                for point in outliers:
                    point_index = np.where((features == point).all(axis=1))[0][0]
                    point_label = labels[point_index]
                    outlier_bead_info["data_points"].append(
                        {
                            "coordinates": point.tolist(),
                            "label": int(point_label),
                        }
                    )

                cluster_data["beads"].append(outlier_bead_info)
                bead_number += 1  # Increment bead number for next outlier bead
                print(
                    f"  Outlier bead treated as normal with {len(outliers)} points, Best p = {outlier_best_p}, Best l_p norm = {outlier_best_norm}"
                )

        output_data.append(cluster_data)

    # Recursively convert NumPy arrays in output_data to lists before writing to JSON
    output_data = convert_ndarray_to_list(output_data)

    with open(output_path, "w") as json_file:
        json.dump(output_data, json_file, indent=4)

    return output_data

In [85]:
def file_dataset(file_path):
    # Load data
    data = pd.read_csv(file_path)

    # Drop the 'Id' column if it exists
    if "Id" in data.columns:
        data = data.drop(columns=["Id"])

    # Identify numeric columns
    numeric_columns = data.select_dtypes(include=[np.number]).columns

    # Fill missing values with the mean
    data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

    # Encode categorical variables
    label_encoders = {}
    for column in data.select_dtypes(include=["object"]):
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

    # Apply to_numeric to all columns
    data = data.apply(pd.to_numeric, errors="ignore")

    def custom_scale(series):
        # Separate positive and negative values
        pos = series[series >= 0]
        neg = series[series < 0]

        # Min-Max scale positive values to [0, 1]
        if len(pos) > 0:
            pos = (pos - pos.min()) / (pos.max() - pos.min())

        # Min-Max scale negative values to [-1, 0]
        if len(neg) > 0:
            neg = (neg - neg.max()) / (neg.min() - neg.max())

        # Combine the scaled values
        return pd.concat([pos, neg]).reindex(series.index)

    # Normalize numeric columns
    data[numeric_columns] = data[numeric_columns].apply(custom_scale)

    return data.values  # Return array of arrays


In [86]:
def implement_cure(file_path, k, num_beads, output_path, X):
    features = X[:, :-1]
    labels = X[:, -1]
    y_kmeans, centers = apply_kmeans(features, k)
    cluster_points = store_cluster(features, y_kmeans, k)
    all_beads = store_and_print_beads(cluster_points, num_beads)
    output_data = []
    data_dimension = features.shape[1]
    output_data.append({"data_dimension": data_dimension})

    representatives = int(input("number of representative points per bead: "))
    cured_X = cureBeads(all_beads, representatives)

    for i, (cluster_beads, b_center) in enumerate(cured_X):
        bead_analysis_results = analyze_beads([(cluster_beads, b_center)])
        cluster_center = centers[i]
        cluster_data = {
            "cluster_number": i + 1,
            "cluster_center": cluster_center.tolist(),
            "beads": [],
        }
        print(f"Cluster {i + 1} Beads:")

        # Use nearest neighbors to find the closest points
        nbrs = NearestNeighbors(n_neighbors=1).fit(features)

        for j, result in enumerate(bead_analysis_results[0]):
            best_p, best_norm = result
            bead_center = np.mean(cluster_beads[j], axis=0)
            bead_info = {
                "bead_number": j + 1,
                "best_p": best_p,
                "lp_norm": best_norm,
                "bead_center": bead_center.tolist(),
                "data_points": [],
            }
            for point in cluster_beads[j]:
                distances, indices = nbrs.kneighbors([point])
                nearest_index = indices[0][0]
                point_label = labels[nearest_index]
                bead_info["data_points"].append(
                    {
                        "coordinates": features[nearest_index].tolist(),
                        "label": int(point_label),
                    }
                )
            cluster_data["beads"].append(bead_info)
            print(f"  Bead {j + 1}: Best p = {best_p}, Best l_p norm = {best_norm}")
        output_data.append(cluster_data)

    with open(output_path, "w") as json_file:
        json.dump(output_data, json_file, indent=4)

    return output_data

In [87]:
def check_and_convert_excel(file_path):
    """Check if the input file is an Excel file and convert it to CSV if needed."""
    if file_path.endswith(".xls") or file_path.endswith(".xlsx"):
        try:
            # Specify the engine manually to avoid ValueError
            excel_data = pd.read_excel(file_path, engine="openpyxl")
        except ValueError:
            # Try using a different engine if the first fails
            excel_data = pd.read_excel(file_path, engine="xlrd")
        csv_path = file_path.replace(".xlsx", ".csv").replace(".xls", ".csv")
        excel_data.to_csv(csv_path, index=False)
        return csv_path
    return file_path

In [88]:
if __name__ == "__main__":
    # file_path = input("Enter the path to the CSV file: ")
    # k = int(input("Enter the number of clusters (k): "))
    # num_beads = int(input("Enter the number of beads per cluster: "))
    # output_path = input("Enter the output path for the clusters and beads JSON: ")
    # file_path = "/home/bipasha/Desktop/research/Data_Viz_Beads/CODE/old/Iris.csv"\
    # file_path = "/home/bipasha/Desktop/research/Data_Viz_Beads/CODE/new/dataset/Customers.csv"
    # file_path = "/home/bipasha/Desktop/research/Data_Viz_Beads/CODE/new/dataset/diabetes.csv"
    file_path = "/home/bipasha/Desktop/research/Data_Viz_Beads/CODE/dataset/Iris.csv"
    # file_path = "/home/bipasha/Desktop/research/Data_Viz_Beads/CODE/new/dataset/User Knowledge.xls"
    # file_path = check_and_convert_excel(file_path)
    k = 4
    num_beads = 6
    output_path = "2.json"
    cure = str(input("Do you want to apply cure? (y/n):"))
    X = file_dataset(file_path)
    # print(X)
    if cure == "y":
        implement_cure(file_path, k, num_beads, output_path, X)
    else:
        implement_kmeans(file_path, k, num_beads, output_path, X)

Cluster 1 Beads:
  Outlier bead treated as normal with 1 points, Best p = 5.0, Best l_p norm = (5.0, np.float64(0.0), array([0.47222222, 0.08333333, 0.50847458, 0.375     ]))
  Outlier bead treated as normal with 3 points, Best p = 5.0, Best l_p norm = (5.0, np.float64(0.08402838605557557), array([0.41666667, 0.29166667, 0.52542373, 0.375     ]))
Cluster 2 Beads:
  Outlier bead treated as normal with 3 points, Best p = 5.0, Best l_p norm = (5.0, np.float64(0.11141283173268401), array([0.19444444, 0.625     , 0.10169492, 0.20833333]))
  Outlier bead treated as normal with 2 points, Best p = 5.0, Best l_p norm = (5.0, np.float64(0.07057816132614043), array([0.        , 0.41666667, 0.01694915, 0.        ]))
  Outlier bead treated as normal with 2 points, Best p = 5.0, Best l_p norm = (5.0, np.float64(0.07213130481490049), array([0.25      , 0.875     , 0.08474576, 0.        ]))
Cluster 3 Beads:
  Outlier bead treated as normal with 1 points, Best p = 5.0, Best l_p norm = (5.0, np.float64(

  data = data.apply(pd.to_numeric, errors="ignore")


TypeError: Object of type ndarray is not JSON serializable