In [1]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output

# Module-level store shared with the widget callbacks of outlier_replacement_tool;
# the tool saves the KNN "K Value" slider here under the key 'KNN_k_slider'.
method_settings = {}

# Helper function to replace outliers using different methods
def replace_outliers(data, method, k=None, feature_values=None):
    """Cap the outliers of one numeric column and report the bounds that were used.

    Parameters
    ----------
    data : pandas.Series
        Column to treat. Its index is reused for the returned Series.
    method : str
        One of:
        * 'Whisker'       - clip to [Q1 - 1.5*IQR, Q3 + 1.5*IQR];
        * '95 Percentile' - clip to the 2.5 % / 97.5 % quantiles;
        * '99 Percentile' - clip to the 0.5 % / 99.5 % quantiles;
        * 'KNN'           - flag points whose mean distance to their ``k`` nearest
          neighbours has an absolute z-score above 2, then clip to the midpoint
          between the nearest flagged point and the closest unflagged value.
    k : int, optional
        Number of neighbours for 'KNN' (required for that method, ignored by the
        others). Values larger than the number of non-NaN samples are reduced to
        that number.
    feature_values : array-like, optional
        Raw values used by 'KNN'. Defaults to the values of ``data``.

    Returns
    -------
    tuple
        ``(treated, lower_bound, upper_bound)`` where ``treated`` is a Series
        aligned with ``data``. NaN entries are left untouched.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names, or if 'KNN' is requested
        without a positive ``k``.
    """
    if method == 'Whisker':
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        return data.clip(lower_bound, upper_bound), lower_bound, upper_bound

    # Both percentile methods share the same logic; only the quantile pair differs.
    percentile_bounds = {
        '95 Percentile': (0.025, 0.975),
        '99 Percentile': (0.005, 0.995),
    }
    if method in percentile_bounds:
        lower_q, upper_q = percentile_bounds[method]
        lower_bound = data.quantile(lower_q)
        upper_bound = data.quantile(upper_q)
        return data.clip(lower_bound, upper_bound), lower_bound, upper_bound

    if method == 'KNN':
        if k is None or k < 1:
            raise ValueError("The 'KNN' method requires a positive integer k.")

        # Imported lazily so the quantile-based methods work without scikit-learn.
        from sklearn.neighbors import NearestNeighbors

        threshold = 2  # z-score threshold for the outlier detection

        values = np.asarray(data if feature_values is None else feature_values)
        observed = values[~np.isnan(values)]  # Exclude NaN values

        if observed.size == 0:
            # Nothing to fit on: return the column unchanged with undefined bounds.
            return pd.Series(values, index=data.index), np.nan, np.nan

        # Step 1: KNN fit (k cannot exceed the number of fitted samples)
        points = observed.reshape(-1, 1)
        knn = NearestNeighbors(n_neighbors=min(int(k), observed.size))
        knn.fit(points)

        # Step 2: Mean distance of every point to its k neighbours
        distances, _ = knn.kneighbors(points)
        avg_distances = distances.mean(axis=1)

        # Step 3: Flag outliers by z-score; a zero spread means nothing stands out
        spread = avg_distances.std()
        if spread > 0:
            z_scores = (avg_distances - avg_distances.mean()) / spread
            outlier_mask = np.abs(z_scores) > threshold
        else:
            outlier_mask = np.zeros(observed.shape, dtype=bool)

        # Step 4: Determine lower and upper bound
        outliers = observed[outlier_mask]
        inliers = observed[~outlier_mask]
        inlier_min = inliers.min()
        inlier_max = inliers.max()
        low_outliers = outliers[outliers < inlier_min]
        high_outliers = outliers[outliers > inlier_max]

        # Midpoint between the closest outlier and the extreme inlier; without
        # outliers on a side, the extreme inlier itself is the bound.
        min_threshold = (low_outliers.max() + inlier_min) / 2 if low_outliers.size else inlier_min
        max_threshold = (high_outliers.min() + inlier_max) / 2 if high_outliers.size else inlier_max

        # Step 5: Replacing outlier values. np.clip applies a bound of exactly 0
        # (a truthiness test would skip it) and promotes integer input to float
        # when a bound is fractional instead of truncating the bound.
        treated = np.clip(values, min_threshold, max_threshold)

        return pd.Series(treated, index=data.index), min_threshold, max_threshold

    raise ValueError(
        f"Unknown outlier replacement method: {method!r}. "
        "Expected 'Whisker', '95 Percentile', '99 Percentile' or 'KNN'."
    )

# Main function for the tool
def outlier_replacement_tool(df):
    """Show an ipywidgets UI for choosing an outlier-replacement method per feature.

    The user clicks a method button, picks numeric features (and a K value for
    KNN), confirms the selection, and finally clicks "Implement Methods". Each
    feature can be assigned to one method only.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is never modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df``. The function returns immediately after displaying the
        widgets; the returned frame is updated in place every time
        "Implement Methods" is clicked, so inspect it after clicking.

    Side effects
    ------------
    Clicking "Implement Methods" writes 'outlier_replacement_summary.csv' to the
    current working directory when at least one feature was treated.
    """
    selected_features = {"Whisker": [], "95 Percentile": [], "99 Percentile": [], "KNN": []}
    # K confirmed for each KNN feature, so the value printed on confirmation is
    # the value that is applied later (re-opening the KNN panel cannot reset it).
    knn_k_by_feature = {}

    # Create a copy of the original dataset
    new_df = df.copy()

    # Buttons for methods
    whisker_button = widgets.Button(description='Whisker Method', tooltip='Replace outliers using IQR whisker values')
    percentile_95_button = widgets.Button(description='95 Percentile', tooltip='Replace outliers using 95th percentile bounds')
    percentile_99_button = widgets.Button(description='99 Percentile', tooltip='Replace outliers using 99th percentile bounds')
    knn_button = widgets.Button(description='KNN Method', tooltip='Replace outliers using K-Nearest Neighbors')
    implement_button = widgets.Button(description='Implement Methods', tooltip='Apply the selected methods to the dataset', button_style='success')

    buttons = [whisker_button, percentile_95_button, percentile_99_button, knn_button]

    # Output widgets
    output = widgets.Output()
    summary_output = widgets.Output()

    # Button highlighting
    def highlight_button(active_button):
        for button in buttons:
            button.button_style = ''
        active_button.button_style = 'primary'

    # Feature selection and method handling
    def handle_method_click(method):
        def callback(b):
            highlight_button(b)
            with output:
                clear_output()
                # Any NumPy integer/unsigned/float column qualifies, not only
                # float64/int64 (e.g. int32 or float32 columns are offered too).
                available_features = [
                    col for col, dtype in df.dtypes.items()
                    if isinstance(dtype, np.dtype) and dtype.kind in "iuf"
                ]
                multi_select = widgets.SelectMultiple(
                    options=available_features,
                    description='Features:',
                    rows=10,
                    layout=widgets.Layout(width='50%')
                )

                k_slider = None
                if method == 'KNN':
                    k_slider = widgets.IntSlider(
                        value=5,
                        min=1,
                        max=20,
                        step=1,
                        description='K Value:',
                        style={'description_width': 'initial'}
                    )
                    # Store slider in global settings
                    method_settings['KNN_k_slider'] = k_slider

                confirm_button = widgets.Button(
                    description='Confirm Selection',
                    button_style='success',
                    tooltip=f'Confirm the selected features for {method}'
                )

                def confirm_callback(cb):
                    # The callback runs after the enclosing `with output:` has
                    # exited, so re-enter it to keep the messages in the widget.
                    with output:
                        selected = list(multi_select.value)
                        if not selected:
                            print("No features selected. Please pick at least one feature.")
                            return

                        # Check for conflicts across all methods
                        already_taken = {feat for feats in selected_features.values() for feat in feats}
                        conflicts = [feat for feat in selected if feat in already_taken]

                        if conflicts:
                            print(f"The following features have already been selected: {', '.join(map(str, conflicts))}.")
                            print("If you want to select these features in this method, please re-run the function.")
                            return

                        selected_features[method].extend(selected)
                        print(f"Selected features for {method}: {', '.join(map(str, selected))}")

                        if method == 'KNN':
                            k_value = k_slider.value
                            for feat in selected:
                                knn_k_by_feature[feat] = k_value
                            print(f"Selected K value: {k_value}")
                            print("Changing the K value can change the outlier values.")

                confirm_button.on_click(confirm_callback)

                widgets_list = [multi_select, confirm_button]
                if k_slider is not None:
                    widgets_list.insert(1, k_slider)

                display(widgets.VBox(widgets_list))

        return callback

    # Assign callbacks to buttons
    whisker_button.on_click(handle_method_click("Whisker"))
    percentile_95_button.on_click(handle_method_click("95 Percentile"))
    percentile_99_button.on_click(handle_method_click("99 Percentile"))
    knn_button.on_click(handle_method_click("KNN"))

    # Implement button callback
    def on_implement_clicked(b):
        with summary_output:
            clear_output()
            log = []
            for method, features in selected_features.items():
                for feature in features:
                    # Always start from the untouched input column so clicking
                    # the button again does not re-clip already treated data.
                    source = df[feature]
                    if method == 'KNN':
                        new_values, lower_bound, upper_bound = replace_outliers(
                            source,
                            method,
                            k=knn_k_by_feature.get(feature, 5),
                            feature_values=source.values,
                        )
                    else:
                        new_values, lower_bound, upper_bound = replace_outliers(source, method)

                    new_df[feature] = new_values
                    log.append({
                        'Feature': feature,
                        'Method': method,
                        'Lower Bound': lower_bound,
                        'Upper Bound': upper_bound,
                    })

            log_df = pd.DataFrame(log)
            if not log_df.empty:
                log_df.to_csv('outlier_replacement_summary.csv', index=False)
                print("Outlier replacement summary saved as 'outlier_replacement_summary.csv'.")
                display(log_df)
            print("Outlier treatment completed.")

    implement_button.on_click(on_implement_clicked)

    # Display widgets
    display(widgets.VBox([
        widgets.HBox([whisker_button, percentile_95_button, percentile_99_button, knn_button]),
        output,
        implement_button,
        summary_output
    ]))

    return new_df



In [2]:
import pandas as pd
import numpy as np

def generate_test_dataset():
    """Build a reproducible 100-row demo frame for the outlier tool.

    The frame has 19 numeric columns (``Cont_1`` .. ``Cont_19``) drawn from a
    normal distribution with mean 50 and standard deviation 10, followed by 10
    categorical columns (``Cat_1`` .. ``Cat_10``) with labels "A"-"D". Four of
    the numeric columns get constant outliers planted at a fixed row stride.

    Note: the function reseeds NumPy's global random generator with 42.
    """
    np.random.seed(42)  # Fixed seed so every call returns the same frame

    n_rows = 100

    # Numeric columns, generated in order Cont_1 .. Cont_19
    numeric_columns = {}
    for idx in range(1, 20):
        numeric_columns[f"Cont_{idx}"] = np.random.normal(loc=50, scale=10, size=n_rows).tolist()

    # Planted outliers as (column, row stride, value): rows 0, stride, 2*stride, ...
    planted_outliers = (
        ("Cont_3", 10, 150),   # every 10th row
        ("Cont_1", 10, 180),   # every 10th row
        ("Cont_6", 15, -70),   # every 15th row
        ("Cont_7", 15, -50),   # every 15th row
    )
    for column, stride, value in planted_outliers:
        column_values = numeric_columns[column]
        column_values[::stride] = [value] * len(column_values[::stride])

    # Categorical columns, generated in order Cat_1 .. Cat_10
    label_columns = {}
    for idx in range(1, 11):
        label_columns[f"Cat_{idx}"] = np.random.choice(["A", "B", "C", "D"], size=n_rows).tolist()

    # Numeric columns first, categorical columns after
    combined = dict(numeric_columns)
    combined.update(label_columns)

    return pd.DataFrame(combined)

# Generate the demo dataset (numeric columns with planted outliers plus categorical columns)
test_dataset = generate_test_dataset()


In [3]:
# Launch the interactive tool. The call returns immediately with a copy of
# test_dataset; that copy is updated in place when "Implement Methods" is clicked.
new_df = outlier_replacement_tool(test_dataset)

VBox(children=(HBox(children=(Button(description='Whisker Method', style=ButtonStyle(), tooltip='Replace outli…

In [4]:
# Show the treated frame; run this cell after clicking "Implement Methods" to see the replaced values.
new_df

Unnamed: 0,Cont_1,Cont_2,Cont_3,Cont_4,Cont_5,Cont_6,Cont_7,Cont_8,Cont_9,Cont_10,...,Cat_1,Cat_2,Cat_3,Cat_4,Cat_5,Cat_6,Cat_7,Cat_8,Cat_9,Cat_10
0,79.129707,35.846293,150.000000,41.710050,34.055723,-70.000000,-13.484433,44.772770,59.382838,53.686733,...,A,A,D,A,A,C,C,B,D,C
1,48.617357,45.793547,55.607845,44.398190,44.006250,69.094166,40.778347,60.490092,44.839553,46.066612,...,A,B,B,A,A,A,A,C,D,B
2,56.476885,46.572855,60.830512,57.472936,50.052437,36.014324,58.696059,42.956563,50.961208,50.287448,...,B,C,A,A,B,A,A,B,C,D
3,65.230299,41.977227,60.538021,56.103703,50.469806,55.629692,63.556379,35.915387,45.377247,62.784519,...,D,C,B,C,C,A,A,B,B,A
4,47.658466,48.387143,36.223306,49.790984,45.499345,43.493574,54.134349,34.433708,45.655038,51.910991,...,A,D,C,A,B,B,B,A,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,35.364851,53.853174,43.070904,45.308243,55.389100,44.899836,46.904536,60.531529,56.487099,47.188997,...,B,D,C,B,A,A,B,B,B,A
96,52.961203,41.161426,58.995999,32.868655,39.627538,47.301251,53.261330,49.604448,48.328819,67.976865,...,A,B,A,D,D,B,B,D,D,B
97,52.610553,51.537251,53.072995,63.538724,48.096613,40.212363,37.488864,56.815007,51.467137,56.408429,...,C,A,A,D,A,C,B,D,C,D
98,50.051135,50.582087,58.128621,48.854602,41.243817,45.557067,59.240270,50.283184,62.065090,44.288210,...,D,D,A,D,D,D,A,B,B,C
