In [13]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output

# Module-level settings dict. outlier_replacement_tool stores the KNN "K Value"
# slider widget here under the key 'KNN_k_slider'.
method_settings = {}

# Helper function to replace outliers using different methods
def replace_outliers(data, method, k=None, feature_values=None):
    """Replace outliers in a numeric Series using one of four methods.

    Parameters
    ----------
    data : pandas.Series
        Numeric column to treat. Its index (and name) are kept on the result.
    method : str
        'Whisker'        -> clip to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
        '95 Percentile'  -> clip to the 2.5th / 97.5th percentiles.
        '99 Percentile'  -> clip to the 0.5th / 99.5th percentiles.
        'KNN'            -> values further than 1.5*MAD from the median are
                            replaced by the mean of their k nearest
                            non-outlier values.
    k : int, optional
        Number of neighbours for 'KNN'. Defaults to 5 when omitted and is
        capped at the number of available non-outlier values.
    feature_values : array-like, optional
        Raw values used by 'KNN' (same length as ``data``). Defaults to the
        values of ``data``.

    Returns
    -------
    tuple
        ``(treated_series, lower_bound, upper_bound)``. For the clipping
        methods the bounds are the clip limits. For 'KNN' they are the min and
        max of the replacement values, or ``None`` when nothing was replaced.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four supported names.
    """
    # Quantile pairs for the purely percentile-based clipping methods.
    percentile_bounds = {
        '95 Percentile': (0.025, 0.975),
        '99 Percentile': (0.005, 0.995),
    }

    if method == 'Whisker':
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        return data.clip(lower_bound, upper_bound), lower_bound, upper_bound

    if method in percentile_bounds:
        low_q, high_q = percentile_bounds[method]
        lower_bound = data.quantile(low_q)
        upper_bound = data.quantile(high_q)
        return data.clip(lower_bound, upper_bound), lower_bound, upper_bound

    if method != 'KNN':
        # Fail loudly instead of returning None, which callers would try to unpack.
        raise ValueError(
            f"Unknown method {method!r}; expected 'Whisker', '95 Percentile', "
            "'99 Percentile' or 'KNN'."
        )

    if feature_values is None:
        feature_values = data.to_numpy()

    # Work in float: an integer buffer would silently truncate neighbour means.
    values = np.asarray(feature_values, dtype=float)
    is_valid = ~np.isnan(values)
    valid_values = values[is_valid]

    # Step 1/2: flag outliers with a robust (median +/- 1.5 * MAD) band.
    if valid_values.size:
        median = np.median(valid_values)
        mad = np.median(np.abs(valid_values - median))  # Median Absolute Deviation
        threshold = 1.5 * mad
        lower_limit = median - threshold
        upper_limit = median + threshold
        with np.errstate(invalid='ignore'):  # NaN comparisons are simply False
            outlier_mask = is_valid & ((values < lower_limit) | (values > upper_limit))
    else:
        # Empty or all-NaN input: there is nothing to flag.
        outlier_mask = np.zeros(values.shape, dtype=bool)

    non_outlier_values = values[is_valid & ~outlier_mask]

    if not outlier_mask.any() or non_outlier_values.size == 0:
        # Nothing to replace (or nothing to replace it with): return the input
        # values untouched, keeping their original dtype.
        untouched = pd.Series(np.array(feature_values), index=data.index, name=data.name)
        return untouched, None, None

    # Imported lazily so the clipping methods do not require scikit-learn.
    from sklearn.neighbors import NearestNeighbors

    # Step 3: fit on inliers only; k can never exceed the number of inliers.
    n_neighbors = min(5 if k is None else int(k), non_outlier_values.size)
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(non_outlier_values.reshape(-1, 1))

    # Step 4: one batched query for all outliers; each is replaced by the mean
    # of its nearest inliers. NaNs and inliers keep their original value.
    _, neighbor_idx = knn.kneighbors(values[outlier_mask].reshape(-1, 1))
    replacement_values = values.copy()
    replacement_values[outlier_mask] = non_outlier_values[neighbor_idx].mean(axis=1)

    # Step 5: report the range of the values that were written in.
    outlier_replacements = replacement_values[outlier_mask]
    lower_bound = outlier_replacements.min()
    upper_bound = outlier_replacements.max()

    return pd.Series(replacement_values, index=data.index, name=data.name), lower_bound, upper_bound

# Main function for the tool
def outlier_replacement_tool(df):
    """Display an interactive ipywidgets UI for per-feature outlier replacement.

    Workflow: click a method button, pick features (and K for KNN), press
    "Confirm Selection"; repeat for other methods; then press
    "Implement Methods" to apply every confirmed selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` that is returned immediately, before any button is
        clicked. The "Implement Methods" callback writes the treated columns
        into this same object, so inspect it after clicking.

    Side effects
    ------------
    * Displays the widget UI in the current cell output.
    * "Implement Methods" writes 'outlier_replacement_summary.csv' to the
      current working directory when at least one feature was treated.
    * The most recently created KNN slider is stored in the module-level
      ``method_settings['KNN_k_slider']``.
    """
    selected_features = {"Whisker": [], "95 Percentile": [], "99 Percentile": [], "KNN": []}

    # K value locked in at "Confirm Selection" time, so the value printed to the
    # user is the value that is applied. 5 mirrors the slider default.
    knn_settings = {"k": 5}

    # Create a copy of the original dataset
    new_df = df.copy()

    # Buttons for methods
    whisker_button = widgets.Button(description='Whisker Method', tooltip='Replace outliers using IQR whisker values')
    percentile_95_button = widgets.Button(description='95 Percentile', tooltip='Replace outliers using 95th percentile bounds')
    percentile_99_button = widgets.Button(description='99 Percentile', tooltip='Replace outliers using 99th percentile bounds')
    knn_button = widgets.Button(description='KNN Method', tooltip='Replace outliers using K-Nearest Neighbors')
    implement_button = widgets.Button(description='Implement Methods', tooltip='Apply the selected methods to the dataset', button_style='success')

    buttons = [whisker_button, percentile_95_button, percentile_99_button, knn_button]

    # Output widgets
    output = widgets.Output()
    summary_output = widgets.Output()

    # Button highlighting
    def highlight_button(active_button):
        """Mark ``active_button`` as the active method and reset the others."""
        for button in buttons:
            button.button_style = ''
        active_button.button_style = 'primary'

    # Feature selection and method handling
    def handle_method_click(method):
        """Build the click handler that shows the feature picker for ``method``."""
        def callback(b):
            highlight_button(b)
            with output:
                clear_output()
                # Any integer or float column qualifies (not only int64/float64);
                # booleans and non-numeric columns are excluded.
                available_features = [
                    col for col, dtype in df.dtypes.items()
                    if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype)
                ]
                multi_select = widgets.SelectMultiple(
                    options=available_features,
                    description='Features:',
                    rows=10,
                    layout=widgets.Layout(width='50%')
                )

                k_slider = None
                if method == 'KNN':
                    k_slider = widgets.IntSlider(
                        value=5,
                        min=1,
                        max=20,
                        step=1,
                        description='K Value:',
                        style={'description_width': 'initial'}
                    )
                    # Store slider in global settings (kept for backward compatibility)
                    method_settings['KNN_k_slider'] = k_slider

                confirm_button = widgets.Button(
                    description='Confirm Selection',
                    button_style='success',
                    tooltip=f'Confirm the selected features for {method}'
                )

                def confirm_callback(cb):
                    # Capture prints in the Output widget; output produced by a
                    # widget callback is otherwise not reliably displayed.
                    with output:
                        selected = list(multi_select.value)
                        if not selected:
                            print("No features selected.")
                            return

                        # Check for conflicts across all methods
                        already_taken = {feat for feats in selected_features.values() for feat in feats}
                        conflicts = [feat for feat in selected if feat in already_taken]

                        if conflicts:
                            print(f"The following features have already been selected: {', '.join(map(str, conflicts))}.")
                            print("If you want to select these features in this method, please re-run the function.")
                            return

                        selected_features[method].extend(selected)
                        print(f"Selected features for {method}: {', '.join(map(str, selected))}")

                        if k_slider is not None:
                            # Applies to every KNN feature; a later KNN confirm overrides it.
                            knn_settings["k"] = k_slider.value
                            print(f"Selected K value: {knn_settings['k']}")

                confirm_button.on_click(confirm_callback)

                widgets_list = [multi_select, confirm_button]
                if k_slider is not None:
                    widgets_list.insert(1, k_slider)

                display(widgets.VBox(widgets_list))

        return callback

    # Assign callbacks to buttons
    whisker_button.on_click(handle_method_click("Whisker"))
    percentile_95_button.on_click(handle_method_click("95 Percentile"))
    percentile_99_button.on_click(handle_method_click("99 Percentile"))
    knn_button.on_click(handle_method_click("KNN"))

    # Implement button callback
    def on_implement_clicked(b):
        """Apply every confirmed method/feature pair and show a summary table."""
        with summary_output:
            clear_output()
            log = []
            for method, features in selected_features.items():
                for feature in features:
                    # Always start from the untouched input column so that
                    # clicking "Implement Methods" again gives the same result
                    # instead of re-treating already treated data.
                    source = df[feature]
                    if method == 'KNN':
                        new_values, lower_bound, upper_bound = replace_outliers(
                            source,
                            method,
                            k=knn_settings["k"],
                            # Float values so neighbour means are not truncated
                            # for integer columns.
                            feature_values=source.astype(float).to_numpy(),
                        )
                    else:
                        new_values, lower_bound, upper_bound = replace_outliers(source, method)

                    new_df[feature] = new_values
                    log.append({
                        'Feature': feature,
                        'Method': method,
                        'Lower Bound': lower_bound,
                        'Upper Bound': upper_bound,
                    })

            log_df = pd.DataFrame(log)
            if not log_df.empty:
                log_df.to_csv('outlier_replacement_summary.csv', index=False)
                print("Outlier replacement summary saved as 'outlier_replacement_summary.csv'.")
                display(log_df)
            print("Outlier treatment completed.")

    implement_button.on_click(on_implement_clicked)

    # Display widgets
    display(widgets.VBox([
        widgets.HBox([whisker_button, percentile_95_button, percentile_99_button, knn_button]),
        implement_button,
        output,
        summary_output
    ]))

    return new_df


In [2]:
import pandas as pd
import numpy as np

def generate_test_dataset():
    """Build a reproducible 100-row demo frame with planted outliers.

    Columns Cont_1..Cont_19 are draws from a normal distribution (mean 50,
    standard deviation 10); four of them get fixed outlier values written at
    regular row intervals. Columns Cat_1..Cat_10 are random picks from the
    labels A-D. The global NumPy seed is set to 42 on every call.

    Returns
    -------
    pandas.DataFrame
        Frame with the 19 continuous columns followed by the 10 categorical ones.
    """
    n_rows = 100
    np.random.seed(42)  # For reproducibility

    columns = {}

    # Continuous features first, so the random draws happen in column order.
    for i in range(1, 20):
        columns[f"Cont_{i}"] = np.random.normal(loc=50, scale=10, size=n_rows).tolist()

    # (column, row step, outlier value): overwrite every `step`-th row, starting at row 0.
    planted_outliers = (
        ("Cont_3", 10, 150),   # every 10th row -> 10 outliers
        ("Cont_1", 10, 180),   # every 10th row -> 10 outliers
        ("Cont_6", 15, -70),   # every 15th row -> 7 outliers
        ("Cont_7", 15, -50),   # every 15th row -> 7 outliers
    )
    for column, step, outlier in planted_outliers:
        for row in range(0, n_rows, step):
            columns[column][row] = outlier

    # Categorical features are drawn after all continuous ones.
    for i in range(1, 11):
        columns[f"Cat_{i}"] = np.random.choice(["A", "B", "C", "D"], size=n_rows).tolist()

    return pd.DataFrame(columns)

# Generate the synthetic dataset returned by generate_test_dataset().
# NOTE: a later cell reassigns test_dataset to a small hand-written frame.
test_dataset = generate_test_dataset()


In [26]:
# Tiny hand-written frame for a manual check: a single numeric column holding a
# very low value (-50), a very high value (100) and one missing value.
# This reassigns test_dataset.
test_dataset = pd.DataFrame({'cont1':[-50, 2, 3, 100, 5, 6, 7, np.nan]})

In [27]:
# Launch the widget UI. The returned frame is a copy of test_dataset; the tool's
# "Implement Methods" callback later writes the treated columns into it in place.
new_df = outlier_replacement_tool(test_dataset)

VBox(children=(HBox(children=(Button(description='Whisker Method', style=ButtonStyle(), tooltip='Replace outli…

In [25]:
# Show the frame returned by the tool; it reflects the replacements only after
# "Implement Methods" has been clicked in the UI.
new_df

Unnamed: 0,cont1
0,3.333333
1,2.0
2,3.0
3,6.0
4,5.0
5,6.0
6,7.0
7,
