In [1]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output

# Helper function to replace outliers using different methods
def replace_outliers(data, method):
    """Clip ``data`` to the lower/upper bounds defined by ``method``.

    Values below the lower bound are replaced by the lower bound and values
    above the upper bound by the upper bound; ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.Series
        Values to treat. Any object offering ``quantile`` and ``clip`` works.
    method : str
        One of:
        * ``'Whisker'``       - Q1 - 1.5 * IQR and Q3 + 1.5 * IQR
        * ``'95 Percentile'`` - the 2.5th and 97.5th percentiles
        * ``'99 Percentile'`` - the 0.5th and 99.5th percentiles

    Returns
    -------
    The result of ``data.clip(lower_bound, upper_bound)``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. (Previously the
        function fell through and returned ``None`` without any warning.)
    """
    # Quantile pairs are spelled out literally so the bounds are bit-for-bit
    # the same as computing them from the documented percentiles.
    percentile_tails = {
        '95 Percentile': (0.025, 0.975),
        '99 Percentile': (0.005, 0.995),
    }

    if method == 'Whisker':
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
    elif method in percentile_tails:
        lower_q, upper_q = percentile_tails[method]
        lower_bound = data.quantile(lower_q)
        upper_bound = data.quantile(upper_q)
    else:
        supported = ['Whisker'] + list(percentile_tails)
        raise ValueError(
            f"Unknown outlier replacement method {method!r}; expected one of {supported}"
        )

    return data.clip(lower_bound, upper_bound)

# Main function for the tool
def outlier_replacement_tool(df):
    """Show an ipywidgets UI for clipping outliers in numeric columns of ``df``.

    The UI offers one button per method ('Whisker', '95 Percentile',
    '99 Percentile'). Clicking a method button shows a multi-select of the
    numeric columns; confirming assigns the chosen columns to that method
    (a column may be assigned to one method only). Clicking
    'Implement Methods' clips every assigned column via ``replace_outliers``,
    displays a summary of the replacement values and, when the summary is
    non-empty, writes it to 'outlier_replacement_summary.csv' in the current
    working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied; the caller's frame is never modified.

    Returns
    -------
    pandas.DataFrame
        The working copy of ``df``. It is returned immediately, before any
        button has been clicked, and is modified in place by the
        'Implement Methods' callback - inspect it after using the UI.
    """
    selected_features = {"Whisker": [], "95 Percentile": [], "99 Percentile": []}

    # Create a copy of the original dataset
    new_df = df.copy()

    # Lower/upper quantiles used by the percentile-based methods.
    percentile_tails = {
        "95 Percentile": (0.025, 0.975),
        "99 Percentile": (0.005, 0.995),
    }

    def compute_bounds(series, method):
        """Return the (lower, upper) replacement values for ``series`` under ``method``."""
        if method == 'Whisker':
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            return q1 - 1.5 * iqr, q3 + 1.5 * iqr
        lower_q, upper_q = percentile_tails[method]
        return series.quantile(lower_q), series.quantile(upper_q)

    # Buttons for methods
    whisker_button = widgets.Button(
        description='Whisker Method',
        tooltip='Replace outliers using IQR whisker values',
        button_style='primary'
    )
    percentile_95_button = widgets.Button(
        description='95 Percentile',
        tooltip='Replace outliers using 95th percentile bounds',
        button_style='primary'
    )
    percentile_99_button = widgets.Button(
        description='99 Percentile',
        tooltip='Replace outliers using 99th percentile bounds',
        button_style='primary'
    )

    # Final implement button
    implement_button = widgets.Button(
        description='Implement Methods',
        tooltip='Apply the selected methods to the dataset',
        button_style='success'
    )

    # Output widgets
    output = widgets.Output()
    summary_output = widgets.Output()

    # Feature selection and method handling
    def handle_method_click(method):
        """Build the click handler that lets the user pick features for ``method``."""
        def callback(b):
            with output:
                clear_output()
                # Offer every numeric column, not only float64/int64 ones, so
                # int32/float32/etc. features can be treated as well.
                available_features = list(df.select_dtypes(include=[np.number]).columns)
                multi_select = widgets.SelectMultiple(
                    options=available_features,
                    description='Features:',
                    rows=10,
                    layout=widgets.Layout(width='50%')
                )

                confirm_button = widgets.Button(
                    description='Confirm Selection',
                    button_style='success',
                    tooltip=f'Confirm the selected features for {method}'
                )

                def confirm_callback(cb):
                    # This runs later, after the enclosing ``with output`` block
                    # has exited, so the Output context must be re-entered for
                    # the messages to appear in the widget.
                    with output:
                        selected = list(multi_select.value)
                        already_assigned = {
                            feat
                            for assigned in selected_features.values()
                            for feat in assigned
                        }
                        conflicts = [feat for feat in selected if feat in already_assigned]

                        if conflicts:
                            # str() keeps join() working for non-string column labels.
                            conflict_names = ', '.join(str(feat) for feat in conflicts)
                            print(f"The following features have already been selected in another method: {conflict_names}")
                        else:
                            selected_features[method].extend(selected)
                            selected_names = ', '.join(str(feat) for feat in selected)
                            print(f"Selected features for {method}: {selected_names}")

                confirm_button.on_click(confirm_callback)

                display(widgets.VBox([multi_select, confirm_button]))

        return callback

    # Assign callbacks to buttons
    whisker_button.on_click(handle_method_click("Whisker"))
    percentile_95_button.on_click(handle_method_click("95 Percentile"))
    percentile_99_button.on_click(handle_method_click("99 Percentile"))

    # Implement button callback
    def on_implement_clicked(b):
        """Clip all assigned features in ``new_df`` and report the replacement values."""
        with summary_output:
            clear_output()
            log = []
            for method, features in selected_features.items():
                for feature in features:
                    # Snapshot the column before it is overwritten so the log
                    # reflects which bounds were actually exceeded.
                    original_values = new_df[feature].copy()
                    lower_bound, upper_bound = compute_bounds(original_values, method)

                    # Apply the replacement to the feature
                    new_df[feature] = replace_outliers(original_values, method)

                    # Log the feature, method, and replacement values; one row
                    # per bound that had at least one value beyond it.
                    if (original_values < lower_bound).any():
                        log.append({
                            'Feature': feature,
                            'Method': method,
                            'Replacement Value': lower_bound
                        })
                    if (original_values > upper_bound).any():
                        log.append({
                            'Feature': feature,
                            'Method': method,
                            'Replacement Value': upper_bound
                        })

            # Create a concise log DataFrame
            log_df = pd.DataFrame(log)
            if not log_df.empty:
                log_df.to_csv('outlier_replacement_summary.csv', index=False)
                print("Outlier replacement summary saved as 'outlier_replacement_summary.csv'.")
                display(log_df)
            print("Outlier treatment completed.")

    implement_button.on_click(on_implement_clicked)

    # Display widgets
    display(widgets.VBox([
        widgets.HBox([whisker_button, percentile_95_button, percentile_99_button]),
        implement_button,
        output,
        summary_output
    ]))

    return new_df


In [2]:
import pandas as pd
import numpy as np

def generate_test_dataset(n_rows=100, seed=42):
    """Build a reproducible synthetic DataFrame for trying out outlier treatment.

    The frame has 19 continuous columns ``Cont_1`` .. ``Cont_19`` drawn from a
    normal distribution (mean 50, standard deviation 10) and 10 categorical
    columns ``Cat_1`` .. ``Cat_10`` drawn from ``"A"``-``"D"``. Four continuous
    columns get artificial outliers at regular row intervals.

    Parameters
    ----------
    n_rows : int, default 100
        Number of rows to generate.
    seed : int, default 42
        Seed for the random stream. With the defaults the data is identical
        to what ``np.random.seed(42)`` followed by the global ``np.random``
        functions produces.

    Returns
    -------
    pandas.DataFrame
        ``n_rows`` rows by 29 columns.
    """
    # A private RandomState yields the same legacy stream as seeding the global
    # generator, but without resetting NumPy's global random state for the
    # rest of the notebook.
    rng = np.random.RandomState(seed)

    # Generate continuous features with some random outliers
    continuous_features = {
        f"Cont_{i}": rng.normal(loc=50, scale=10, size=n_rows).tolist() for i in range(1, 20)
    }

    # Add outliers to some of the continuous features: (column, row step, value).
    injected_outliers = (
        ("Cont_3", 10, 150),   # every 10th row
        ("Cont_1", 10, 180),   # every 10th row
        ("Cont_6", 15, -70),   # every 15th row
        ("Cont_7", 15, -50),   # every 15th row
    )
    for column, step, value in injected_outliers:
        values = continuous_features[column]
        # Extended-slice assignment needs exactly as many items as the slice
        # selects, so derive the count from the slice instead of hard-coding it.
        values[::step] = [value] * len(values[::step])

    # Generate categorical features
    categorical_features = {
        f"Cat_{i}": rng.choice(["A", "B", "C", "D"], size=n_rows).tolist() for i in range(1, 11)
    }

    # Combine continuous and categorical features
    dataset = {**continuous_features, **categorical_features}
    return pd.DataFrame(dataset)

# Generate the synthetic dataset (19 numeric "Cont_*" columns with injected
# outliers plus 10 categorical "Cat_*" columns) used to exercise the tool.
test_dataset = generate_test_dataset()


In [3]:
# Launch the interactive tool. The returned frame is a copy of test_dataset; it
# is returned before any button is clicked and is only clipped once
# "Implement Methods" is pressed, because the button callback modifies that
# same object in place.
new_df = outlier_replacement_tool(test_dataset)

VBox(children=(HBox(children=(Button(button_style='primary', description='Whisker Method', style=ButtonStyle()…

In [4]:
# Display the frame returned by the tool; it reflects the outlier replacement
# only if "Implement Methods" was clicked before this cell ran.
new_df

Unnamed: 0,Cont_1,Cont_2,Cont_3,Cont_4,Cont_5,Cont_6,Cont_7,Cont_8,Cont_9,Cont_10,...,Cat_1,Cat_2,Cat_3,Cat_4,Cat_5,Cat_6,Cat_7,Cat_8,Cat_9,Cat_10
0,79.129707,35.846293,82.053589,41.710050,34.055723,-70.000000,-50.000000,44.772770,59.382838,53.686733,...,A,A,D,A,A,C,C,B,D,C
1,48.617357,45.793547,55.607845,44.398190,44.006250,67.987515,40.778347,60.490092,44.839553,46.066612,...,A,B,B,A,A,A,A,C,D,B
2,56.476885,46.572855,60.830512,57.472936,50.052437,36.014324,58.696059,42.956563,50.961208,50.287448,...,B,C,A,A,B,A,A,B,C,D
3,65.230299,41.977227,60.538021,56.103703,50.469806,55.629692,63.556379,35.915387,45.377247,62.784519,...,D,C,B,C,C,A,A,B,B,A
4,47.658466,48.387143,36.223306,49.790984,45.499345,43.493574,54.134349,34.433708,45.655038,51.910991,...,A,D,C,A,B,B,B,A,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,35.364851,53.853174,43.070904,45.308243,55.389100,44.899836,46.904536,60.531529,56.487099,47.188997,...,B,D,C,B,A,A,B,B,B,A
96,52.961203,41.161426,58.995999,32.868655,39.627538,47.301251,53.261330,49.604448,48.328819,67.976865,...,A,B,A,D,D,B,B,D,D,B
97,52.610553,51.537251,53.072995,63.538724,48.096613,40.212363,37.488864,56.815007,51.467137,56.408429,...,C,A,A,D,A,C,B,D,C,D
98,50.051135,50.582087,58.128621,48.854602,41.243817,45.557067,59.240270,50.283184,62.065090,44.288210,...,D,D,A,D,D,D,A,B,B,C
