In [1]:
import numpy as np
import time
import plotly.graph_objs as go
import plotly.express as px

In [2]:
# Generate a reproducible noisy signal: a random walk obtained by cumulatively
# summing 100 standard-normal steps. A seeded generator is used so that
# "Restart Kernel -> Run All" always produces the same data.
SEED = 42
rng = np.random.default_rng(SEED)
noisy_signal = np.cumsum(rng.normal(0, 1, 100))

# Add outliers at 5 distinct, randomly chosen indices by shifting those samples
# with draws from a normal distribution (mean 2, standard deviation 10).
outlier_indices = rng.choice(len(noisy_signal), size=5, replace=False)
noisy_signal[outlier_indices] += rng.normal(2, 10, len(outlier_indices))


In [3]:
def plot_signal(noisy_signal, signal_corrected=None):
    """Show a Plotly scatter of a signal and, optionally, a corrected version.

    Parameters
    ----------
    noisy_signal : sequence
        Values drawn as the 'Noisy Signal' markers against their sample index.
    signal_corrected : sequence or None, optional
        Values drawn as the 'Signal corrected' markers against their sample
        index. The trace is skipped when this is None or has length zero.
    """
    figure = go.Figure()
    figure.add_trace(
        go.Scatter(
            x=np.arange(len(noisy_signal)),
            y=noisy_signal,
            mode='markers',
            name='Noisy Signal',
        )
    )

    # Only overlay the corrected trace when there is something to draw.
    has_correction = signal_corrected is not None and len(signal_corrected) != 0
    if has_correction:
        figure.add_trace(
            go.Scatter(
                x=np.arange(len(signal_corrected)),
                y=signal_corrected,
                mode='markers',
                name='Signal corrected',
            )
        )

    figure.show()


# Disabled timing benchmark: run time of remove_outliers_iqr_diffs per threshold.
# thresholds = [1.5, 3, 5, 7, 9, 11, 13, 15]
# times = []

# for t in thresholds:
#     start_time = time.time()
#     cleaned_signal = remove_outliers_iqr_diffs(noisy_signal, outliers_threshold=t)
#     times.append(1000 * (time.time() - start_time))  # elapsed milliseconds

# px.scatter(x=list(range(len(thresholds))), y=times)


In [4]:
# Baseline view: raw signal only, no corrected overlay (signal_corrected defaults to None).
plot_signal(noisy_signal)

In [None]:
def remove_outliers_iqr_diffs(signal, outliers_threshold=1.5):
    """Replace spike-like outliers in a 1-D signal with NaN.

    The absolute second-order difference of the signal is computed (padded with
    the first and last sample so there is one value per input sample). Samples
    whose value falls outside ``[q1 - k * iqr, q3 + k * iqr]`` of that
    distribution (``k = outliers_threshold``) are flagged. Because a single
    spike also inflates the second difference of its neighbours, runs of
    consecutive flagged indices are grouped and only the index with the largest
    second difference in each run is set to NaN.

    Parameters
    ----------
    signal : array_like
        1-D sequence of samples. It is never modified.
    outliers_threshold : float, optional
        IQR multiplier. Values that are not greater than 0 disable the
        detection and an unchanged copy is returned.

    Returns
    -------
    numpy.ndarray
        Copy of ``signal`` with the selected outliers replaced by NaN. Float
        and complex inputs keep their dtype; any other dtype is converted to
        float so that NaN can be stored.
    """
    values = np.asarray(signal)

    # NaN cannot be written into integer/boolean arrays, so promote those to
    # float up front; inexact dtypes are simply copied.
    if np.issubdtype(values.dtype, np.inexact):
        signal_cleaned = values.copy()
    else:
        signal_cleaned = values.astype(float)

    # Nothing to do for an empty signal or when detection is switched off.
    if signal_cleaned.size == 0 or not (outliers_threshold > 0.0):
        return signal_cleaned

    # Padding with the edge samples keeps the result aligned with the input:
    # entry i equals |s[i+1] - 2*s[i] + s[i-1]|.
    differential_counts = np.abs(
        np.diff(signal_cleaned, n=2, prepend=signal_cleaned[0], append=signal_cleaned[-1])
    )
    q1, q3 = np.percentile(differential_counts, [25, 75])
    iqr = q3 - q1
    lower_fence = q1 - outliers_threshold * iqr
    upper_fence = q3 + outliers_threshold * iqr
    outliers_idxs = np.where(
        (differential_counts < lower_fence) | (differential_counts > upper_fence)
    )[0]

    # Without this guard np.split yields a single empty group and np.argmax
    # raises ValueError on it.
    if outliers_idxs.size == 0:
        return signal_cleaned

    # Neighbouring points are also classified as outliers; split the flagged
    # indices into runs of consecutive values (one run per spike).
    outlier_groups = np.split(outliers_idxs, np.where(np.diff(outliers_idxs) > 1)[0] + 1)
    # Within each run keep only the index with the largest second difference.
    outliers_idxs_no_neighbors = [
        group[np.argmax(differential_counts[group])] for group in outlier_groups
    ]

    signal_cleaned[outliers_idxs_no_neighbors] = np.nan
    return signal_cleaned

# Clean the signal with a 1.5 * IQR fence and overlay the result on the raw data.
cleaned_signal = remove_outliers_iqr_diffs(noisy_signal, 1.5)
plot_signal(noisy_signal, cleaned_signal)

In [None]:
# Module-level replay of the steps inside remove_outliers_iqr_diffs so that each
# intermediate array can be inspected in the following cells.
outliers_threshold = 1.5

# Absolute second-order difference, padded with the edge samples so that there
# is exactly one value per sample of noisy_signal.
differential_counts = np.abs(np.diff(noisy_signal, n=2, prepend=noisy_signal[0], append=noisy_signal[-1]))
q1, q3 = np.percentile(differential_counts, [25, 75])
iqr = q3 - q1
outliers_idxs = np.where((differential_counts < q1 - outliers_threshold * iqr) | (differential_counts > q3 + outliers_threshold * iqr))[0]

if outliers_idxs.size:
    # Split flagged indices into runs of consecutive values, then keep the index
    # with the largest second difference from each run.
    outlier_groups = np.split(outliers_idxs, np.where(np.diff(outliers_idxs) > 1)[0] + 1)
    outliers_idxs_no_neighbors = [group[np.argmax(differential_counts[group])] for group in outlier_groups]
else:
    # np.split would return one empty group here and np.argmax raises on it.
    outlier_groups = []
    outliers_idxs_no_neighbors = []

plot_signal(differential_counts)

In [None]:
# Number of indices flagged by the IQR fence (before neighbours are collapsed).
outliers_idxs.size

In [None]:
# Display the flagged indices themselves.
outliers_idxs

In [52]:
# Print the flagged indices, then split them into runs of consecutive values:
# a new run starts wherever the gap to the previous index is larger than 1.
print(outliers_idxs)
run_starts = np.flatnonzero(np.diff(outliers_idxs) > 1) + 1
groups = np.split(outliers_idxs, run_starts)
groups

[ 4 13 14 15 16 17 18 87 88 89]


[array([4], dtype=int64),
 array([13, 14, 15, 16, 17, 18], dtype=int64),
 array([87, 88, 89], dtype=int64)]

In [60]:
# One representative index per group of consecutive flagged indices.
outliers_idxs_no_neighbors

[4, 14, 88]

In [None]:
def remove_outliers_rollling(signal, window_size, threshold=1.5):
    """Mask samples that deviate strongly from a trailing moving average.

    For every sample from index ``window_size - 1`` onwards, the mean of the
    window made of that sample and the ``window_size - 1`` samples before it is
    computed. The absolute residual between the sample and that mean is then
    compared against ``threshold`` times the interquartile range of all
    residuals (a single global value, not a per-window one). Samples exceeding
    it are replaced by NaN. The first ``window_size - 1`` samples have no full
    window and are never flagged.

    Parameters
    ----------
    signal : array_like
        1-D sequence of samples. It is never modified.
    window_size : int
        Number of samples in the averaging window; must satisfy
        ``1 <= window_size <= len(signal)``.
    threshold : float, optional
        Multiplier applied to the IQR of the residuals.

    Returns
    -------
    numpy.ndarray
        Copy of ``signal`` with outliers replaced by NaN. Float and complex
        inputs keep their dtype; any other dtype is converted to float so that
        NaN can be stored.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1 or larger than the signal length.
    """
    values = np.asarray(signal)
    if window_size < 1 or window_size > values.size:
        raise ValueError(
            f"window_size must be between 1 and len(signal)={values.size}, got {window_size}"
        )

    # NaN cannot be written into integer/boolean arrays, so promote those first.
    if np.issubdtype(values.dtype, np.inexact):
        cleaned_signal = values.copy()
    else:
        cleaned_signal = values.astype(float)

    # Moving average via convolution with a uniform kernel (a mean, not a
    # median). 'valid' mode yields one value per full window, aligned with the
    # last sample of that window.
    moving_average = np.convolve(cleaned_signal, np.ones(window_size) / window_size, mode='valid')

    # Absolute residuals, computed once and reused below.
    tail = cleaned_signal[window_size - 1:]  # view into cleaned_signal
    residuals = np.abs(tail - moving_average)

    # Global IQR of the residuals.
    q25, q75 = np.percentile(residuals, [25, 75])
    residual_iqr = q75 - q25

    # Replace outliers with NaN; writing through the view updates cleaned_signal.
    outliers = residuals > threshold * residual_iqr
    tail[outliers] = np.nan

    return cleaned_signal

# Time a single run of the moving-average filter. perf_counter is a monotonic,
# high-resolution clock meant for measuring short durations, unlike time.time().
start_time = time.perf_counter()
noisy_signal_rolling = remove_outliers_rollling(noisy_signal, window_size=10, threshold=1.5)
print(time.perf_counter() - start_time)
plot_signal(noisy_signal, signal_corrected=noisy_signal_rolling)

In [16]:
outliers_threshold = 0.5

# Start from an independent copy so that new_counts is defined on every path
# (detection disabled, or enabled but nothing flagged) and never aliases
# noisy_signal.
new_counts = noisy_signal.copy()

if outliers_threshold > 0.0:

    # Absolute second-order difference, padded with the edge samples so there is
    # one value per sample.
    differential_counts = np.abs(np.diff(noisy_signal, n=2, prepend=noisy_signal[0], append=noisy_signal[-1]))
    q1, q3 = np.percentile(differential_counts, [25, 75])
    iqr = q3 - q1

    outliers_idxs = np.where((differential_counts < q1 - outliers_threshold * iqr) | (differential_counts > q3 + outliers_threshold * iqr))[0]

    if outliers_idxs.size != 0:  # at least one sample was flagged

        # Neighboring points are also classified as outliers; group each outlier
        # with its neighbors (runs of consecutive indices).
        outlier_groups = np.split(outliers_idxs, np.where(np.diff(outliers_idxs) > 1)[0] + 1)
        # Select the outlier as the maximum value among its neighbors.
        outliers_idxs_no_neighbors = [group[np.argmax(differential_counts[group])] for group in outlier_groups]
        print(outliers_idxs_no_neighbors)
        print(noisy_signal)

        new_counts[outliers_idxs_no_neighbors] = np.nan


plot_signal(noisy_signal, signal_corrected=new_counts)

[4, 16, 24, 48, 77, 80]
[  1.02734471   0.28366341   0.59481798  -0.09588152  -7.84995835
  -1.13686863   0.11391808  -0.75033524  -1.38160315  -3.4657507
  -4.4485638   -3.35811743  -3.2028435   -3.71120005  -3.94717774
  -4.42898546   0.8340074   -4.46405464  -5.67755613  -7.74433377
  -8.31015135  -9.43274104  -9.92944194 -12.52069934  -6.56221726
 -13.51629201 -13.43785218 -12.71762825 -12.54403071 -12.35545502
 -13.30795913 -13.37639089 -13.28050323 -10.96009583 -11.20788185
  -9.44020475 -10.03046817 -10.57769647  -8.73648651  -8.40781641
  -7.33048175  -8.30357975  -7.17817337  -6.18989341  -6.83678233
  -5.44408779  -7.04566889  -7.02265182 -10.28090586  -8.7440281
 -10.46492883  -9.85897314  -9.90010601 -10.81006066 -11.20063996
 -11.38749597 -10.80775432 -10.28946137 -11.09379238  -9.52956796
  -9.23461503 -10.1486298   -9.93107966  -8.44527754  -8.60647131
 -10.04630933 -10.21314931 -10.69960893 -11.36185737 -10.74369664
 -10.75900915 -10.64271124  -9.63920284  -8.95468563  

In [19]:
# Fill the NaN gaps left by the outlier removal with linear interpolation
# between the valid samples, plot the result, and show which samples were NaN.
nan_mask = np.isnan(new_counts)
sample_idx = np.arange(len(new_counts))
xxxx = np.interp(sample_idx, sample_idx[~nan_mask], new_counts[~nan_mask])
plot_signal(noisy_signal, signal_corrected=xxxx)
nan_mask

array([False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

# General code tests

In [64]:
import pandas as pd

In [66]:
# Small demo frame: one numeric column and two string columns.
data = dict(
    column_1=[1, 2, 3, 4],
    column_2=['A', 'B', 'C', 'D'],
    column_3=['D', 'E', 'F', 'G'],
)
df = pd.DataFrame(data)

# Pair up the first two columns row by row; keep the third as a plain list.
list_of_tuples = [
    (number, letter) for number, letter in zip(df['column_1'], df['column_2'])
]
third_col = df['column_3'].tolist()
print(list_of_tuples, third_col)

[(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D')] ['D', 'E', 'F', 'G']
