In [3]:
from typing import Any, List, Tuple
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from scipy.stats import zscore

# IQR

In [4]:
def detect_outliers_in_column(column: pd.Series, threshold: float = 1.5) -> list:
    """
    Detect outliers in a column of a dataframe using IQR (interquartile range) method
    :param column: column of a dataframe (pd.Series)
    :param threshold: threshold for outlier detection
    :return: list of indices of outliers
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr

    outlier_indices = column[(column < lower_bound) | (column > upper_bound)].index.tolist()

    return outlier_indices


def outliers_labeling(df: pd.DataFrame, threshold: float = 1.5, inplace: bool = False,
                      label: np.nan or Any = np.nan) -> pd.DataFrame:
    """
    Label outliers in a dataframe using detect_outliers_in_column function with selected label
    :param df: raw dataframe which will be modified to include=np.number columns
    :param threshold: threshold for outlier detection
    :param inplace: option to modify the original dataframe or return a new one
    :param label: label for outliers with default value of nan
    :return: dataframe with labeled outliers
    """
    df_working = df.copy()
    df_working = df_working[df_working.select_dtypes(include=np.number).columns]

    for col in df_working.columns:

        if set(df_working[col].unique()) == {0, 1}:
            continue

        outlier_indices = detect_outliers_in_column(df_working[col], threshold)
        df_working.loc[outlier_indices, col] = label

    if inplace:
        df[df_working.columns] = df_working
        return df
    else:
        return df_working


def count_nan_in_row(df: pd.DataFrame, threshold: float = 1.5, inplace: bool = False) -> pd.DataFrame:
    """
    Count number of nan values in each row of a dataframe
    :param df: raw dataframe which will be modified to include=np.number columns
    :param threshold: threshold for outlier detection
    :param inplace: option to modify the original dataframe or return a new one
    :return: dataframe with number of nan values in each row
    """
    df_working = df.copy()
    df_working = df_working[df_working.select_dtypes(include=np.number).columns]

    df_working = outliers_labeling(df_working, threshold)
    df_working['outliers_count'] = df_working.isnull().sum(axis=1)

    if inplace:
        df['outliers_count'] = df_working['outliers_count']
        return df
    else:
        return df_working['outliers_count']


def pca_outliers_count(df: pd.DataFrame, threshold: float = 1.5) -> pd.DataFrame:
    """
    Count number of outliers in each row of a dataframe using PCA
    :param df: raw dataframe which will be modified to include=np.number columns and PCA columns
    :param threshold: threshold for outlier detection
    :return: dataframe with number of outliers in each row
    """
    df_working = df.copy()
    df_working_numeric = df_working.select_dtypes(include=np.number)

    pca = PCA(n_components=2)
    df_working_pca = pca.fit_transform(df_working_numeric)
    df_working_pca = pd.DataFrame(df_working_pca, columns=['PCA1', 'PCA2'])

    print(f'PCA explained variance ratio: {pca.explained_variance_ratio_}')

    return count_nan_in_row(df_working_pca, threshold, inplace=False)

In [5]:
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)

df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(1, 21)])
df['target'] = y

In [6]:
df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,target
0,-4.906442,3.442789,0.558964,-0.976764,-1.568805,-4.271982,-3.727921,0.111868,2.119795,-2.522812,...,-7.492478,4.264669,0.304866,0.777693,-9.375464,1.654446,3.012859,-4.497003,-2.520066,0
1,2.16261,-5.286651,2.609846,-1.803898,-1.831216,1.450757,2.648709,2.152307,0.524552,0.493548,...,6.680603,-2.43183,2.462773,-1.254824,2.978402,-3.428457,-4.562178,3.698665,-1.923286,1
2,-4.784844,-3.744827,4.657592,-1.408806,-5.444758,-2.416013,3.556495,-1.572119,-0.730549,3.447661,...,7.961059,-5.151105,0.473131,-4.070667,-0.932309,-3.230768,-7.844646,2.803798,-2.963189,1
3,10.465024,1.070944,-3.562432,-0.849062,2.18386,-0.609893,0.946327,-1.046141,-2.057053,-2.05665,...,-1.449095,-1.217685,2.026805,2.121829,3.184256,-1.960146,0.782147,-1.444202,0.915985,0
4,5.599516,-1.776412,-1.304322,-0.720074,5.859373,-3.292432,3.152205,7.099882,-3.321076,3.245486,...,6.608729,5.632297,-1.943748,1.169455,3.782513,-4.752822,-7.577624,4.868025,1.70821,0


In [7]:
outliers_labeling(df, threshold=1.5)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,target
0,-4.906442,3.442789,0.558964,-0.976764,-1.568805,-4.271982,-3.727921,0.111868,2.119795,-2.522812,...,-7.492478,4.264669,0.304866,0.777693,-9.375464,1.654446,3.012859,-4.497003,-2.520066,0
1,2.162610,-5.286651,2.609846,-1.803898,-1.831216,1.450757,2.648709,2.152307,0.524552,0.493548,...,6.680603,-2.431830,2.462773,-1.254824,2.978402,-3.428457,-4.562178,3.698665,-1.923286,1
2,-4.784844,-3.744827,4.657592,-1.408806,-5.444758,-2.416013,3.556495,-1.572119,-0.730549,3.447661,...,7.961059,-5.151105,0.473131,-4.070667,-0.932309,-3.230768,-7.844646,2.803798,-2.963189,1
3,10.465024,1.070944,-3.562432,-0.849062,2.183860,-0.609893,0.946327,-1.046141,-2.057053,-2.056650,...,-1.449095,-1.217685,2.026805,2.121829,3.184256,-1.960146,0.782147,-1.444202,0.915985,0
4,5.599516,-1.776412,-1.304322,-0.720074,5.859373,-3.292432,3.152205,,-3.321076,3.245486,...,6.608729,5.632297,-1.943748,1.169455,3.782513,-4.752822,-7.577624,4.868025,1.708210,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-4.075791,1.107373,-0.932613,-2.595627,-0.574878,2.267418,0.796662,1.653108,-0.238632,1.942756,...,4.162973,-0.775081,-3.287488,-1.626968,0.295721,0.076559,1.457791,-0.175254,2.255203,1
996,-6.462251,-9.573656,1.882201,-1.504172,-2.967003,0.319121,0.718901,-1.068637,-0.619096,5.789733,...,5.774931,0.721375,1.797997,-5.200017,11.284526,-5.930419,0.116175,0.290137,2.845911,1
997,-10.412372,-5.324621,-1.038058,2.017816,-3.426152,1.361407,-2.872490,1.384120,0.262338,-0.550348,...,5.915825,-2.102402,-3.118252,-3.257651,0.466787,-1.556149,8.978047,3.577869,4.281596,1
998,0.288200,2.838634,2.799691,0.939651,0.139238,1.257150,1.215159,-0.674084,-2.951852,4.588736,...,-10.337587,0.416833,-2.631571,2.487803,-6.668468,3.004733,-6.313930,-1.851295,-4.920264,1


In [8]:
df['outliers_in_row_iqr'] = count_nan_in_row(df, threshold=1.5)
df.sort_values('outliers_in_row_iqr', ascending=False).head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,target,outliers_in_row_iqr
397,-12.442543,-2.459275,3.171862,5.067235,-2.780939,1.444773,-1.746286,-5.734828,2.114812,-3.196485,...,-3.39374,-2.579329,-8.560593,-10.082985,4.012569,-1.559773,7.081771,-1.339274,0,3
608,-0.701803,-16.725007,1.609536,-0.475619,-5.280607,-1.537446,2.829144,-0.590706,0.603484,0.988417,...,-4.454336,6.953759,-5.05913,-1.442921,-5.035926,-11.840762,7.829233,-5.707277,1,3
873,4.182081,1.889923,1.166098,2.421076,-0.765722,0.96381,-1.41443,0.911089,-3.466787,2.122968,...,4.611861,-4.840826,6.964789,-5.274794,-0.424702,7.38495,-4.513457,-3.766986,1,2
749,12.06665,-4.374791,-5.823901,2.081024,1.332376,2.041598,1.600493,1.87285,-1.260544,-0.092573,...,5.436412,-2.393311,4.596799,7.433648,-3.443313,14.554311,-2.325073,-2.054129,0,2
572,8.965254,-1.409436,3.224818,-2.343256,0.544765,6.511036,6.730393,1.494908,-2.386115,0.987346,...,2.638753,2.436496,-2.634066,6.698679,-1.390879,-6.226924,-2.571051,-0.808047,1,2


In [9]:
pca_outliers_count(df, threshold=1.5).sort_values(ascending=False).head()

PCA explained variance ratio: [0.26052319 0.20405988]


94     1
28     1
850    1
727    1
915    1
Name: outliers_count, dtype: int64

# zscore

In [10]:
def detect_outliers_in_column(column: pd.Series, threshold: float = 3.0) -> list:
    z_scores = zscore(column)
    outlier_indices = column[(z_scores < -threshold) | (z_scores > threshold)].index.tolist()
    return outlier_indices

def outliers_labeling(df: pd.DataFrame, threshold: float = 3.0,
                      label: np.nan or Any = np.nan) -> pd.DataFrame:
    df_working = df.copy()
    df_working = df_working[df_working.select_dtypes(include=np.number).columns]

    for col in df_working.columns:
        if set(df_working[col].unique()) == {0, 1}:
            continue

        outlier_indices = detect_outliers_in_column(df_working[col], threshold)
        df_working.loc[outlier_indices, col] = label

    return df_working

def count_nan_in_row(df: pd.DataFrame, threshold: float = 3.0) -> pd.DataFrame:
    df_working = df.copy()
    df_working = df_working[df_working.select_dtypes(include=np.number).columns]

    df_working = outliers_labeling(df_working, threshold)
    df_working['outliers_count'] = df_working.isnull().sum(axis=1)

    return df_working['outliers_count']

In [11]:
df['outliers_in_row_zscore'] = count_nan_in_row(df, threshold=3.0)
df.sort_values('outliers_in_row_zscore', ascending=False).head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,target,outliers_in_row_iqr,outliers_in_row_zscore
546,-11.102649,-2.023208,0.007236,0.543936,0.64052,1.489343,-6.017109,-0.640072,1.813126,-2.674619,...,-1.284222,-9.032351,-3.394739,2.11318,8.907846,2.724166,-0.298296,0,2,3
891,-10.867818,-5.593492,1.265347,4.385166,-7.313144,-3.939505,-0.242635,-1.148253,1.762516,-2.920135,...,0.103919,-2.869505,-5.027816,-3.218711,0.321471,5.535868,0.973736,1,2,3
224,-0.174919,10.476314,7.300535,7.460633,2.736197,-3.065765,3.697331,-0.72437,2.807824,-4.149109,...,-3.626593,-0.403161,8.636248,-2.844835,3.381668,0.118299,4.781667,0,2,2
397,-12.442543,-2.459275,3.171862,5.067235,-2.780939,1.444773,-1.746286,-5.734828,2.114812,-3.196485,...,-2.579329,-8.560593,-10.082985,4.012569,-1.559773,7.081771,-1.339274,0,3,2
429,-7.8685,3.698058,-1.382011,5.831259,2.063343,-5.30669,-0.761428,-0.52307,-0.279535,2.475954,...,-9.45671,-0.804665,3.799394,-2.791252,10.179693,2.521716,4.494271,0,2,2
