In [48]:
import pandas as pd
import numpy as np

# FINDING OUTLIERS IN THE DATASET USING INTERQUARTILE RANGE (IQR) CONCEPT
The function below finds the outliers in each numeric column of the DataFrame, prints how many each column has, and returns them.

In [61]:
# Demo frame for the outlier search. Each numeric feature is a run of ordinary
# values followed by a handful of extremes; Feature3 mixes strings with numbers,
# so it is not a numeric column.
dataFrame1 = pd.DataFrame(data={
    'Feature1': [
        12, 15, 17, 20, 22, 25, 27, 30, 32, 35,
        45, 80, 100, 120,
        -8, -10, -15, -18, -55,
    ],
    'Feature2': [
        2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
        200, 400, 600, 800,
        -2, -4, -6, -8, -76,
    ],
    'Feature3': [
        'NOT A NUMERIC FEATURE', 0.7, 0.9, 1.1, 1.3, 'HOWDY', 1.7, 1.9, 2.1, 2.3,
        23.5, 45.6, 67.8, 89.0,
        -0.1, -0.2, -0.3, -0.4, -87,
    ],
    'Feature4': [
        -50, -45, -40, -35, -30, -25, -20, -15, -10, -5,
        50, 55, 60, 65, 70, 75, 80, 85, 276,
    ],
})


### Give this function a DataFrame and it will find the outliers in each numeric feature using the interquartile range (IQR) rule

def find_outliers(data = None, columns = None):
    """Find the IQR outliers of each numeric column of a DataFrame.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the column's 25th and 75th
    percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified.
    columns : str or iterable of str, optional
        Column(s) to inspect. ``None`` (the default) inspects every column.
        Non-numeric columns are skipped.

    Returns
    -------
    dict
        Maps each inspected numeric column name to the list of its outlier
        values in ascending order (an empty list when it has none). Keys
        follow the frame's own column order.

    Raises
    ------
    ValueError
        If ``data`` is None.
    KeyError
        If ``columns`` names a column that is not present in ``data``.

    Notes
    -----
    Prints one summary line and a separator for every inspected numeric column.
    Missing values (NaN) are ignored when computing the quartiles.
    """
    if data is None:
        raise ValueError('find_outliers() needs a DataFrame, got None')

    if columns is None:
        selected = list(data.columns)
    else:
        requested = [columns] if isinstance(columns, str) else list(columns)
        missing = [name for name in requested if name not in data.columns]
        if missing:
            raise KeyError(f'columns not found in data: {missing}')
        # Walk the frame's own column order so the report does not depend on
        # how the caller happened to order `columns`.
        selected = [name for name in data.columns if name in requested]

    outliers = {}
    for i in selected:
        if not pd.api.types.is_numeric_dtype(data[i].dtype):
            continue  # only numeric columns have quartiles

        # Drop NaN first: a single NaN would turn both quartiles into NaN and
        # every comparison below would be False, hiding all outliers.
        values = data[i].dropna().sort_values()
        if values.empty:
            outliers[i] = []
        else:
            q1, q3 = np.percentile(values.to_numpy(dtype=float), [25, 75])
            IQR = q3 - q1
            lower_bracket = q1 - (1.5*IQR)
            upper_bracket = q3 + (1.5*IQR)
            is_outlier = (values < lower_bracket) | (values > upper_bracket)
            # `values` is sorted, so the selected outliers come out ascending.
            outliers[i] = values[is_outlier].tolist()

        print(f'{i} : {len(outliers[i])} outliers')
        print('-'*10)
    return outliers
    
# Feature3 holds strings, so the numeric-dtype check inside find_outliers skips it.
result = find_outliers(
    data=dataFrame1,
    columns=['Feature2', 'Feature3', 'Feature1', 'Feature4'],
)
result

Feature1 : 3 outliers
----------
Feature2 : 5 outliers
----------
Feature4 : 1 outliers
----------


{'Feature1': [-55, 100, 120],
 'Feature2': [-76, 200, 400, 600, 800],
 'Feature4': [276]}

# SEARCH AND DELETE OUTLIERS IN THE DATASET USING INTERQUARTILE RANGE (IQR) CONCEPT
The function below finds the outliers in each numeric column it is given and drops the rows that contain them.

In [57]:
# Demo frame for the outlier removal. Feature1 starts with a string, which makes
# the whole column non-numeric; the remaining features are numeric and carry a
# few extreme values.
dataFrame1 = pd.DataFrame(data={
    'Feature1': [
        'NOT A NUMERIC FEATURE', 15, 17, 20, 22, 25, 27, 30, 32, 35,
        45, 80, 100, 9000,
        -8, -10, -15, -18, -55,
    ],
    'Feature2': [
        2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
        200, 40, 600, 8,
        -2, -4, -6, -8, -76,
    ],
    'Feature3': [
        0.5, 500, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3,
        23.5, 45.6, 67.8, 2.1,
        -0.1, -0.2, -0.3, -0.4, -87,
    ],
    'Feature4': [
        -50, -45, -40, -35, -30, -25, -20, -15, -10, -5,
        50, 55, 60, 65, 70, 75, 80, 85, 276,
    ],
})

### Give this function a DataFrame and a list of columns and it will drop every row whose value in one of those numeric columns is an outlier under the interquartile range (IQR) rule

def delete_outliers(data = None, columns = None):
    """Drop the rows whose value in a selected numeric column is an IQR outlier.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the column's 25th and 75th
    percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. The caller's frame is not modified.
    columns : str or iterable of str, optional
        Column(s) to screen, in the order they should be processed. ``None``
        (the default) screens every column. Non-numeric columns are skipped.

    Returns
    -------
    pandas.DataFrame
        The rows that survive every screened column, with their original
        index labels. When no column triggers a filter the input frame
        itself is returned.

    Raises
    ------
    ValueError
        If ``data`` is None.
    KeyError
        If ``columns`` names a column that is not present in ``data``.

    Notes
    -----
    Columns are screened one after another: each column's quartiles are
    computed on the rows that survived the previous columns, so the result
    can depend on the order of ``columns``. Missing values (NaN) are ignored
    when computing the quartiles, and rows holding NaN are kept.
    """
    if data is None:
        raise ValueError('delete_outliers() needs a DataFrame, got None')

    if columns is None:
        columns = list(data.columns)  # default: screen every column
    elif isinstance(columns, str):
        columns = [columns]  # a bare name would otherwise be iterated per character

    for i in columns:
        if not pd.api.types.is_numeric_dtype(data[i].dtype):
            continue  # only numeric columns have quartiles

        # Drop NaN first: a single NaN would turn both quartiles into NaN and
        # no row would ever be removed.
        observed = data[i].dropna()
        if observed.empty:
            continue

        q1, q3 = np.percentile(observed.to_numpy(dtype=float), [25, 75])
        IQR = q3 - q1
        lower_bracket = q1 - (1.5*IQR)
        upper_bracket = q3 + (1.5*IQR)
        # Keep rows inside the brackets; NaN compares False on both sides, so
        # rows with a missing value in this column are kept.
        data = data[~((data[i] < lower_bracket) | (data[i] > upper_bracket))]
    return data
    
# Feature3 is left out of `columns`, so its values are not screened; Feature1
# contains a string, so the numeric-dtype check inside delete_outliers skips it.
new_df = delete_outliers(
    data=dataFrame1,
    columns=['Feature1', 'Feature2', 'Feature4'],
)
new_df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4
0,NOT A NUMERIC FEATURE,2,0.5,-50
1,15,4,500.0,-45
2,17,6,0.9,-40
3,20,8,1.1,-35
4,22,10,1.3,-30
5,25,12,1.5,-25
6,27,14,1.7,-20
7,30,16,1.9,-15
8,32,18,2.1,-10
9,35,20,2.3,-5
