# Outlier Handler


In [None]:
from typing import Union
import pandas as pd
import numpy as np
from scipy.stats import zscore


### Detect Outliers


In [None]:
def identify_outliers_iqr(dataframe: pd.DataFrame, column: Union[str, int], threshold: float = 1.5) -> pd.Index:
    """Return the index labels of rows whose value in *column* is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - threshold * IQR`` or
    strictly above ``Q3 + threshold * IQR``, where ``Q1``/``Q3`` are the 25th
    and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        dataframe: Data to inspect; it is not modified.
        column: Label of the column to analyse.
        threshold: Multiplier applied to the IQR to build the fences.

    Returns:
        Index labels (not a DataFrame) of the rows flagged as outliers.

    Raises:
        ValueError: If *column* is not a column of *dataframe*.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    values = dataframe[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr

    # Comparisons against NaN are False, so missing values are never flagged.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return dataframe.loc[is_outlier].index


In [None]:
def identify_outliers_zscore(dataframe: pd.DataFrame, column: Union[str, int], threshold: float = 3) -> pd.Index:
    """Return the index labels of rows whose absolute z-score exceeds *threshold*.

    Z-scores are computed with ``scipy.stats.zscore`` (population standard
    deviation, i.e. scipy's default ``ddof=0``). Missing values are left out
    of the mean/standard-deviation computation and are never flagged.

    Args:
        dataframe: Data to inspect; it is not modified.
        column: Label of the column to analyse.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        Index labels of the rows flagged as outliers.

    Raises:
        ValueError: If *column* is not a column of *dataframe*.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    # With the default nan_policy='propagate' a single NaN turns every score
    # into NaN, so nothing would ever be reported; 'omit' keeps the NaN rows as
    # NaN scores but computes the statistics from the remaining values.
    scores = zscore(dataframe[column].to_numpy(), nan_policy='omit')
    z_scores = np.abs(np.asarray(scores))

    # NaN > threshold is False, so rows with missing values are not selected.
    return dataframe[z_scores > threshold].index


In [None]:
def identify_outliers_frequency(dataframe: pd.DataFrame, column: Union[str, int], threshold=0.05):
    """Return the index labels of rows holding a rarely occurring value.

    A value is "rare" when its relative frequency in *column* is strictly
    below *threshold*. Frequencies come from ``value_counts(normalize=True)``,
    which by default leaves missing values out of the tally.

    Args:
        dataframe: Data to inspect; it is not modified.
        column: Label of the column to analyse.
        threshold: Relative-frequency cut-off (a fraction of the counted rows).

    Returns:
        Index labels of the rows whose value in *column* is rare.

    Raises:
        ValueError: If *column* is not a column of *dataframe*.
    """
    if column in dataframe.columns:
        series = dataframe[column]
        # Share of each distinct value among the counted rows.
        shares = series.value_counts(normalize=True)
        infrequent = shares.loc[shares < threshold].index
        return dataframe.loc[series.isin(infrequent)].index

    raise ValueError(f"Column '{column}' not found in DataFrame.")


### Handle Outliers


In [None]:
def handle_outliers(dataframe: pd.DataFrame, column: Union[str, int], outlier_indices: pd.Index, method: str = 'drop') -> pd.DataFrame:
    """Return a copy of *dataframe* with outliers in *column* handled.

    Supported methods:

    * ``'drop'`` - remove the rows labelled by *outlier_indices*.
    * ``'log'``  - replace the whole column with its natural logarithm;
      non-positive (and missing) values become NaN.
    * ``'sqrt'`` - replace the whole column with its square root; negative
      (and missing) values become NaN.

    Note that ``'log'`` and ``'sqrt'`` transform every row of the column and
    do not use *outlier_indices*.

    Args:
        dataframe: Source data; it is never modified in place.
        column: Label of the column to process.
        outlier_indices: Index labels of the outlier rows (used by ``'drop'``).
        method: One of ``'drop'``, ``'log'`` or ``'sqrt'``.

    Returns:
        A new DataFrame with the chosen treatment applied.

    Raises:
        ValueError: If *column* is not in *dataframe* or *method* is not one
            of the supported names.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    # Fail loudly on a misspelled method instead of silently returning an
    # untouched copy.
    if method not in ('drop', 'log', 'sqrt'):
        raise ValueError(
            f"Unsupported method '{method}'. Expected one of: 'drop', 'log', 'sqrt'."
        )

    df_copy = dataframe.copy()

    if method == 'drop':
        # NOTE: the original author's reminder here was to follow up with the
        # desired filling function from MissingValueHandler after dropping.
        df_copy.drop(outlier_indices, inplace=True)
    elif method == 'log':
        values = df_copy[column]
        # Mask values outside the log domain to NaN first, then transform.
        df_copy[column] = np.log(values.where(values > 0))
    else:  # method == 'sqrt'
        values = df_copy[column]
        # Mask negative values to NaN first, then transform.
        df_copy[column] = np.sqrt(values.where(values >= 0))

    return df_copy


### Test


In [None]:
# Smoke test: load the movies dataset from a path relative to the working directory.
df = pd.read_csv("dataset/movies.csv")
# Parse 'Release Date' as datetimes; errors='coerce' turns unparseable entries into NaT.
# NOTE(review): this conversion is not used by the call below.
df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')
# Run the IQR detector on the 'Rating' column with the default threshold (1.5).
# Presumably left as the cell's last expression so the notebook displays the result.
identify_outliers_iqr(df, 'Rating')
