# Missing Value Handler


In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Union


### Analyse Missing Values


In [2]:
def calculate_nan_ratios(df: pd.DataFrame, none_values: list = None) -> pd.Series:
    """Return the fraction of missing entries in each column of ``df``.

    Args:
        df: Frame to analyse. It is not modified.
        none_values: Optional placeholder values (for example ``"?"`` or
            ``-1``) that are counted as missing in addition to the entries
            pandas already treats as missing.

    Returns:
        A Series indexed by column label whose values are the share of
        missing entries in that column. The ratios are NaN when ``df`` has
        no rows.
    """
    if none_values is not None:
        # replace() returns a new frame, so the caller's data stays untouched.
        df = df.replace(none_values, np.nan)
    # The mean of the boolean mask equals the missing count over the row count.
    return df.isna().mean()


In [3]:
def print_nan_ratios(df: pd.DataFrame, none_values: list = None):
    """Print the missing-value ratio, data type and a status for every column.

    Args:
        df: Frame to report on.
        none_values: Optional placeholder values forwarded to
            ``calculate_nan_ratios`` so they are counted as missing.

    The data type shown is taken from ``df`` as passed in, not from the frame
    in which ``none_values`` were replaced.
    """
    def get_status(ratio):
        # More than 20% missing is critical, 5% to 20% is acceptable,
        # everything else (including a NaN ratio) is reported as good.
        if ratio > 0.20:
            return 'Critical'
        if ratio >= 0.05:
            return 'Acceptable'
        return 'Good'

    type_checks = pd.api.types
    for column, ratio in calculate_nan_ratios(df, none_values).items():
        series = df[column]
        # Use friendlier labels for datetime and string columns; any other
        # column keeps its raw pandas dtype.
        if type_checks.is_datetime64_any_dtype(series):
            dtype = 'datetime'
        elif type_checks.is_string_dtype(series):
            dtype = 'string'
        else:
            dtype = series.dtype
        status = get_status(ratio)
        print(f"'{column}' \nnone value ratio: {ratio:.2%} | Data type: {dtype} | Status: {status}")


### Handle Missing Values


In [4]:
from enum import Enum 
class Strategy(Enum):
    """Available ways of handling the missing values of a single column.

    ``replace_missing_values`` selects the handler function from the member
    that is passed in.
    """
    MODE = 0  # dispatched to replace_mode
    MEAN = 1  # dispatched to replace_mean
    MEDIAN = 2  # dispatched to replace_median
    CONSTANT = 3  # dispatched to replace_constant together with `const`
    REMOVE_ROW = 4  # dispatched to replace_remove_row
    REMOVE_COLUMN = 5  # dispatched to replace_remove_column
    FORWARD = 6  # dispatched to replace_forward_backward with "ffill"
    BACKWARD = 7  # dispatched to replace_forward_backward with "bfill"


In [5]:
def replace_mode(dataframe: pd.DataFrame, column: Union[int, str]) -> pd.DataFrame:
    """Fill the missing values of ``column`` with its most frequent value.

    Args:
        dataframe: Source frame. It is not modified.
        column: Label of the column to fill.

    Returns:
        A copy of ``dataframe`` in which the missing entries of ``column``
        are replaced by the column's mode. If several values tie for the
        mode, the first one returned by ``Series.mode`` is used.

    Raises:
        ValueError: If the column has no mode, i.e. it holds no non-missing
            values.
    """
    # Compute the mode from the frame that was passed in, never from a global.
    modes = dataframe[column].mode()
    if modes.empty:
        raise ValueError(
            f"There is no mode value for column '{column}'. Skipping mode replacement..."
        )

    df_copy = dataframe.copy()
    df_copy[column] = df_copy[column].fillna(modes.iloc[0])
    return df_copy


In [6]:
def replace_mean(dataframe: pd.DataFrame, column: Union[int, str]) -> pd.DataFrame:
    """Fill the missing values of a numeric column with the column mean.

    Args:
        dataframe: Source frame. It is not modified.
        column: Label of the column to fill.

    Returns:
        A copy of ``dataframe`` with the missing entries of ``column``
        replaced by the mean of that column.

    Raises:
        ValueError: If ``column`` is not numeric.
    """
    # Guard clause: a mean only exists for numeric data.
    if not pd.api.types.is_numeric_dtype(dataframe[column]):
        raise ValueError(f"Column '{column}' is not numeric. Skipping mean replacement...")

    filled = dataframe.copy()
    filled[column] = filled[column].fillna(filled[column].mean())
    return filled


In [7]:
def replace_median(dataframe: pd.DataFrame, column: Union[int, str]) -> pd.DataFrame:
    """Fill the missing values of a numeric column with the column median.

    Args:
        dataframe: Source frame. It is not modified.
        column: Label of the column to fill.

    Returns:
        A copy of ``dataframe`` with the missing entries of ``column``
        replaced by the median of that column.

    Raises:
        ValueError: If ``column`` is not numeric.
    """
    # Raise explicitly instead of asserting: assertions are stripped when
    # Python runs with -O, and replace_mean reports the same problem with a
    # ValueError.
    if not pd.api.types.is_numeric_dtype(dataframe[column]):
        raise ValueError(f"Column '{column}' is not numeric. Skipping median replacement.")

    df_copy = dataframe.copy()
    median_value = df_copy[column].median()
    df_copy[column] = df_copy[column].fillna(median_value)
    return df_copy


In [8]:
def replace_constant(dataframe: pd.DataFrame, column: Union[int, str], const: Union[int, float, str, datetime]) -> pd.DataFrame:
    """Fill the missing values of ``column`` with a constant of a matching type.

    Accepted combinations:
        * numeric column with an ``int``, ``float`` or NumPy numeric scalar
        * datetime column with a ``datetime`` (``pd.Timestamp`` included)
        * string column with a ``str``

    Args:
        dataframe: Source frame. It is not modified.
        column: Label of the column to fill.
        const: Value written into the missing entries.

    Returns:
        A copy of ``dataframe`` with the missing entries of ``column``
        replaced by ``const``.

    Raises:
        ValueError: If the type of ``const`` does not fit the column's dtype.
    """
    series = dataframe[column]
    type_checks = pd.api.types

    if type_checks.is_numeric_dtype(series):
        compatible = isinstance(const, (int, float, np.number))
    elif type_checks.is_datetime64_any_dtype(series):
        compatible = isinstance(const, datetime)
    else:
        # Judge "string-ness" on the non-missing values only: depending on the
        # pandas version, is_string_dtype may report False for an object
        # column that still contains NaN, which is exactly the case to fill.
        compatible = isinstance(const, str) and type_checks.is_string_dtype(series.dropna())

    if not compatible:
        raise ValueError(f"Unsupported const type for column '{column}'")

    df_copy = dataframe.copy()
    df_copy[column] = df_copy[column].fillna(const)
    return df_copy


In [9]:
def replace_remove_row(dataframe: pd.DataFrame, column: Union[int, str]) -> pd.DataFrame:
    """Return a copy of ``dataframe`` without the rows missing a value in ``column``.

    Args:
        dataframe: Source frame. It is not modified.
        column: Label of the column whose missing entries decide which rows go.

    Returns:
        A new frame containing only the rows where ``column`` is present.
        The surviving rows keep their original index labels.
    """
    return dataframe.copy().dropna(subset=[column])

def replace_remove_column(dataframe: pd.DataFrame, column: Union[int, str]) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with ``column`` dropped entirely.

    Args:
        dataframe: Source frame. It is not modified.
        column: Label of the column to remove.

    Returns:
        A new frame holding every column of ``dataframe`` except ``column``.
    """
    return dataframe.copy().drop(columns=[column])


In [10]:
def replace_forward_backward(dataframe: pd.DataFrame, column: Union[int, str], method: str = "ffill") -> pd.DataFrame:
    """Fill the missing values of ``column`` from neighbouring rows.

    Args:
        dataframe: Source frame. It is not modified.
        column: Label of the column to fill.
        method: ``"ffill"`` copies the last valid value forward,
            ``"bfill"`` copies the next valid value backward.

    Returns:
        A copy of ``dataframe`` with ``column`` filled in the chosen
        direction. Leading gaps (forward fill) or trailing gaps (backward
        fill) stay missing because they have no neighbour to copy from.

    Raises:
        ValueError: If ``method`` is neither ``"ffill"`` nor ``"bfill"``.
    """
    if method not in ("ffill", "bfill"):
        raise ValueError(f"Unsupported fill method '{method}'; expected 'ffill' or 'bfill'.")

    df_copy = dataframe.copy()
    series = df_copy[column]
    # Call ffill()/bfill() directly: fillna(method) would treat the method
    # name as the fill value and write the literal string into the column.
    df_copy[column] = series.ffill() if method == "ffill" else series.bfill()
    return df_copy


In [11]:
def replace_missing_values(dataframe: pd.DataFrame, strategy: Strategy = Strategy.MEAN, column: Union[int, str] = 0, const : Union[int, str, datetime] = np.nan) -> pd.DataFrame:
    """Handle the missing values of one column with the selected strategy.

    Args:
        dataframe: Source frame, handed to the strategy-specific helper.
        strategy: Member of ``Strategy`` choosing the helper to run.
        column: Label of the column to process.
        const: Fill value; only used by ``Strategy.CONSTANT``.

    Returns:
        The frame returned by the helper that belongs to ``strategy``.

    Raises:
        ValueError: If ``column`` is not a column of ``dataframe`` or if
            ``strategy`` matches none of the known strategies.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    # Ordered (strategy, handler) pairs; the handlers are lazy so only the
    # selected helper actually runs.
    handlers = (
        (Strategy.MODE, lambda: replace_mode(dataframe, column)),
        (Strategy.MEAN, lambda: replace_mean(dataframe, column)),
        (Strategy.MEDIAN, lambda: replace_median(dataframe, column)),
        (Strategy.CONSTANT, lambda: replace_constant(dataframe, column, const)),
        (Strategy.REMOVE_ROW, lambda: replace_remove_row(dataframe, column)),
        (Strategy.REMOVE_COLUMN, lambda: replace_remove_column(dataframe, column)),
        (Strategy.FORWARD, lambda: replace_forward_backward(dataframe, column, "ffill")),
        (Strategy.BACKWARD, lambda: replace_forward_backward(dataframe, column, "bfill")),
    )
    for candidate, handler in handlers:
        if strategy == candidate:
            return handler()

    raise ValueError("Invalid strategy")


### Test


In [12]:
# Load the movies dataset; the path is relative to the current working directory.
df = pd.read_csv("dataset/movies.csv")
# Parse 'Release Date' into datetimes; errors='coerce' turns unparseable
# entries into NaT instead of raising.
df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')
# Print the missing-value ratio, data type and status of every column.
print_nan_ratios(df)


'Title' 
none value ratio: 0.00% | Data type: string | Status: Good
'Director' 
none value ratio: 0.00% | Data type: string | Status: Good
'Genre' 
none value ratio: 0.00% | Data type: string | Status: Good
'Release Date' 
none value ratio: 0.00% | Data type: datetime | Status: Good
'Duration' 
none value ratio: 0.00% | Data type: int64 | Status: Good
'Rating' 
none value ratio: 0.00% | Data type: float64 | Status: Good


In [13]:
# df.info() prints its summary itself and returns None, so wrapping it in
# print() also emits a trailing "None" line after the summary.
print(df.info())
# Last expression of the cell: the summary statistics are shown as the cell output.
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Title         30000 non-null  object        
 1   Director      30000 non-null  object        
 2   Genre         30000 non-null  object        
 3   Release Date  30000 non-null  datetime64[ns]
 4   Duration      30000 non-null  int64         
 5   Rating        30000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 1.4+ MB
None


Unnamed: 0,Release Date,Duration,Rating
count,30000,30000.0,30000.0
mean,2000-07-20 01:58:50.880000,120.131033,5.519867
min,1980-01-01 00:00:00,60.0,1.0
25%,1990-04-19 00:00:00,90.0,3.3
50%,2000-08-11 00:00:00,120.0,5.5
75%,2010-11-04 06:00:00,150.0,7.7
max,2020-12-30 00:00:00,180.0,10.0
std,,34.981221,2.591138
