<div class="alert alert-warning" role="alert">
    <b style="font-size: 1.5em;">🚧 Warning</b>
    <p>
    Just as each person is a new door to a different world, <b>missing values</b> come in different shapes and colors. When working with missing values, it is crucial to understand their various representations. Even if the dataset appears to contain no missing values, you must be able to look beyond what is immediately visible to unveil what lies hidden beneath the surface.
    </p>
</div>

In [25]:
import pandas as pd
import sys
import pyprojroot
import numpy as np
sys.path.append(str(pyprojroot.here()))
from src.utils import make_dir_function
from src.pandas_missing_extension import MissingMethods

### Common missing values

In [5]:
# Strings that commonly stand in for a missing value in raw data: the word
# "missing", upper- and lower-case spellings of "NA" / "N/A" with stray
# whitespace, NULL markers, the empty string, and single-character placeholders.
# Kept as a tuple so the collection cannot be mutated by accident.
common_na_strings = (
    "missing",
    # Upper-case "NA" / "N/A" variants, including stray whitespace.
    "NA",
    "N A",
    "N/A",
    "#N/A",
    "NA ",
    " NA",
    "N /A",
    "N / A",
    " N / A",
    "N / A ",
    # Lower-case counterparts of the variants above.
    "na",
    "n a",
    "n/a",
    "na ",
    " na",
    "n /a",
    "n / a",
    " n / a",  # previously " a / a": a typo for the lower-case form of " N / A"
    "n / a ",
    # Explicit null markers.
    "NULL",
    "null",
    # Empty string and single-character placeholders.
    "",
    "?",
    "*",
    ".",
)

In [9]:
# Toy frame in which missingness is disguised: "NA" / "N/A" are ordinary
# strings and negative integers such as -99 / -1 appear alongside real values.
missing_data_example_df = pd.DataFrame(
    {
        "x": [1, 3, "NA", -99, -98, -99],
        "y": ["A", "N/A", "NA", "E", "F", "G"],
        "z": [-100, -99, -98, -101, -1, -1],
    }
)

missing_data_example_df

Unnamed: 0,x,y,z
0,1.0,A,-100
1,3.0,,-99
2,,,-98
3,-99.0,E,-101
4,-98.0,F,-1
5,-99.0,G,-1


In [None]:
# The "NA" and "N/A" entries are plain strings, so they are not counted as
# missing values: the reported count below is 0.
missing_data_example_df.missing.number_missing()

0

### How to detect missing values?

In [11]:
# Inspect the distinct values of column "x" to spot disguised missing markers.
missing_data_example_df["x"].unique()

array([1, 3, 'NA', -99, -98], dtype=object)

In [None]:
# Distinct values of every object-typed column, one entry per column.
(
    missing_data_example_df
    .select_dtypes(include=object)
    .apply(pd.unique)
)

x     [1, 3, NA, -99, -98]
y    [A, N/A, NA, E, F, G]
dtype: object

### How to deal with missing values

When reading a file

In [16]:
# Build a path helper for the "data" directory with the project's
# make_dir_function (presumably rooted at the project directory — confirm in src.utils).
data_dir = make_dir_function("data")
# Location of the raw example CSV that the following cells read.
example_data_file = data_dir("raw", "missing_data_encoding_example.csv")
example_data_file

PosixPath('/home/alex/courses/data_scientist/handling_missing_data/data/raw/missing_data_encoding_example.csv')

In [20]:
# Baseline read: no extra missing-value markers are supplied.
without_filter = pd.read_csv(example_data_file)

# Same file, but the numeric codes -99 and -1 are additionally treated as missing.
with_filter = pd.read_csv(example_data_file, na_filter=True, na_values=[-99, -1])

# Print both frames one after the other for comparison.
print(without_filter, with_filter, sep="\n")

      x    y    z
0   1.0    A -100
1   3.0  NaN  -99
2   NaN  NaN  -98
3 -99.0    E -101
4 -98.0    F   -1
5 -99.0    G   -1
      x    y      z
0   1.0    A -100.0
1   3.0  NaN    NaN
2   NaN  NaN  -98.0
3   NaN    E -101.0
4 -98.0    F    NaN
5   NaN    G    NaN


Replace the DF values

In [27]:
# Show the untouched frame first for reference.
print(missing_data_example_df)
# Replace both encodings (-99 and "NA") with NaN wherever they occur; the
# result is only displayed, it is not assigned back to the original name.
missing_data_example_df.replace(to_replace=[-99, "NA"], value=np.nan)

     x    y    z
0    1    A -100
1    3  N/A  -99
2   NA   NA  -98
3  -99    E -101
4  -98    F   -1
5  -99    G   -1


Unnamed: 0,x,y,z
0,1.0,A,-100.0
1,3.0,,
2,,,-98.0
3,,E,-101.0
4,-98.0,F,-1.0
5,,G,-1.0


Targeted substitution

In [30]:
# Show the untouched frame first for reference.
print(missing_data_example_df)
# Column-specific substitution: each column gets its own {old value: NaN} mapping,
# and column "z" is not listed at all.
# NOTE(review): column "y" also holds the string "NA", which this mapping
# does not replace — confirm that is intended.
missing_data_example_df.replace(
    to_replace={
        "x": dict.fromkeys([-99, "NA"], np.nan),
        "y": {"N/A": np.nan},
    }
)

     x    y    z
0    1    A -100
1    3  N/A  -99
2   NA   NA  -98
3  -99    E -101
4  -98    F   -1
5  -99    G   -1


Unnamed: 0,x,y,z
0,1.0,A,-100
1,3.0,,-99
2,,,-98
3,,E,-101
4,-98.0,F,-1
5,,G,-1
