[Reference](https://medium.com/@tubelwj/guide-to-handling-missing-values-in-pandas-columns-e9580a139400)

# Missing Value Statistics

In [1]:
import pandas as pd
import numpy as np
from tabulate import tabulate  # Import tabulate to format the output

# Create a DataFrame with random missing values
def get_random_missing_data(rows=10, cols=5, missing_ratio=0.2, seed=None):
    """Build a random DataFrame, blank out a share of its cells, and return the rows holding NaN.

    Args:
        rows: Number of rows generated before filtering.
        cols: Number of columns; they are named ``col_A``, ``col_B``, ...
            via ``chr(65 + i)``.
        missing_ratio: Fraction of all cells (``rows * cols``) to set to NaN.
            The count is truncated with ``int()``. Must lie in ``[0, 1]``.
        seed: Optional seed. When given, a private ``np.random.RandomState``
            is used so the result is reproducible and the global NumPy random
            state is left untouched. When ``None``, the global ``np.random``
            state is used.

    Returns:
        A DataFrame containing only the generated rows that have at least one
        NaN, with their original integer index labels preserved.

    Raises:
        ValueError: If ``missing_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= missing_ratio <= 1:
        raise ValueError(
            f"missing_ratio must be between 0 and 1, got {missing_ratio!r}"
        )

    # The np.random module and a RandomState instance both expose rand() and
    # choice(), so either can serve as the source of randomness below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Uniform values in [0, 1), rounded to 2 decimal places
    data = rng.rand(rows, cols).round(2)

    # Pick distinct flat cell positions to blank out
    num_missing = int(rows * cols * missing_ratio)
    missing_indices = rng.choice(data.size, num_missing, replace=False)

    # Flat position i addresses cell (i // cols, i % cols) of the row-major
    # array, so one vectorised assignment replaces a per-cell loop.
    data.flat[missing_indices] = np.nan

    df = pd.DataFrame(data, columns=[f"col_{chr(65 + i)}" for i in range(cols)])  # col_A, col_B, ...

    # Only return rows that have at least one missing value
    return df[df.isna().any(axis=1)]

# Generate the demo frame; only rows holding at least one NaN are returned.
# No seed is set here, so the values differ from run to run.
df = get_random_missing_data()

# Render the frame as a bordered text table, using the column names as headers
table = tabulate(df, headers='keys', tablefmt='pretty')
print(table)

+---+-------+-------+-------+-------+-------+
|   | col_A | col_B | col_C | col_D | col_E |
+---+-------+-------+-------+-------+-------+
| 0 | 0.09  |  nan  | 0.12  | 0.88  | 0.26  |
| 1 | 0.61  |  nan  | 0.86  | 0.41  |  nan  |
| 2 | 0.68  | 0.68  |  nan  | 0.48  | 0.72  |
| 3 | 0.13  | 0.68  |  0.2  |  nan  |  nan  |
| 4 | 0.25  | 0.12  | 0.51  | 0.35  |  nan  |
| 6 | 0.98  |  nan  | 0.98  | 0.25  | 0.39  |
| 8 | 0.35  | 0.46  | 0.17  |  nan  | 0.45  |
| 9 | 0.66  | 0.78  | 0.28  |  nan  | 0.38  |
+---+-------+-------+-------+-------+-------+


In [2]:
# Number of missing entries in each column
na_mask = df.isna()
na_mask.sum()

Unnamed: 0,0
col_A,0
col_B,3
col_C,1
col_D,3
col_E,3


In [3]:
# Share of missing entries in each column (mean of the boolean NaN mask)
na_mask = df.isna()
na_mask.mean()

Unnamed: 0,0
col_A,0.0
col_B,0.375
col_C,0.125
col_D,0.375
col_E,0.375


# Deleting Missing Values

In [4]:
import pandas as pd
import numpy as np

# Hand-written sample: rows 1, 2 and 3 each contain exactly one NaN,
# while col_D has no gaps.
sample_columns = {
    "col_A": [1.23, 2.34, 3.45, np.nan],
    "col_B": [1.12, np.nan, 3.67, 4.89],
    "col_C": [1.78, 2.56, np.nan, 4.23],
    "col_D": [1.22, 2.34, 3.45, 4.56],
}
df = pd.DataFrame(sample_columns)

# Keep only the rows in which every column has a value
df_cleaned = df.dropna(axis="index", how="any")

# Show the surviving rows
print(df_cleaned)

   col_A  col_B  col_C  col_D
0   1.23   1.12   1.78   1.22


In [5]:
import pandas as pd
import numpy as np

# Hand-written sample: row 1 is entirely NaN, rows 2 and 3 are only partly missing
df = pd.DataFrame(
    dict(
        col_A=[1.23, np.nan, 3.45, np.nan],
        col_B=[1.67, np.nan, 3.89, 4.56],
        col_C=[1.01, np.nan, np.nan, 4.23],
        col_D=[1.11, np.nan, 3.67, 4.89],
    )
)

# Remove a row only when every one of its values is missing
df_cleaned = df.dropna(axis="index", how="all")

print(df_cleaned)

   col_A  col_B  col_C  col_D
0   1.23   1.67   1.01   1.11
2   3.45   3.89    NaN   3.67
3    NaN   4.56   4.23   4.89


In [6]:
import pandas as pd
import numpy as np

# Hand-written sample: col_A, col_B and col_C each contain one NaN; col_D is complete
column_names = ["col_A", "col_B", "col_C", "col_D"]
column_values = [
    [1.23, 2.34, 3.45, np.nan],
    [1.56, np.nan, 3.67, 4.78],
    [1.89, 2.90, np.nan, 4.12],
    [1.01, 2.11, 3.22, 4.33],
]
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Remove every column that has at least one missing value
df_cleaned = df.dropna(axis="columns", how="any")

print(df_cleaned)

   col_D
0   1.01
1   2.11
2   3.22
3   4.33


In [7]:
import pandas as pd
import numpy as np

# Hand-written sample: col_A is entirely NaN, the other columns are only partly missing
sample_columns = {
    "col_A": [np.nan] * 4,
    "col_B": [1.23, np.nan, 3.45, 4.56],
    "col_C": [1.67, 2.89, np.nan, 4.12],
    "col_D": [1.11, np.nan, 3.33, 4.44],
}
df = pd.DataFrame(sample_columns)

# Remove a column only when every one of its values is missing
df_cleaned = df.dropna(axis="columns", how="all")

print(df_cleaned)

   col_B  col_C  col_D
0   1.23   1.67   1.11
1    NaN   2.89    NaN
2   3.45    NaN   3.33
3   4.56   4.12   4.44


# Filling Missing Values

In [8]:
import pandas as pd
import numpy as np

# Hand-written sample: col_A, col_B and col_C each have one NaN; col_D is complete
sample_columns = {
    "col_A": [1.00, 2.00, 3.00, np.nan],
    "col_B": [1.00, np.nan, 3.00, 4.00],
    "col_C": [1.00, 2.00, np.nan, 4.00],
    "col_D": [1.00, 2.00, 3.00, 4.00],
}
df = pd.DataFrame(sample_columns)

# Replace every NaN with the constant -1; fillna returns a new frame, df is unchanged
print("Fill NaN with -1:")
filled = df.fillna(value=-1)
print(filled)

Fill NaN with -1:
   col_A  col_B  col_C  col_D
0    1.0    1.0    1.0    1.0
1    2.0   -1.0    2.0    2.0
2    3.0    3.0   -1.0    3.0
3   -1.0    4.0    4.0    4.0


In [9]:
import pandas as pd
import numpy as np

# NOTE(review): this cell repeats the fillna(-1) example from the preceding cell
# unchanged; presumably a different fill strategy was meant here — confirm
# against the referenced article.

# Hand-written sample: col_A, col_B and col_C each have one NaN; col_D is complete
df = pd.DataFrame(
    dict(
        col_A=[1.00, 2.00, 3.00, np.nan],
        col_B=[1.00, np.nan, 3.00, 4.00],
        col_C=[1.00, 2.00, np.nan, 4.00],
        col_D=[1.00, 2.00, 3.00, 4.00],
    )
)

# Substitute the constant -1 for every NaN and show the result
print("Fill NaN with -1:")
filled = df.fillna(value=-1)
print(filled)

Fill NaN with -1:
   col_A  col_B  col_C  col_D
0    1.0    1.0    1.0    1.0
1    2.0   -1.0    2.0    2.0
2    3.0    3.0   -1.0    3.0
3   -1.0    4.0    4.0    4.0


In [10]:
import pandas as pd
import numpy as np

# NOTE(review): this cell is another verbatim repeat of the fillna(-1) example
# shown in the two preceding cells; presumably a different fill strategy was
# meant here — confirm against the referenced article.

# Hand-written sample: col_A, col_B and col_C each have one NaN; col_D is complete
column_names = ["col_A", "col_B", "col_C", "col_D"]
column_values = [
    [1.00, 2.00, 3.00, np.nan],
    [1.00, np.nan, 3.00, 4.00],
    [1.00, 2.00, np.nan, 4.00],
    [1.00, 2.00, 3.00, 4.00],
]
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Substitute the constant -1 for every NaN and show the result
print("Fill NaN with -1:")
filled = df.fillna(value=-1)
print(filled)

Fill NaN with -1:
   col_A  col_B  col_C  col_D
0    1.0    1.0    1.0    1.0
1    2.0   -1.0    2.0    2.0
2    3.0    3.0   -1.0    3.0
3   -1.0    4.0    4.0    4.0
