[Reference](https://faun.pub/missing-data-in-python-no-problem-a7b8d7c91a0a)

# Creating a Sample Dataset

In [1]:
import pandas as pd
import numpy as np

# Creating a sample dataset
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, np.nan, 30, np.nan, 28],
    'Salary': [50000, 54000, np.nan, 62000, 58000]
}
df = pd.DataFrame(data)

print("Sample DataFrame:\n", df)

Sample DataFrame:
       Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob   NaN  54000.0
2  Charlie  30.0      NaN
3    David   NaN  62000.0
4      Eve  28.0  58000.0


# Total sum of Missing Values

In [3]:
import pandas as pd
import numpy as np

# Creating a sample dataset
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, np.nan, 30, np.nan, 28],
    'Salary': [50000, 54000, np.nan, 62000, 58000]
}
df = pd.DataFrame(data)

# print("Sample DataFrame:\n", df)
# Calculating the total number of null values in the DataFrame
total_null_values = df.isnull().sum().sum()


print("Total number of null values in the DataFrame:", total_null_values)

Total number of null values in the DataFrame: 3


# Total sum of Missing Values in Each Column

In [4]:
import pandas as pd
import numpy as np

# Creating a sample dataset
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, np.nan, 30, np.nan, 28],
    'Salary': [50000, 54000, np.nan, 62000, 58000]
}
df = pd.DataFrame(data)

# print("Sample DataFrame:\n", df)



# Summing up all the null values in each column
null_values_sum = df.isnull().sum()


print("Sum of null values in each column:\n", null_values_sum)

Sum of null values in each column:
 Name      0
Age       2
Salary    1
dtype: int64


# Checking the Percentage of Missing Values

In [5]:
# Percentage of missing values per column
missing_percentage = (df.isnull().sum() / len(df)) * 100
print("\nPercentage of missing values per column:\n", missing_percentage)


Percentage of missing values per column:
 Name       0.0
Age       40.0
Salary    20.0
dtype: float64


# Dropping Missing Values

In [6]:
# Drop rows with any missing values
df_dropped_rows = df.dropna()
print("\nDataFrame after dropping rows with missing values:\n", df_dropped_rows)


DataFrame after dropping rows with missing values:
     Name   Age   Salary
0  Alice  25.0  50000.0
4    Eve  28.0  58000.0


In [7]:
# Drop columns with any missing values
df_dropped_columns = df.dropna(axis=1)
print("\nDataFrame after dropping columns with missing values:\n", df_dropped_columns)


DataFrame after dropping columns with missing values:
       Name
0    Alice
1      Bob
2  Charlie
3    David
4      Eve


# Filling Missing Values

In [8]:
# Fill missing values with a specific value
df_filled_zero = df.fillna(0)
print("\nDataFrame after filling missing values with 0:\n", df_filled_zero)


DataFrame after filling missing values with 0:
       Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob   0.0  54000.0
2  Charlie  30.0      0.0
3    David   0.0  62000.0
4      Eve  28.0  58000.0


In [9]:
# Fill missing values with the mean of the column
df_filled_mean = df.copy()
df_filled_mean['Age'].fillna(df['Age'].mean(), inplace=True)
print("\nDataFrame after filling 'Age' missing values with mean:\n", df_filled_mean)

# Sometimes you might need to use np.mean instead of .mean()

# Fill missing values with the mean of the 'Age' column using np.mean
df_filled_mean = df.copy()
df_filled_mean['Age'].fillna(np.mean(df['Age']), inplace=True)
print("\nDataFrame after filling 'Age' missing values with mean:\n", df_filled_mean)


DataFrame after filling 'Age' missing values with mean:
       Name        Age   Salary
0    Alice  25.000000  50000.0
1      Bob  27.666667  54000.0
2  Charlie  30.000000      NaN
3    David  27.666667  62000.0
4      Eve  28.000000  58000.0

DataFrame after filling 'Age' missing values with mean:
       Name        Age   Salary
0    Alice  25.000000  50000.0
1      Bob  27.666667  54000.0
2  Charlie  30.000000      NaN
3    David  27.666667  62000.0
4      Eve  28.000000  58000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled_mean['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled_mean['Age'].fillna(np.mean(df['Age']), inplace=True)


In [10]:
# Forward fill (fill missing values with previous row's value)
df_ffill = df.fillna(method='ffill')
print("\nDataFrame after forward fill:\n", df_ffill)

# Backward fill (fill missing values with next row's value)
df_bfill = df.fillna(method='bfill')
print("\nDataFrame after backward fill:\n", df_bfill)


DataFrame after forward fill:
       Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob  25.0  54000.0
2  Charlie  30.0  54000.0
3    David  30.0  62000.0
4      Eve  28.0  58000.0

DataFrame after backward fill:
       Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob  30.0  54000.0
2  Charlie  30.0  62000.0
3    David  28.0  62000.0
4      Eve  28.0  58000.0


  df_ffill = df.fillna(method='ffill')
  df_bfill = df.fillna(method='bfill')


# Using Interpolation for Numerical Data

In [11]:
# Interpolate missing values
df_interpolated = df.interpolate()
print("\nDataFrame after interpolation:\n", df_interpolated)


DataFrame after interpolation:
       Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob  27.5  54000.0
2  Charlie  30.0  58000.0
3    David  29.0  62000.0
4      Eve  28.0  58000.0


  df_interpolated = df.interpolate()


# Using Scikit-Learn’s Imputer for Machine Learning

In [12]:
from sklearn.impute import SimpleImputer

# Impute missing values in the Salary column with the median
imputer = SimpleImputer(strategy='median')
df[['Salary']] = imputer.fit_transform(df[['Salary']])

print("\nDataFrame after imputing 'Salary' with median:\n", df)


DataFrame after imputing 'Salary' with median:
       Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob   NaN  54000.0
2  Charlie  30.0  56000.0
3    David   NaN  62000.0
4      Eve  28.0  58000.0
