In [None]:
pip install pandas




In [None]:
import pandas as pd

In [None]:
# Build the sample data used throughout this notebook.

# Target dataframe (NaN marks a missing entry):
# -------------------
# |  A  |  B  |  C  |
# -------------------
# |  1  |  6  | 11  |
# |  2  | NaN | 12  |
# | NaN |  8  | 11  |
# |  4  | NaN | NaN |
# |  5  | 10  | 11  |
# -------------------

# Each keyword is one column; a `None` entry is stored as NaN in the resulting frame.
df = pd.DataFrame(dict(
    A=[1, 2, None, 4, 5],
    B=[6, None, 8, None, 10],
    C=[11, 12, 11, None, 11],
))

df

Unnamed: 0,A,B,C
0,1.0,6.0,11.0
1,2.0,,12.0
2,,8.0,11.0
3,4.0,,
4,5.0,10.0,11.0


## Identifying and Counting Missing Values:

In [None]:
# Locate the missing values.
# `pd.isna(df)` produces a boolean DataFrame with the same shape as `df`:
# True where the entry is missing (NaN), False where a value is present.
missing = pd.isna(df)

# Show the boolean mask (True = missing, False = not missing).
missing

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,False,False
3,False,True,True
4,False,False,False


In [None]:
# Step 1: Count missing values per column.
# Summing the boolean mask down the rows (axis=0) treats True as 1, so the result is a
# Series indexed by column name whose values are the number of missing entries per column.
missing_counts_per_column = df.isna().sum(axis=0)

# Show the per-column counts.
missing_counts_per_column

A    1
B    2
C    1
dtype: int64

In [None]:
# Step 2: Count total missing values in the entire DataFrame.
# `df.isna()` marks each missing entry as True; the first `.sum()` counts them per column
# and the second `.sum()` adds the per-column counts into one total.
# (Summing `df` directly would add up the data values instead of counting the NaNs.)
missing_counts = df.isna().sum().sum()

# Show the total number of missing values across the entire DataFrame.
missing_counts


np.float64(81.0)

## Methods to Deal with Missing Values:

In [None]:
# Step 3: Handle missing values

# Option 1: Drop every row that contains a missing value.
# Dropping discards data, so check that the loss is acceptable for your use case first.
# axis=0 selects rows and how='any' drops a row as soon as one of its entries is NaN;
# `df` itself is left untouched because dropna returns a new DataFrame.
df_no_missing = df.dropna(axis=0, how='any')

# Show the rows that survived.
df_no_missing


Unnamed: 0,A,B,C
0,1.0,6.0,11.0
4,5.0,10.0,11.0


In [None]:
# Option 2: Imputing (filling) missing values
# 2.1 - Mean Imputation (for numerical columns)
# Replace missing values in a specific column with the mean of that column.
# Here, we calculate the mean of column 'A' and use it to fill the missing values in that column.

# Step 4: Calculate the mean of column 'A' (Series.mean skips NaN values by default)
mean = df['A'].mean()

# Step 5: Fill missing values in column 'A' with the mean computed in Step 4.
# fillna returns a new Series, which is assigned back to the column so `df` is updated.
df['A'] = df['A'].fillna(mean)

df

Unnamed: 0,A,B,C
0,1.0,6.0,11.0
1,2.0,,12.0
2,3.0,8.0,11.0
3,4.0,,
4,5.0,10.0,11.0


In [None]:
# 2.2 - Median Imputation (for numerical columns)
# Similar to mean imputation, but here we replace missing values with the median instead of the mean.
# The median is often a better choice when there are outliers, as it is more robust to extreme values.

# Step 6: Calculate the median of column 'B' (Series.median skips NaN values by default)
median = df['B'].median()

# Step 7: Fill missing values in column 'B' with the median computed in Step 6.
# The fill value must be the number itself, not the uncalled `median` method.
# The result is assigned back to the column instead of calling fillna(..., inplace=True)
# on `df['B']`, because an in-place fill on a selected column is not guaranteed to update `df`.
df['B'] = df['B'].fillna(median)

df

Unnamed: 0,A,B,C
0,1.0,6.0,11.0
1,2.0,8.0,12.0
2,3.0,8.0,11.0
3,4.0,8.0,
4,5.0,10.0,11.0
