Construct a DataFrame with missing values in multiple columns. Explore the use of `isnull()`, `dropna()`, and `fillna()`.


In [None]:
import numpy as np
import pandas as pd

# Five-person roster in which every column has at least one gap; both None
# and np.nan are used as the missing-value marker.
name_score_data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', None],
    Age=[25, np.nan, 30, 32, 29],
    City=['New York', None, 'Los Angeles', 'San Diego', 'Chicago'],
    Score=[85.5, 92.0, np.nan, 88.0, None],
)

df = pd.DataFrame(name_score_data)
print(df)

# isnull() gives a same-shaped boolean frame: True marks a missing cell.
print(df.isnull())

# dropna() with no arguments keeps only the rows that have no gaps at all.
df_no_na = df.dropna()
print(df_no_na)

# Column-specific replacements: a placeholder for text columns, the column
# mean for Age and the column median for Score.
df_filled = df.fillna(
    value=dict(
        Name='Unknown',
        Age=df['Age'].mean(),
        City='Unknown City',
        Score=df['Score'].median(),
    )
)
print(df_filled)

      Name   Age         City  Score
0    Alice  25.0     New York   85.5
1      Bob   NaN         None   92.0
2  Charlie  30.0  Los Angeles    NaN
3    David  32.0    San Diego   88.0
4     None  29.0      Chicago    NaN
    Name    Age   City  Score
0  False  False  False  False
1  False   True   True  False
2  False  False  False   True
3  False  False  False  False
4   True  False  False   True
    Name   Age       City  Score
0  Alice  25.0   New York   85.5
3  David  32.0  San Diego   88.0
      Name   Age          City  Score
0    Alice  25.0      New York   85.5
1      Bob  29.0  Unknown City   92.0
2  Charlie  30.0   Los Angeles   88.0
3    David  32.0     San Diego   88.0
4  Unknown  29.0       Chicago   88.0



Demonstrate forward fill, backward fill, and interpolation for time-indexed data.


In [11]:
import pandas as pd
import numpy as np

# Daily index covering 11 April 2025 through 21 April 2025 (eleven days),
# matching the eleven temperature readings below.
date = pd.date_range("4/11/2025", "4/21/2025", freq='D')

data = dict(
    Temperature=[30, np.nan, 32, np.nan, np.nan, 35, 36, 38, 42, np.nan, 42],
)

df = pd.DataFrame(data, index=date)
print(df)

# Forward fill: every gap takes the most recent earlier reading.
df_ffill = df.ffill()
print(df_ffill)

# Backward fill: every gap takes the next later reading.
df_bfill = df.bfill()
print(df_bfill)

# Linear interpolation (the pandas default) between neighbouring readings.
df_interp = df.interpolate(method='linear')
print(df_interp)


            Temperature
2025-04-11         30.0
2025-04-12          NaN
2025-04-13         32.0
2025-04-14          NaN
2025-04-15          NaN
2025-04-16         35.0
2025-04-17         36.0
2025-04-18         38.0
2025-04-19         42.0
2025-04-20          NaN
2025-04-21         42.0
            Temperature
2025-04-11         30.0
2025-04-12         30.0
2025-04-13         32.0
2025-04-14         32.0
2025-04-15         32.0
2025-04-16         35.0
2025-04-17         36.0
2025-04-18         38.0
2025-04-19         42.0
2025-04-20         42.0
2025-04-21         42.0
            Temperature
2025-04-11         30.0
2025-04-12         32.0
2025-04-13         32.0
2025-04-14         35.0
2025-04-15         35.0
2025-04-16         35.0
2025-04-17         36.0
2025-04-18         38.0
2025-04-19         42.0
2025-04-20         42.0
2025-04-21         42.0
            Temperature
2025-04-11         30.0
2025-04-12         31.0
2025-04-13         32.0
2025-04-14         33.0
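2025-04-15         34.0
2025-04-16         35.0
2025-04-17         36.0
2025-04-18         38.0
2025-04-19         42.0
2025-04-20         42.0
2025-04-21         42.0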


Drop rows where a subset of columns have missing values.


In [12]:
import pandas as pd
import numpy as np

# Example records: each of the four columns contains at least one gap.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', None],
    Age=[25, np.nan, 30, 22, 29],
    City=['New York', None, 'Los Angeles', 'Chicago', 'Houston'],
    Score=[85.5, 92.0, np.nan, 88.0, None],
)
df = pd.DataFrame(data)

# Only gaps in 'Name' or 'Score' remove a row; a row that is missing just
# 'Age' or 'City' is kept.
df_subset_dropped = df.dropna(how='any', subset=['Name', 'Score'])
print(df_subset_dropped)


    Name   Age      City  Score
0  Alice  25.0  New York   85.5
1    Bob   NaN      None   92.0
3  David  22.0   Chicago   88.0



Create a function that uses different fill strategies based on column dtype.


In [None]:
def smart_fill(df):
    """Return a copy of *df* with gaps filled by a dtype-dependent strategy.

    Strategy per column:

    * object / string columns -> most frequent value (the first one when
      several values tie);
    * numeric columns (booleans excluded) -> column median;
    * anything else (e.g. datetime, categorical) -> forward fill.

    Columns without missing values are left as they are. A text column that
    contains nothing but missing values has no mode and is left unchanged.
    Forward fill cannot fill a gap in the first row, so such gaps remain.

    Args:
        df: The DataFrame to fill. It is not modified.

    Returns:
        A new DataFrame with the same columns and index as *df*.
    """
    filled_copy = df.copy()
    dtype_checks = pd.api.types

    for col in filled_copy:
        series = filled_copy[col]
        if not series.isna().any():
            continue

        dtype = series.dtype
        if dtype_checks.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            # mode() is a method and returns a Series (empty when every
            # value is missing), so call it and guard before indexing.
            modes = series.mode()
            if not modes.empty:
                filled_copy[col] = series.fillna(modes.iloc[0])
        elif dtype_checks.is_numeric_dtype(dtype) and not dtype_checks.is_bool_dtype(dtype):
            # pandas' own dtype predicates also accept extension dtypes,
            # which np.issubdtype cannot interpret.
            filled_copy[col] = series.fillna(series.median())
        else:
            # fillna(method=...) is deprecated; ffill() is the replacement.
            filled_copy[col] = series.ffill()

    return filled_copy


Evaluate how missing data affects statistical summaries and plotting.



In [13]:
# Summary statistics before filling. describe() ignores missing values, so
# the per-column `count` row shows how many observations were present.
# NOTE(review): `df` here is whichever frame was assigned last (the one from
# the subset-dropna cell when the notebook runs top to bottom), whereas
# `df_filled` was built from the first cell's data -- confirm that comparing
# the two is intended.
original_summary = df.describe()
print("Original:\n", original_summary)

# Summary statistics once every gap has been replaced.
filled_summary = df_filled.describe()
print("\nFilled:\n", filled_summary)


Original:
              Age      Score
count   4.000000   3.000000
mean   26.500000  88.500000
std     3.696846   3.278719
min    22.000000  85.500000
25%    24.250000  86.750000
50%    27.000000  88.000000
75%    29.250000  90.000000
max    30.000000  92.000000

Filled:
             Age      Score
count   5.00000   5.000000
mean   29.00000  88.300000
std     2.54951   2.334524
min    25.00000  85.500000
25%    29.00000  88.000000
50%    29.00000  88.000000
75%    30.00000  88.000000
max    32.00000  92.000000


Section – Hierarchical Indexing


Create a MultiIndex from tuples and build a DataFrame indexed by it.



Slice and filter across multiple levels of a MultiIndex.



Swap and sort index levels using .swaplevel() and .sort_index().


Unstack and stack data, then reshape it back to original.



Use .groupby(level=...) on hierarchical data and apply aggregation.