In [3]:
import pandas as pd

# Toy product table; None marks the cells that are deliberately left missing.
data = dict(
    id=[1, 2, 3, 4, 5],
    name=["Apple", "Banana", "Mango", None, "Orange"],
    price=[100, 80, None, 60, 120],
    stock=[50, None, 30, 20, None],
    category=["Fruit", "Fruit", None, "Fruit", "Fruit"],
)

# Build the sample frame and show it as the cell's output.
df = pd.DataFrame(data)
df

Unnamed: 0,id,name,price,stock,category
0,1,Apple,100.0,50.0,Fruit
1,2,Banana,80.0,,Fruit
2,3,Mango,,30.0,
3,4,,60.0,20.0,Fruit
4,5,Orange,120.0,,Fruit


# 1. isnull() and notnull()
- `isnull()` → detects missing values: returns True wherever a cell is NaN/None
- `notnull()` → the opposite: returns True wherever a cell holds a value

In [4]:
# Boolean mask: True wherever a cell is missing (NaN/None)
missing_mask = df.isna()
print(missing_mask)

# Per-column number of missing cells (True counts as 1 when summed)
print("Count missing values = \n", missing_mask.sum())

# Per-column number of populated cells
print("Count non-missing values = \n", df.notna().sum())

      id   name  price  stock  category
0  False  False  False  False     False
1  False  False  False   True     False
2  False  False   True  False      True
3  False   True  False  False     False
4  False  False  False   True     False
Count missing values = 
 id          0
name        1
price       1
stock       2
category    1
dtype: int64
Count non-missing values = 
 id          5
name        4
price       4
stock       3
category    4
dtype: int64


In [5]:
# Work on a copy so the sample frame itself is never modified.
df_copy = df.copy()

# Keep only the rows that are fully populated.
df_drop_row = df_copy.dropna(axis="index", how="any")
print(df_drop_row)

# Keep only the columns that are fully populated.
df_drop_col = df_copy.dropna(axis="columns", how="any")
print(df_drop_col)

# Remove a row only when every one of its cells is missing
# (the sample has no such row, so the frame is printed unchanged).
df_drop_all = df_copy.dropna(axis="index", how="all")
print(df_drop_all)

   id   name  price  stock category
0   1  Apple  100.0   50.0    Fruit
   id
0   1
1   2
2   3
3   4
4   5
   id    name  price  stock category
0   1   Apple  100.0   50.0    Fruit
1   2  Banana   80.0    NaN    Fruit
2   3   Mango    NaN   30.0     None
3   4    None   60.0   20.0    Fruit
4   5  Orange  120.0    NaN    Fruit


In [6]:
# Show the untouched sample frame first, for comparison with the fills below.
print(df)

# Constant fill: every missing cell, in every column, becomes 0
# (this also puts 0 into the text columns "name" and "category").
df_copy = df.copy()
df_fill_const = df_copy.fillna(value=0)
print(df_fill_const)

# Mean fill: only "price" is imputed; the other columns keep their gaps.
df_copy = df.copy()
df_fill_mean = df_copy.copy()
price_mean = df_fill_mean["price"].mean()
df_fill_mean["price"] = df_fill_mean["price"].fillna(price_mean)
print(df_fill_mean)

# Median fill: only "stock" is imputed; the other columns keep their gaps.
df_fill_median = df.copy()
stock_median = df_fill_median["stock"].median()
df_fill_median["stock"] = df_fill_median["stock"].fillna(stock_median)
print(df_fill_median)

# Forward fill: each gap takes the closest earlier value in the same column.
df_copy = df.copy()
df_ffill = df_copy.ffill()
print(df_ffill)

# Backward fill: each gap takes the closest later value in the same column.
df_copy = df.copy()
df_bfill = df_copy.bfill()
print(df_bfill)


   id    name  price  stock category
0   1   Apple  100.0   50.0    Fruit
1   2  Banana   80.0    NaN    Fruit
2   3   Mango    NaN   30.0     None
3   4    None   60.0   20.0    Fruit
4   5  Orange  120.0    NaN    Fruit
   id    name  price  stock category
0   1   Apple  100.0   50.0    Fruit
1   2  Banana   80.0    0.0    Fruit
2   3   Mango    0.0   30.0        0
3   4       0   60.0   20.0    Fruit
4   5  Orange  120.0    0.0    Fruit
   id    name  price  stock category
0   1   Apple  100.0   50.0    Fruit
1   2  Banana   80.0    NaN    Fruit
2   3   Mango   90.0   30.0     None
3   4    None   60.0   20.0    Fruit
4   5  Orange  120.0    NaN    Fruit
   id    name  price  stock category
0   1   Apple  100.0   50.0    Fruit
1   2  Banana   80.0   30.0    Fruit
2   3   Mango    NaN   30.0     None
3   4    None   60.0   20.0    Fruit
4   5  Orange  120.0   30.0    Fruit
   id    name  price  stock category
0   1   Apple  100.0   50.0    Fruit
1   2  Banana   80.0   50.0    Fruit
2

In [7]:
import os

# Location of the sales CSV. The default is the path the notebook was
# originally written against; set the SALES_DATA_CSV environment variable to
# run the notebook on another machine without editing this cell.
SALES_CSV_PATH = os.environ.get(
    "SALES_DATA_CSV",
    "C:/Users/ansh/PycharmProjects/python_learning/day_17/Sample_Data/sales_data_sample.csv",
)

# NOTE: this rebinds `df` — from here on it is the sales data, not the small
# sample frame built at the top of the notebook.
# NOTE(review): the USA rows show TERRITORY as missing. If the CSV stores the
# literal text "NA" there, read_csv's default NA parsing is what turns it into
# NaN — confirm against the raw file before treating those cells as missing.
df = pd.read_csv(SALES_CSV_PATH, encoding="latin1")
df.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [8]:
# Print the sales frame's summary: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      302 non-null    object 


# 📝 Practice Questions
    🔹 Easy

    Find total missing values in each column.

    Show all rows where STATE is missing.

    Count how many orders have missing POSTALCODE.

    Check which rows have missing ADDRESSLINE2.

    Show all orders where no data is missing.

In [20]:
# 1. Find total missing values in each column.
# isna() marks missing cells as True; summing down the rows counts them per column.
df.isna().sum(axis="index")

ORDERNUMBER            0
QUANTITYORDERED        0
PRICEEACH              0
ORDERLINENUMBER        0
SALES                  0
ORDERDATE              0
STATUS                 0
QTR_ID                 0
MONTH_ID               0
YEAR_ID                0
PRODUCTLINE            0
MSRP                   0
PRODUCTCODE            0
CUSTOMERNAME           0
PHONE                  0
ADDRESSLINE1           0
ADDRESSLINE2        2521
CITY                   0
STATE               1486
POSTALCODE            76
COUNTRY                0
TERRITORY           1074
CONTACTLASTNAME        0
CONTACTFIRSTNAME       0
DEALSIZE               0
dtype: int64

In [22]:
# 2. Show all rows where STATE is missing.
# Select rows through a boolean mask that is True where STATE has no value.
df.loc[df['STATE'].isna()]

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
1,10121,34,81.35,5,2765.90,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508,France,EMEA,Da Cunha,Daniel,Medium
6,10180,29,86.13,9,2497.77,11/11/2003 0:00,Shipped,4,11,2003,...,"184, chausse de Tournai",,Lille,,59000,France,EMEA,Rance,Martine,Small
7,10188,48,100.00,1,5512.32,11/18/2003 0:00,Shipped,4,11,2003,...,"Drammen 121, PR 744 Sentrum",,Bergen,,N 5804,Norway,EMEA,Oeztan,Veysel,Medium
9,10211,41,100.00,14,4708.44,1/15/2004 0:00,Shipped,1,1,2004,...,"25, rue Lauriston",,Paris,,75016,France,EMEA,Perrier,Dominique,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2816,10327,37,86.74,4,3209.38,11/10/2004 0:00,Resolved,4,11,2004,...,Vinb'ltet 34,,Kobenhavn,,1734,Denmark,EMEA,Petersen,Jytte,Medium
2818,10350,20,100.00,15,2244.40,12/2/2004 0:00,Shipped,4,12,2004,...,"C/ Moralzarzal, 86",,Madrid,,28034,Spain,EMEA,Freyre,Diego,Small
2819,10373,29,100.00,1,3978.51,1/31/2005 0:00,Shipped,1,1,2005,...,Torikatu 38,,Oulu,,90110,Finland,EMEA,Koskitalo,Pirkko,Medium
2820,10386,43,100.00,4,5417.57,3/1/2005 0:00,Resolved,1,3,2005,...,"C/ Moralzarzal, 86",,Madrid,,28034,Spain,EMEA,Freyre,Diego,Medium


In [28]:
# 3. Count how many orders have missing POSTALCODE.
# value_counts() tallies the mask; the True entry is the number of rows
# whose POSTALCODE is missing, the False entry the number that have one.
df['POSTALCODE'].isna().value_counts()

POSTALCODE
False    2747
True       76
Name: count, dtype: int64

In [36]:
# 5. Show all orders where no data is missing.
# Dropping every row that has at least one missing cell leaves exactly the
# rows in which all columns are populated.
df.dropna(axis="index", how="any")

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
10,10223,37,100.00,1,3965.66,2/20/2004 0:00,Shipped,1,2,2004,...,636 St Kilda Road,Level 3,Melbourne,Victoria,3004,Australia,APAC,Ferguson,Peter,Medium
21,10361,20,72.55,13,1451.00,12/17/2004 0:00,Shipped,4,12,2004,...,"Monitor Money Building, 815 Pacific Hwy",Level 6,Chatswood,NSW,2067,Australia,APAC,Huxley,Adrian,Small
40,10270,21,100.00,9,4905.39,7/19/2004 0:00,Shipped,3,7,2004,...,"Monitor Money Building, 815 Pacific Hwy",Level 6,Chatswood,NSW,2067,Australia,APAC,Huxley,Adrian,Medium
47,10347,30,100.00,1,3944.70,11/29/2004 0:00,Shipped,4,11,2004,...,636 St Kilda Road,Level 3,Melbourne,Victoria,3004,Australia,APAC,Ferguson,Peter,Medium
51,10391,24,100.00,4,2416.56,3/9/2005 0:00,Shipped,1,3,2005,...,201 Miller Street,Level 15,North Sydney,NSW,2060,Australia,APAC,O'Hara,Anna,Small
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2667,10120,43,76.00,14,3268.00,4/29/2003 0:00,Shipped,2,4,2003,...,636 St Kilda Road,Level 3,Melbourne,Victoria,3004,Australia,APAC,Ferguson,Peter,Medium
2673,10223,26,67.20,15,1747.20,2/20/2004 0:00,Shipped,1,2,2004,...,636 St Kilda Road,Level 3,Melbourne,Victoria,3004,Australia,APAC,Ferguson,Peter,Small
2685,10361,44,100.00,10,5001.92,12/17/2004 0:00,Shipped,4,12,2004,...,"Monitor Money Building, 815 Pacific Hwy",Level 6,Chatswood,NSW,2067,Australia,APAC,Huxley,Adrian,Medium
2764,10361,35,100.00,11,4277.35,12/17/2004 0:00,Shipped,4,12,2004,...,"Monitor Money Building, 815 Pacific Hwy",Level 6,Chatswood,NSW,2067,Australia,APAC,Huxley,Adrian,Medium


    🔹 Medium (5)

    Drop all rows where CUSTOMERNAME is missing.

    Drop columns which contain any missing values.

    Fill missing POSTALCODE with "UNKNOWN".

    Fill missing QUANTITYORDERED with column mean.

    Forward fill (ffill) missing STATE values.


In [10]:
# 1. Drop all rows where TERRITORY is missing.
# (The exercise text names CUSTOMERNAME, but that column has no missing values
# in this data; TERRITORY presumably stands in so the drop has a visible effect.)
df_copy = df.copy()
# The result is only displayed, not stored: df_copy keeps all of its rows.
df_copy.dropna(subset=['TERRITORY'])

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
1,10121,34,81.35,5,2765.90,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508,France,EMEA,Da Cunha,Daniel,Medium
6,10180,29,86.13,9,2497.77,11/11/2003 0:00,Shipped,4,11,2003,...,"184, chausse de Tournai",,Lille,,59000,France,EMEA,Rance,Martine,Small
7,10188,48,100.00,1,5512.32,11/18/2003 0:00,Shipped,4,11,2003,...,"Drammen 121, PR 744 Sentrum",,Bergen,,N 5804,Norway,EMEA,Oeztan,Veysel,Medium
9,10211,41,100.00,14,4708.44,1/15/2004 0:00,Shipped,1,1,2004,...,"25, rue Lauriston",,Paris,,75016,France,EMEA,Perrier,Dominique,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2816,10327,37,86.74,4,3209.38,11/10/2004 0:00,Resolved,4,11,2004,...,Vinb'ltet 34,,Kobenhavn,,1734,Denmark,EMEA,Petersen,Jytte,Medium
2818,10350,20,100.00,15,2244.40,12/2/2004 0:00,Shipped,4,12,2004,...,"C/ Moralzarzal, 86",,Madrid,,28034,Spain,EMEA,Freyre,Diego,Small
2819,10373,29,100.00,1,3978.51,1/31/2005 0:00,Shipped,1,1,2005,...,Torikatu 38,,Oulu,,90110,Finland,EMEA,Koskitalo,Pirkko,Medium
2820,10386,43,100.00,4,5417.57,3/1/2005 0:00,Resolved,1,3,2005,...,"C/ Moralzarzal, 86",,Madrid,,28034,Spain,EMEA,Freyre,Diego,Medium


In [11]:
# 2. Drop columns which contain any missing values.
# Only the summary of the reduced frame is printed; df itself is not modified.
df.dropna(axis=1, how="any").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  CITY              2823 non-null   object 


In [12]:
# 3. Fill missing POSTALCODE with "UNKNOWN".
# fillna returns a new Series, which is displayed here; df_copy is left unchanged.
df_copy['POSTALCODE'].fillna("UNKNOWN")

0         10022
1         51100
2         75508
3         90003
4       UNKNOWN
         ...   
2818      28034
2819      90110
2820      28034
2821      31000
2822      51003
Name: POSTALCODE, Length: 2823, dtype: object

In [13]:
# 4. Fill missing QUANTITYORDERED with column mean
# (the column has no missing values in this data, so the displayed Series
# is identical to the original column).
quantity_mean = df['QUANTITYORDERED'].mean()
df['QUANTITYORDERED'].fillna(quantity_mean)

0       30
1       34
2       41
3       45
4       49
        ..
2818    20
2819    29
2820    43
2821    34
2822    47
Name: QUANTITYORDERED, Length: 2823, dtype: int64

In [14]:
# 5. Forward fill (ffill) missing STATE values.
# Each missing STATE takes the value of the closest earlier row, regardless of
# that row's country; the result is displayed only, df_copy is not modified.
df_copy.loc[:, 'STATE'].ffill()

0       NY
1       NY
2       NY
3       CA
4       CA
        ..
2818    NY
2819    NY
2820    NY
2821    NY
2822    MA
Name: STATE, Length: 2823, dtype: object


    🔹 Hard (5)

    Fill missing SALES with QUANTITYORDERED * PRICEEACH.

    Fill missing DEALSIZE with "Medium".

    Replace missing MSRP with column median.

    Backward fill (bfill) missing CITY values.

    Drop rows where both PRICEEACH and SALES are missing.

In [15]:
# 1. Fill missing SALES with QUANTITYORDERED * PRICEEACH.
df_copy = df.copy()
# Row-wise product; fillna aligns it on the index and uses it only where SALES is missing.
computed_sales = df_copy['QUANTITYORDERED'] * df_copy['PRICEEACH']
df_copy['SALES'].fillna(computed_sales)

0       2871.00
1       2765.90
2       3884.34
3       3746.70
4       5205.27
         ...   
2818    2244.40
2819    3978.51
2820    5417.57
2821    2116.16
2822    3079.44
Name: SALES, Length: 2823, dtype: float64

In [16]:
# 2. Fill missing DEALSIZE with "Medium".
# The filled Series is displayed only; df_copy itself is not modified.
df_copy['DEALSIZE'].fillna("Medium")

0        Small
1        Small
2       Medium
3       Medium
4       Medium
         ...  
2818     Small
2819    Medium
2820    Medium
2821     Small
2822    Medium
Name: DEALSIZE, Length: 2823, dtype: object

In [17]:
# 3. Replace missing MSRP with column median.
import numpy as np  # no longer used in this cell; kept in case later cells rely on `np`

# Series.median() skips missing values by default. np.median() returns NaN as
# soon as the column contains a NaN, which would turn this fill into a no-op
# in exactly the case it is meant to handle.
msrp_median = df_copy['MSRP'].median()
# The filled Series is displayed only; df_copy itself is not modified.
df_copy['MSRP'].fillna(value=msrp_median)

0       95
1       95
2       95
3       95
4       95
        ..
2818    54
2819    54
2820    54
2821    54
2822    54
Name: MSRP, Length: 2823, dtype: int64

In [18]:
# 4. Backward fill (bfill) missing CITY values.
# Each missing CITY would take the value of the closest later row; the result
# is displayed only, df_copy is not modified.
df_copy.loc[:, 'CITY'].bfill()

0                 NYC
1               Reims
2               Paris
3            Pasadena
4       San Francisco
            ...      
2818           Madrid
2819             Oulu
2820           Madrid
2821         Toulouse
2822           Boston
Name: CITY, Length: 2823, dtype: object

In [19]:
# 5. Drop rows where either PRICEEACH or STATE is missing.
# how="any" removes a row as soon as one of the listed columns is missing.
# NOTE(review): the exercise text asks for rows where *both* columns are
# missing, which would be how="all" — confirm which behaviour is intended.
df_copy.dropna(subset=['PRICEEACH', 'STATE'], how="any").info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       1337 non-null   int64  
 1   QUANTITYORDERED   1337 non-null   int64  
 2   PRICEEACH         1337 non-null   float64
 3   ORDERLINENUMBER   1337 non-null   int64  
 4   SALES             1337 non-null   float64
 5   ORDERDATE         1337 non-null   object 
 6   STATUS            1337 non-null   object 
 7   QTR_ID            1337 non-null   int64  
 8   MONTH_ID          1337 non-null   int64  
 9   YEAR_ID           1337 non-null   int64  
 10  PRODUCTLINE       1337 non-null   object 
 11  MSRP              1337 non-null   int64  
 12  PRODUCTCODE       1337 non-null   object 
 13  CUSTOMERNAME      1337 non-null   object 
 14  PHONE             1337 non-null   object 
 15  ADDRESSLINE1      1337 non-null   object 
 16  ADDRESSLINE2      250 non-null    object 
 17  