### 01 Fraction of null values in each column

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Toy product table used throughout: one list of six entries per column.
# np.nan marks a missing entry (one each in size/gender/price, two in weight).
data = dict(
    size=['XL', 'L', 'M', np.nan, 'M', 'M'],
    color=['red', 'green', 'blue', 'green', 'red', 'green'],
    gender=['female', 'male', np.nan, 'female', 'female', 'male'],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=['yes', 'no', 'yes', 'no', 'yes', 'no'],
)

In [4]:
# Each dict key becomes a column; rows get the default 0..5 integer index.
df = pd.DataFrame(data=data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [5]:
# Number of missing entries in each column.
df.isna().sum()

size      1
color     0
gender    1
price     1
weight    2
bought    0
dtype: int64

In [6]:
# Number of rows in the frame.
df.shape[0]

6

In [7]:
# Share of missing entries per column on a 0-1 scale, rounded to two decimals.
# The mean of the boolean null mask equals (null count) / (row count).
df.isnull().mean().round(2)

size      0.17
color     0.00
gender    0.17
price     0.17
weight    0.33
bought    0.00
dtype: float64

### 02 Fill missing values with the column mean

In [8]:
from sklearn.impute import SimpleImputer

# Mean imputation: each NaN in the column is replaced by the mean of the
# column's observed values.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer.fit(df[['weight']])

# Double brackets select a one-column DataFrame (2-D) rather than a Series.
df[['weight']] = imputer.transform(df[['weight']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


### 03 Which value was filled in

In [9]:
# statistics_ holds the fitted fill value per imputed column; index 0 is the
# only column fitted here (weight), i.e. the mean that replaced the NaNs.
imputer.statistics_[0]

415.0

### 04 Fill missing values with a constant

In [10]:
# Constant imputation: every NaN is replaced by the fixed fill_value.
imputer = SimpleImputer(
    strategy='constant',
    fill_value=99.0,
    missing_values=np.nan,
)

# Fit and apply in one step on the one-column price frame.
df[['price']] = imputer.fit_transform(df[['price']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [11]:
# With strategy='constant', statistics_ reports the configured fill_value
# for the single fitted column (price).
imputer.statistics_[0]

99.0

### 05 Fill missing values with the most frequent value

In [12]:
# Mode imputation: NaN is replaced by the column's most common value
# ('M' for size). This strategy is applied here to a string column.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer.fit(df[['size']])

df[['size']] = imputer.transform(df[['size']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,M,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


### 06 Keep rows with non-null values & compute the mean of numeric columns

In [13]:
# Start again from the untouched toy table: the cells above imputed
# weight, price and size in place, so the NaNs need to be restored.
data = dict(
    size=['XL', 'L', 'M', np.nan, 'M', 'M'],
    color=['red', 'green', 'blue', 'green', 'red', 'green'],
    gender=['female', 'male', np.nan, 'female', 'female', 'male'],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=['yes', 'no', 'yes', 'no', 'yes', 'no'],
)

# One column per key, default integer row index.
df = pd.DataFrame(data=data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [14]:
# Keep only the rows whose weight is present. notnull() is the element-wise
# negation of isnull(), so the `~` inversion is not needed.
df[df['weight'].notnull()]

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
4,M,red,female,79.0,410.0,yes


In [16]:
# Among rows with a known weight, average every float column.
# mean() skips remaining NaN entries (e.g. the missing price in row 2).
(
    df[df['weight'].notnull()]
    .select_dtypes(include='float')
    .mean()
)

price     122.333333
weight    415.000000
dtype: float64

### 07 Select object-dtype columns & fill missing values

In [18]:
# Constant imputation with a string placeholder, for the text columns.
imputer = SimpleImputer(
    strategy='constant',
    fill_value='empty',
    missing_values=np.nan,
)

# Labels of the object-dtype (string) columns.
cols = df.select_dtypes(include='object').columns
cols

Index(['size', 'color', 'gender', 'bought'], dtype='object')

In [25]:
# Replace NaN in every object-dtype column with the imputer's fill value
# ('empty'); the numeric columns are not selected and keep their NaNs.
# Alternative spelling left by the author:
#     df[cols] = imputer.fit_transform(df[cols])
# NOTE(review): the reason for preferring .loc here is not recorded — confirm.
df.loc[:,cols] = imputer.fit_transform(df[cols])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,empty,,300.0,yes
3,empty,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no
