## Handling Missing Data

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Toy data set: A and D have a few gaps, B is missing everywhere, C is complete.
data_dic = dict(
    A=[1, 2, np.nan, 4, np.nan],
    B=[np.nan] * 5,
    C=list(range(11, 16)),
    D=[16, np.nan, 18, 19, 20],
)
df = pd.DataFrame(data_dic)
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0



**isnull(), isna(), notnull() -- Check for missing data in the dataset!**

In [6]:
# Element-wise mask: True wherever a value is missing (NaN).
df.isnull()

Unnamed: 0,A,B,C,D
0,False,True,False,False
1,False,True,False,True
2,True,True,False,False
3,False,True,False,False
4,True,True,False,False


In [7]:
# True counts as 1, so summing the mask gives the number of missing values per column.
df.isnull().sum()

A    2
B    5
C    0
D    1
dtype: int64

In [8]:
# A second sum() collapses the per-column counts into the total number of missing values.
df.isnull().sum().sum()

8

In [9]:
# The same check on a single column returns a boolean Series.
df['A'].isnull()

0    False
1    False
2     True
3    False
4     True
Name: A, dtype: bool

In [10]:
# Number of missing values in column A.
df['A'].isnull().sum()

2

In [11]:
# isna() produces the same mask as isnull().
df.isna()

Unnamed: 0,A,B,C,D
0,False,True,False,False
1,False,True,False,True
2,True,True,False,False
3,False,True,False,False
4,True,True,False,False


In [12]:
# Total number of missing values in the frame, this time via isna().
df.isna().sum().sum()

8

In [13]:
# Missing values in the row labelled 1 (its B and D entries are NaN).
df.loc[1].isnull().sum()

2

In [14]:
# Inverse mask: True wherever a value is present.
df.notnull()

Unnamed: 0,A,B,C,D
0,True,False,True,True
1,True,False,True,False
2,False,False,True,True
3,True,False,True,True
4,False,False,True,True


In [15]:
# (rows, columns): 5 x 4 = 20 cells, i.e. the null and non-null counts together.
df.shape

(5, 4)

In [16]:
# Number of non-missing values per column.
df.notnull().sum()

A    3
B    0
C    5
D    4
dtype: int64

In [17]:
# Total number of non-missing values in the frame.
df.notnull().sum().sum()

12

In [18]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [19]:
# Sum of column "A": sum() skips NaN by default, so the result is the same
# as if every NaN were counted as 0 (1 + 2 + 4).
df['A'].sum()

7.0

&#9758; NaN values are also ignored by `mean()`: the average is taken over the non-missing entries only.

In [20]:
# Mean of the three non-missing values in A (7 / 3); the NaN rows do not
# count towards the denominator.
df['A'].mean()

2.3333333333333335

In [21]:
# Sum across the row labelled 3; its NaN in column B is skipped (4 + 14 + 19).
df.loc[3].sum()

37.0

**dropna(), fillna() -- Cleaning / filling the missing data**

In [22]:
# Drop every row that contains at least one NaN. Column B is NaN in every
# row, so no row survives. dropna returns a new frame; df is left unchanged.
df.dropna(axis=0)

Unnamed: 0,A,B,C,D


In [23]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [24]:
# To drop columns (rather than rows) that contain NaN, pass axis=1.
# Only C has no missing values, so it is the only column kept.
df.dropna(axis=1)

Unnamed: 0,C
0,11
1,12
2,13
3,14
4,15


In [25]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


`thresh` : int, optional — the minimum number of non-NaN values a row or column must have in order to be kept.
With `thresh=3` and `axis=1`, any column that has fewer than 3 non-NaN values is dropped.

In [26]:
# Keep only the columns with at least 3 non-NaN values:
# B (0 non-NaN) is dropped; A (3), C (5) and D (4) stay.
df.dropna(thresh=3, axis=1)

Unnamed: 0,A,C,D
0,1.0,11,16.0
1,2.0,12,
2,,13,18.0
3,4.0,14,19.0
4,,15,20.0


In [27]:
# Replace every NaN with the constant 2. Returns a new frame; df is unchanged.
df.fillna(value=2)

Unnamed: 0,A,B,C,D
0,1.0,2.0,11,16.0
1,2.0,2.0,12,2.0
2,2.0,2.0,13,18.0
3,4.0,2.0,14,19.0
4,2.0,2.0,15,20.0


In [28]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


Let's fill in the missing values using the mean of the column.

In [29]:
# Fill the gaps in A with the column mean (computed over the non-missing
# values only). Returns a new Series; df itself is unchanged.
df['A'].fillna(value = df['A'].mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
4    2.333333
Name: A, dtype: float64

In [30]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [31]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in pandas >= 2.1 and emits a FutureWarning.
# Column B stays NaN (there is no valid value to propagate); df is not modified.
df.ffill()

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,16.0
2,2.0,,13,18.0
3,4.0,,14,19.0
4,4.0,,15,20.0


In [32]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in pandas >= 2.1 and emits a FutureWarning.
# The last row of A stays NaN (no value below it), as does all of column B.
df.bfill()

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,18.0
2,4.0,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [33]:
# The fill value can also be passed positionally; every NaN becomes 0.
# Still not in place: df keeps its NaNs.
df.fillna(0)

Unnamed: 0,A,B,C,D
0,1.0,0.0,11,16.0
1,2.0,0.0,12,0.0
2,0.0,0.0,13,18.0
3,4.0,0.0,14,19.0
4,0.0,0.0,15,20.0


In [34]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [35]:
from sklearn.impute import SimpleImputer

In [36]:
# Work on a copy so that assigning imputed columns does not alter df.
df2 = df.copy()
df2

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [37]:
# Constant imputation: every NaN in column A is replaced by the sentinel -1.
imputer = SimpleImputer(fill_value=-1, strategy='constant')
filled_a = imputer.fit_transform(df2[['A']])  # 2-D input -> 2-D array with one column
df2['A'] = filled_a
df2

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,-1.0,,13,18.0
3,4.0,,14,19.0
4,-1.0,,15,20.0


In [38]:
# Mean imputation on a fresh copy, so df keeps its NaNs.
imputer = SimpleImputer(strategy='mean')
df2 = df.copy()
mean_filled_a = imputer.fit_transform(df2[['A']])
df2['A'] = mean_filled_a
df2

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,2.333333,,13,18.0
3,4.0,,14,19.0
4,2.333333,,15,20.0


In [39]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [40]:
# Median imputation on a fresh copy: the median of 1, 2, 4 is 2.
imputer = SimpleImputer(strategy='median')
df2 = df.copy()
median_filled_a = imputer.fit_transform(df2[['A']])
df2['A'] = median_filled_a
df2

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,2.0,,13,18.0
3,4.0,,14,19.0
4,2.0,,15,20.0


In [41]:
# Same layout as the first frame, except the last D value is 18 instead of 20,
# so that column D has a repeated (most frequent) value.
data_dic = dict(
    A=[1, 2, np.nan, 4, np.nan],
    B=[np.nan] * 5,
    C=list(range(11, 16)),
    D=[16, np.nan, 18, 19, 18],
)
df2 = pd.DataFrame(data_dic)
df2

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,18.0


In [42]:
# Mode imputation: the gap in D is filled with its most frequent value (18).
imputer = SimpleImputer(strategy='most_frequent')
mode_filled_d = imputer.fit_transform(df2[['D']])
df2['D'] = mode_filled_d
df2

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,18.0
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,18.0


In [43]:
from sklearn.impute import KNNImputer

In [44]:
# Start again from an untouched copy of df for the KNN example.
df2 = df.copy()
df2

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [45]:
# KNN imputation: each gap is filled with the average of up to 3 neighbouring
# rows that have a value in that column, all weighted equally ("uniform").
# NOTE: fit_transform returns a NumPy array, so rebinding df2 loses the column
# labels, and the all-NaN column B is absent from the result (3 columns remain).
knn_imputer = KNNImputer(n_neighbors=3, weights="uniform")
df2 = knn_imputer.fit_transform(df2)
df2

array([[ 1.        , 11.        , 16.        ],
       [ 2.        , 12.        , 17.66666667],
       [ 2.33333333, 13.        , 18.        ],
       [ 4.        , 14.        , 19.        ],
       [ 2.33333333, 15.        , 20.        ]])

In [46]:
# Same imputation, but closer neighbours count more (inverse-distance weights).
# Fit on the original df, which still contains NaN: the df2 produced by the
# uniform-weights cell is already fully imputed, so re-imputing it would change
# nothing and the effect of weights="distance" would be invisible.
knn_imputer = KNNImputer(n_neighbors=3, weights="distance")
df2 = knn_imputer.fit_transform(df)
df2

array([[ 1.        , 11.        , 16.        ],
       [ 2.        , 12.        , 17.4       ],
       [ 2.6       , 13.        , 18.        ],
       [ 4.        , 14.        , 19.        ],
       [ 3.10526316, 15.        , 20.        ]])

In [47]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [48]:
# Fill with your own chosen value. inplace=True modifies df itself instead of
# returning a new frame, so from here on df no longer contains any NaN.
df.fillna(0, inplace=True)

In [49]:
df

Unnamed: 0,A,B,C,D
0,1.0,0.0,11,16.0
1,2.0,0.0,12,0.0
2,0.0,0.0,13,18.0
3,4.0,0.0,14,19.0
4,0.0,0.0,15,20.0
