In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Toy frame for the dropna demos: row 2 is entirely missing,
# rows 0 and 3 are each missing a single value.
small_dataset = pd.DataFrame(
    dict(
        col1=[1, 2, np.nan, np.nan, 5, 6],
        col2=[7, 8, np.nan, 10, 11, 12],
        col3=[np.nan, 14, np.nan, 16, 17, 18],
    )
)

small_dataset

Unnamed: 0,col1,col2,col3
0,1.0,7.0,
1,2.0,8.0,14.0
2,,,
3,,10.0,16.0
4,5.0,11.0,17.0
5,6.0,12.0,18.0


### Drop any row with a missing value

In [3]:
# how='any' (the default, spelled out here) discards every row
# that contains at least one missing value.
all_drop = small_dataset.dropna(axis=0, how='any')

# Display the result
all_drop

Unnamed: 0,col1,col2,col3
1,2.0,8.0,14.0
4,5.0,11.0,17.0
5,6.0,12.0,18.0


### Drop only the rows in which all values are missing

In [4]:
# how='all' removes a row only when every one of its values is missing;
# partially filled rows are kept.
all_row = small_dataset.dropna(axis=0, how='all')

# Display the result
all_row

Unnamed: 0,col1,col2,col3
0,1.0,7.0,
1,2.0,8.0,14.0
3,,10.0,16.0
4,5.0,11.0,17.0
5,6.0,12.0,18.0


### Drop only the rows with a missing value in column 3

In [5]:
# Restrict the missing-value check to col3: a row is dropped only if col3 is NaN,
# regardless of what the other columns hold.
only3_drop = small_dataset.dropna(how='any', subset=['col3'])

# Display the result
only3_drop

Unnamed: 0,col1,col2,col3
1,2.0,8.0,14.0
3,,10.0,16.0
4,5.0,11.0,17.0
5,6.0,12.0,18.0


### Drop only the rows with missing values in column 3 or column 1

In [6]:
# With how='any', a row is dropped when either col1 or col3 is missing;
# col2 is not consulted.
only3or1_drop = small_dataset.dropna(how='any', subset=['col1', 'col3'])

# Display the result
only3or1_drop

Unnamed: 0,col1,col2,col3
1,2.0,8.0,14.0
4,5.0,11.0,17.0
5,6.0,12.0,18.0


### Imputation Methods and Resources

One of the most common ways to handle missing values is imputation. Imputation means that you fill in a substitute value wherever a value was originally missing.

It is very common to impute in the following ways:
1. Impute the **mean** of a column.<br><br>

2. If you are working with categorical data or a variable with outliers, then use the **mode** of the column.<br><br>

3. Impute 0, a very small number, or a very large number to differentiate missing values from other values.<br><br>

4. Use KNN to impute values based on features that are most similar.<br><br>

In [7]:
# Example frame for the imputation demos:
#   A - numeric with gaps, B - numeric and complete, C - entirely missing,
#   D - booleans with gaps, E - strings with gaps.
df = pd.DataFrame(
    dict(
        A=[np.nan, 2, np.nan, 0, 7, 10, 15],
        B=[3, 4, 5, 1, 2, 3, 5],
        C=[np.nan] * 7,
        D=[np.nan, True, np.nan, False, True, False, np.nan],
        E=['Yes', 'No', 'Maybe', np.nan, np.nan, 'Yes', np.nan],
    )
)

df

Unnamed: 0,A,B,C,D,E
0,,3,,,Yes
1,2.0,4,,True,No
2,,5,,,Maybe
3,0.0,1,,False,
4,7.0,2,,True,
5,10.0,3,,False,Yes
6,15.0,5,,,


In [None]:
# Column C contains only NaN values, so it carries no information and can be dropped.
# In general, any column that is entirely NaN or holds a single constant value can be dropped.

In [8]:
# Reassign rather than mutate in place, and tolerate 'C' already being absent,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['C'], errors='ignore')

In [9]:
# Show the frame after column C has been removed
df

Unnamed: 0,A,B,D,E
0,,3,,Yes
1,2.0,4,True,No
2,,5,,Maybe
3,0.0,1,False,
4,7.0,2,True,
5,10.0,3,False,Yes
6,15.0,5,,


In [12]:
# Null values in numerical columns can be filled with mean
# Lambda function and apply method
fill_mean = lambda col: col.fillna(col.mean())
df[['A','B']]=df[['A','B']].apply(fill_mean, axis=0)

In [13]:
# Show the frame after mean-imputing columns A and B
df

Unnamed: 0,A,B,D,E
0,6.8,3,,Yes
1,2.0,4,True,No
2,6.8,5,,Maybe
3,0.0,1,False,
4,7.0,2,True,
5,10.0,3,False,Yes
6,15.0,5,,


In [14]:
# For categorical, we use mode
fill_mode = lambda col: col.fillna(col.mode()[0]) #We take the first mode only incase of multimodal distribution for this example
df[['D','E']]=df[['D','E']].apply(fill_mode, axis = 0)

In [15]:
# Show the frame after mode-imputing columns D and E
df

Unnamed: 0,A,B,D,E
0,6.8,3,False,Yes
1,2.0,4,True,No
2,6.8,5,False,Maybe
3,0.0,1,False,Yes
4,7.0,2,True,Yes
5,10.0,3,False,Yes
6,15.0,5,False,Yes


These methods can be a great first step to get your models off the ground, but be aware that imputing values this way can introduce bias into your models, which may be detrimental.