# Data Munging Basics
## Treating missing values

In [1]:
import pandas as pd
import numpy as np

from pandas import DataFrame, Series

### Figuring out what data is missing

In [7]:
# np.nan is the marker for a missing entry; here it fills positions 3 and 6
series_obj = Series([
    'row 1',
    'row 2',
    'row 3',
    np.nan,
    'row 5',
    'row 6',
    np.nan,
])
series_obj

0    row 1
1    row 2
2    row 3
3      NaN
4    row 5
5    row 6
6      NaN
dtype: object

In [11]:
# Returns a boolean Series of the same length: True where the value is
# missing (NaN), False otherwise
series_obj.isnull()

0    False
1    False
2    False
3     True
4    False
5    False
6     True
dtype: bool

### Filling in for missing values

In [13]:
# Seed NumPy's global generator so the random values below are reproducible
np.random.seed(25)
# 36 standard-normal draws arranged as a 6x6 frame with default integer labels
df_object = DataFrame(data=np.reshape(np.random.randn(36), (6, 6)))
df_object

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,1.05661,-0.419678,2.294842,-2.594487,2.822756,0.680889
4,-1.577693,-1.976254,0.53334,-0.29087,-0.51352,1.982626
5,0.226001,-1.839905,1.607671,0.388292,0.399732,0.405477


In [27]:
# Set columns 0 and 2 of rows 3-5 equal to NaN
# (.loc slices by label and includes both endpoints, so row 5 is affected too)
df_object.loc[3:5, [0, 2]] = np.nan
df_object

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,,-0.419678,,-2.594487,2.822756,0.680889
4,,-1.976254,,-0.29087,-0.51352,1.982626
5,,-1.839905,,0.388292,0.399732,0.405477


In [28]:
# Fill all NaN values with 0 - this returns a new DataFrame and doesn't
# change the original df_object
df_object.fillna(0)

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,0.0,-0.419678,0.0,-2.594487,2.822756,0.680889
4,0.0,-1.976254,0.0,-0.29087,-0.51352,1.982626
5,0.0,-1.839905,0.0,0.388292,0.399732,0.405477


In [30]:
# fillna() also accepts a dict of {column label: replacement value}, so each
# column can get its own fill value (column 0 -> 1, column 2 -> 45)
df_object.fillna(value={0: 1, 2: 45})

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,1.0,-0.419678,45.0,-2.594487,2.822756,0.680889
4,1.0,-1.976254,45.0,-0.29087,-0.51352,1.982626
5,1.0,-1.839905,45.0,0.388292,0.399732,0.405477


In [33]:
# ffill() ('forward fill') replaces each NaN with the last non-null value
# above it in the same column. It is the supported replacement for
# fillna(method='ffill'), whose 'method' keyword is deprecated in recent
# pandas releases. Like fillna(), it returns a new DataFrame.
df_object.ffill()

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,2.152957,-0.419678,0.07638,-2.594487,2.822756,0.680889
4,2.152957,-1.976254,0.07638,-0.29087,-0.51352,1.982626
5,2.152957,-1.839905,0.07638,0.388292,0.399732,0.405477


### Counting missing values

In [40]:
# isnull() yields a boolean frame; summing down the rows (axis=0) counts the
# True entries per column, giving a Series of {column label: null count}
df_object.isnull().sum(axis=0)

0    3
1    0
2    3
3    0
4    0
5    0
dtype: int64

### Filtering out missing values

In [45]:
# Drop every row that contains at least one null value
# (returns a new DataFrame; df_object is left as it was)
df_object.dropna(axis=0, how='any')

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942


In [47]:
# Drop only the rows in which every value is null. No row of df_object is
# entirely null, so all six rows are kept.
df_object.dropna(how='all')

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,,-0.419678,,-2.594487,2.822756,0.680889
4,,-1.976254,,-0.29087,-0.51352,1.982626
5,,-1.839905,,0.388292,0.399732,0.405477
