In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small Series of strings with one missing entry (np.nan) to demonstrate
# missing-value detection.
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avacado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avacado
dtype: object

In [3]:
# Overwrite the first element with Python's None; the next cell shows that
# pandas reports it as missing alongside NaN.
string_data[0]=None
string_data

0         None
1    artichoke
2          NaN
3      avacado
dtype: object

In [4]:
# Boolean mask that is True where a value is missing; both None (row 0) and
# NaN (row 2) are flagged.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

# Filtering Out Missing Data

In [5]:
# Numeric Series with two NaN entries, used below to compare ways of
# filtering out missing data.
data = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [6]:
# Drop the missing entries; the surviving values keep their original index
# labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
# Same result as dropna(), expressed as explicit boolean-mask selection.
data.loc[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# 4x3 frame mixing complete, partially missing and fully missing rows so the
# different dropna() options can be compared.
data = pd.DataFrame(
    [
        [1, 6.5, 3],
        [1, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [9]:
# With default arguments, dropna() removes every row that contains at least
# one NaN; only row 0 is complete.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [11]:
# how='all' removes only the rows in which every value is NaN (row 2 here).
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [14]:
# Column-wise drop: every column holds at least one NaN, so no columns
# survive and only the row index is left.
data.dropna(axis=1)

0
1
2
3


In [15]:
# Add a new column labelled 4 that is NaN in every row, to demonstrate
# dropping all-NaN columns in the next cell.
data[4]=np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [17]:
# Remove only the columns that are NaN in every row (column 4 here).
data.dropna(how='all', axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
# 7x3 frame of standard-normal draws (unseeded, so values differ per run).
df = pd.DataFrame(data=np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.049953,-2.416611,1.34281
1,0.705383,-0.401429,-0.851785
2,0.395402,-3.120526,-0.375263
3,-1.574417,-0.847011,-0.189355
4,-1.09574,2.031274,-0.674486
5,-0.305869,0.500576,0.372614
6,-0.898648,0.058904,-0.472863


In [20]:
# Inject missing values: rows 0-1 of column 2 and rows 0-3 of column 1.
df.iloc[0:2, 2] = np.nan
df.iloc[0:4, 1] = np.nan
df

Unnamed: 0,0,1,2
0,0.049953,,
1,0.705383,,
2,0.395402,,-0.375263
3,-1.574417,,-0.189355
4,-1.09574,2.031274,-0.674486
5,-0.305869,0.500576,0.372614
6,-0.898648,0.058904,-0.472863


In [21]:
# Keep only the rows without any NaN (rows 4-6).
df.dropna()

Unnamed: 0,0,1,2
4,-1.09574,2.031274,-0.674486
5,-0.305869,0.500576,0.372614
6,-0.898648,0.058904,-0.472863


In [22]:
# thresh=2 keeps rows that have at least two non-NaN values, so rows 0 and 1
# (one valid value each) are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.395402,,-0.375263
3,-1.574417,,-0.189355
4,-1.09574,2.031274,-0.674486
5,-0.305869,0.500576,0.372614
6,-0.898648,0.058904,-0.472863


In [31]:
# thresh=2 with axis=1 keeps every column that has at least two non-NaN values.
# All three columns qualify (column 1 has 3 valid values, column 2 has 5), so
# nothing is removed; thresh is working as documented. A larger threshold
# (e.g. 4) would be needed to drop column 1.
df.dropna(thresh=2,axis=1)

Unnamed: 0,0,1,2
0,0.049953,,
1,0.705383,,
2,0.395402,,-0.375263
3,-1.574417,,-0.189355
4,-1.09574,2.031274,-0.674486
5,-0.305869,0.500576,0.372614
6,-0.898648,0.058904,-0.472863


# Filling Missing Data

In [32]:
# Re-display df, which still contains the NaN values, as the starting point
# for the fill examples.
df

Unnamed: 0,0,1,2
0,0.049953,,
1,0.705383,,
2,0.395402,,-0.375263
3,-1.574417,,-0.189355
4,-1.09574,2.031274,-0.674486
5,-0.305869,0.500576,0.372614
6,-0.898648,0.058904,-0.472863


In [33]:
# Replace every NaN with the constant 0; the result is displayed, not stored.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.049953,0.0,0.0
1,0.705383,0.0,0.0
2,0.395402,0.0,-0.375263
3,-1.574417,0.0,-0.189355
4,-1.09574,2.031274,-0.674486
5,-0.305869,0.500576,0.372614
6,-0.898648,0.058904,-0.472863


In [34]:
# Per-column fill values: NaNs in column 1 become 0.5, NaNs in column 2 become -1.
df.fillna(value={1: 0.5, 2: -1})

Unnamed: 0,0,1,2
0,0.049953,0.5,-1.0
1,0.705383,0.5,-1.0
2,0.395402,0.5,-0.375263
3,-1.574417,0.5,-0.189355
4,-1.09574,2.031274,-0.674486
5,-0.305869,0.500576,0.372614
6,-0.898648,0.058904,-0.472863


In [35]:
# Persist the fill by rebinding df to the filled result instead of using
# inplace=True, which returns None and cannot be chained.
df = df.fillna(0)

In [36]:
# Show df after the fill in the previous cell: the former NaNs are now 0.
df

Unnamed: 0,0,1,2
0,0.049953,0.0,0.0
1,0.705383,0.0,0.0
2,0.395402,0.0,-0.375263
3,-1.574417,0.0,-0.189355
4,-1.09574,2.031274,-0.674486
5,-0.305869,0.500576,0.372614
6,-0.898648,0.058904,-0.472863


In [38]:
# Fresh 6x3 frame of random normals with NaN tails: rows 4-5 of column 2 and
# rows 2-5 of column 1, used for the forward-fill examples.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[4:, 2] = np.nan
df.iloc[2:, 1] = np.nan
df

Unnamed: 0,0,1,2
0,1.116467,1.486893,-1.849539
1,0.805761,0.014923,0.329204
2,0.498605,,0.650945
3,-0.484249,,1.177936
4,-0.757612,,
5,-0.608702,,


In [40]:
# Forward-fill: propagate the last valid value in each column downwards.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in pandas 2.1 and removed in later releases.
df.ffill()

Unnamed: 0,0,1,2
0,1.116467,1.486893,-1.849539
1,0.805761,0.014923,0.329204
2,0.498605,0.014923,0.650945
3,-0.484249,0.014923,1.177936
4,-0.757612,0.014923,1.177936
5,-0.608702,0.014923,1.177936


In [41]:
# Forward-fill at most two consecutive NaNs per gap; later NaNs in a longer
# run stay missing. Uses DataFrame.ffill() because fillna's `method` argument
# is deprecated in pandas 2.1 and removed in later releases.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,1.116467,1.486893,-1.849539
1,0.805761,0.014923,0.329204
2,0.498605,0.014923,0.650945
3,-0.484249,0.014923,1.177936
4,-0.757612,,1.177936
5,-0.608702,,1.177936


In [42]:
# Rebuild the numeric Series with two NaNs to demonstrate filling with a
# computed statistic.
data = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [43]:
# Fill the gaps with the mean of the non-missing values (mean() skips NaN).
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64