# Pandas

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np


## Filtering out Missing data

### dropna

In [2]:
from numpy import nan as NA

In [3]:
data = Series([1, NA, 3.5, NA, 7])

In [4]:
# = data[data.notnull()]

data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data =DataFrame([[1., 5.6, 3.], [1, NA, NA], 
                   [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,5.6,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### how='all'
solo las filas con todas las entradas iguales a NA

In [13]:
data.dropna(how = 'all')

Unnamed: 0,0,1,2
0,1.0,5.6,3.0
1,1.0,,
3,,6.5,3.0


In [14]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,5.6,3.0


In [15]:
# Droping columns

data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,5.6,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
data.dropna(axis =1, how='all')

Unnamed: 0,0,1,2
0,1.0,5.6,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### thresh
keep rows containing a certain number of observations

In [18]:
df = DataFrame(np.random.rand(7, 3))
df

Unnamed: 0,0,1,2
0,0.403989,0.381908,0.850146
1,0.681034,0.255539,0.707488
2,0.99275,0.430531,0.517943
3,0.796843,0.68809,0.833332
4,0.628024,0.184821,0.661435
5,0.47228,0.166759,0.178144
6,0.218249,0.818273,0.992288


In [19]:
df.loc[:4, 1] = NA; df.loc[:2, 2] = NA

In [20]:
df

Unnamed: 0,0,1,2
0,0.403989,,
1,0.681034,,
2,0.99275,,
3,0.796843,,0.833332
4,0.628024,,0.661435
5,0.47228,0.166759,0.178144
6,0.218249,0.818273,0.992288


In [21]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
3,0.796843,,0.833332
4,0.628024,,0.661435
5,0.47228,0.166759,0.178144
6,0.218249,0.818273,0.992288


## Filling Missing Data

### fillna

In [22]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.403989,0.0,0.0
1,0.681034,0.0,0.0
2,0.99275,0.0,0.0
3,0.796843,0.0,0.833332
4,0.628024,0.0,0.661435
5,0.47228,0.166759,0.178144
6,0.218249,0.818273,0.992288


In [29]:
# use a different fill value for each column

df.fillna({1: 0.5, 3: -1})

Unnamed: 0,0,1,2
0,0.571867,0.5,
1,0.687845,0.5,
2,0.998269,0.5,
3,0.032319,0.5,0.534793
4,0.67423,0.5,0.921974
5,0.819432,0.417009,0.265412
6,0.391873,0.667832,0.675651


In [32]:
# fillna returns a new object, but you can 
#modify the existing object

_ = df.fillna(0, inplace=True)

In [33]:
df

Unnamed: 0,0,1,2
0,0.571867,0.0,0.0
1,0.687845,0.0,0.0
2,0.998269,0.0,0.0
3,0.032319,0.0,0.534793
4,0.67423,0.0,0.921974
5,0.819432,0.417009,0.265412
6,0.391873,0.667832,0.675651


### interpolation methods

In [36]:
df = DataFrame(np.random.rand(6, 3))
df.ix[2:,1] = NA; df.ix[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.44003,0.942909,0.3907
1,0.338224,0.657735,0.457466
2,0.59611,,0.124546
3,0.585392,,0.69231
4,0.153977,,
5,0.775935,,


In [37]:
df.fillna(method ='ffill')

Unnamed: 0,0,1,2
0,0.44003,0.942909,0.3907
1,0.338224,0.657735,0.457466
2,0.59611,0.657735,0.124546
3,0.585392,0.657735,0.69231
4,0.153977,0.657735,0.69231
5,0.775935,0.657735,0.69231


In [38]:
df.fillna(method ='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.44003,0.942909,0.3907
1,0.338224,0.657735,0.457466
2,0.59611,0.657735,0.124546
3,0.585392,0.657735,0.69231
4,0.153977,,0.69231
5,0.775935,,0.69231


In [23]:
data = Series([1, NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64