In [3]:
#Missing Data
import pandas as pd
df = pd.DataFrame([[1, 2, None, 3], [4, None, 5, 6],
[7, 8, 9, 10], [None, None, None, None]])
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,,3.0
1,4.0,,5.0,6.0
2,7.0,8.0,9.0,10.0
3,,,,


In [4]:
df.fillna(0)

Unnamed: 0,0,1,2,3
0,1.0,2.0,0.0,3.0
1,4.0,0.0,5.0,6.0
2,7.0,8.0,9.0,10.0
3,0.0,0.0,0.0,0.0


In [6]:
df.fillna(method='pad') # this is equivalent to both method='ffill' and .ffill()

Unnamed: 0,0,1,2,3
0,1.0,2.0,,3.0
1,4.0,2.0,5.0,6.0
2,7.0,8.0,9.0,10.0
3,7.0,8.0,9.0,10.0


In [7]:
df.fillna(method='pad') # this is equivalent to both method='ffill' and .ffill()

Unnamed: 0,0,1,2,3
0,1.0,2.0,,3.0
1,4.0,2.0,5.0,6.0
2,7.0,8.0,9.0,10.0
3,7.0,8.0,9.0,10.0


In [9]:
#Fill using another DataFrame
import numpy as np
df2 = pd.DataFrame(np.arange(100, 116).reshape(4, 4))
df2

Unnamed: 0,0,1,2,3
0,100,101,102,103
1,104,105,106,107
2,108,109,110,111
3,112,113,114,115


In [10]:
df.fillna(df2) # takes the corresponding cells in df2 to fill df

Unnamed: 0,0,1,2,3
0,1.0,2.0,102.0,3.0
1,4.0,105.0,5.0,6.0
2,7.0,8.0,9.0,10.0
3,112.0,113.0,114.0,115.0


In [12]:
#Dropping missing values
df = pd.DataFrame([[1, 2, None, 3], [4, None, 5, 6],
[7, 8, 9, 10], [None, None, None, None]])
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,,3.0
1,4.0,,5.0,6.0
2,7.0,8.0,9.0,10.0
3,,,,


In [13]:
df.dropna()


Unnamed: 0,0,1,2,3
2,7.0,8.0,9.0,10.0


In [15]:
#Drop rows if all values in that row are missing
df.dropna(how='all')

Unnamed: 0,0,1,2,3
0,1.0,2.0,,3.0
1,4.0,,5.0,6.0
2,7.0,8.0,9.0,10.0


In [18]:
#Interpolation
df = pd.DataFrame({'A':[1,2,np.nan,3,np.nan],
 'B':[1.2,7,3,0,8]})
df['C'] = df.A.interpolate()
df

Unnamed: 0,A,B,C
0,1.0,1.2,1.0
1,2.0,7.0,2.0
2,,3.0,2.5
3,3.0,0.0,3.0
4,,8.0,3.0


In [20]:
df['D'] = df.A.interpolate(method='linear', order=1)
df

Unnamed: 0,A,B,C,D
0,1.0,1.2,1.0,1.0
1,2.0,7.0,2.0,2.0
2,,3.0,2.5,2.5
3,3.0,0.0,3.0,3.0
4,,8.0,3.0,3.0


In [22]:
#Checking for missing values
ser = pd.Series([1, 2, np.nan, 4])
pd.isnull(ser)


0    False
1    False
2     True
3    False
dtype: bool

In [23]:
# False so you should avoid comparison against np.nan
ser == np.nan

0    False
1    False
2    False
3    False
dtype: bool

In [24]:
ser.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [25]:
ser.notnull() 

0     True
1     True
2    False
3     True
dtype: bool

In [38]:
#MultiIndex
df = pd.DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,1.536298,2.376138,0.070923
1,-2.154933,-1.409536,0.300088
2,1.117405,-0.18248,1.220598
3,0.70405,0.121536,-0.062144
4,-0.667707,-0.231293,-0.007683
5,-0.747939,2.203363,0.76638


In [40]:
df.set_index(['A','B'], inplace=True)


In [41]:
print(df.columns)


Index(['C'], dtype='object')


In [42]:
df.index.get_level_values('A')

Float64Index([  1.536298177248333,  -2.154933430993559,  1.1174054757510847,
               0.7040502843123734, -0.6677066545395384,  -0.747938653242027],
             dtype='float64', name='A')

In [43]:
df.index.get_level_values(level=1)


Float64Index([   2.376138232646186,  -1.4095356988359553, -0.18247992121939566,
               0.12153628422823842, -0.23129302292532186,
                2.2033634315330257],
             dtype='float64', name='B')

In [44]:
df.loc[(df.index.get_level_values('A') > 0.5) & (df.index.get_level_values('A') <
2.1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,C
A,B,Unnamed: 2_level_1
1.536298,2.376138,0.070923
1.117405,-0.18248,1.220598
0.70405,0.121536,-0.062144


In [47]:
#Iterate over DataFrame with MultiIndex
df = pd.DataFrame({'a':[1,1,1,2,2,3],'b':[4,4,5,5,6,7,],'c':[10,11,12,13,14,15]})
df

Unnamed: 0,a,b,c
0,1,4,10
1,1,4,11
2,1,5,12
3,2,5,13
4,2,6,14
5,3,7,15


In [49]:
df.set_index(['a','b'], inplace=True)

KeyError: "None of ['a', 'b'] are in the columns"

In [50]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,c
a,b,Unnamed: 2_level_1
1,4,10
1,4,11
1,5,12
2,5,13
2,6,14
3,7,15


In [51]:
for idx, data in df.groupby(level=0):
    print('---')
    print(data)


---
      c
a b    
1 4  10
  4  11
  5  12
---
      c
a b    
2 5  13
  6  14
---
      c
a b    
3 7  15


In [52]:
for idx, data in df.groupby(level='b'):
    print('---')
    print(data)

---
      c
a b    
1 4  10
  4  11
---
      c
a b    
1 5  12
2 5  13
---
      c
a b    
2 6  14
---
      c
a b    
3 7  15


In [54]:
df = pd.DataFrame([['one', 'A', 100], ['two', 'A', 101], ['three', 'A', 102],
 ['one', 'B', 103], ['two', 'B', 104], ['three', 'B', 105]],
 columns=['c1', 'c2', 'c3'])
df

Unnamed: 0,c1,c2,c3
0,one,A,100
1,two,A,101
2,three,A,102
3,one,B,103
4,two,B,104
5,three,B,105


In [58]:
df_indexed= df.set_index(['c1', 'c2'])

In [56]:
df.set_index(['c1', 'c2']).sort_index()


Unnamed: 0_level_0,Unnamed: 1_level_0,c3
c1,c2,Unnamed: 2_level_1
one,A,100
one,B,103
three,A,102
three,B,105
two,A,101
two,B,104


In [59]:
df_indexed.loc['one']

Unnamed: 0_level_0,c3
c2,Unnamed: 1_level_1
A,100
B,103


In [60]:
 df_indexed.loc['one', 'A']

c3    100
Name: (one, A), dtype: int64

In [84]:
#pd.DataFrame.apply
df = pd.DataFrame(np.random.randint(0,100,size = (7,2)),
 columns = ['fst','snd'])
df

Unnamed: 0,fst,snd
0,37,12
1,39,18
2,10,40
3,16,43
4,93,98
5,52,3
6,79,70


In [85]:
df.apply(np.sqrt)

Unnamed: 0,fst,snd
0,6.082763,3.464102
1,6.244998,4.242641
2,3.162278,6.324555
3,4.0,6.557439
4,9.643651,9.899495
5,7.211103,1.732051
6,8.888194,8.3666


In [86]:
df.apply(np.sum, axis=1)

0     49
1     57
2     50
3     59
4    191
5     55
6    149
dtype: int64

In [87]:
df.apply(np.sum)


fst    326
snd    284
dtype: int64