# Rank and Sort

In [2]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd
from numpy.random import randn

In [4]:
# Sorting by index: start from a Series whose labels are deliberately
# out of alphabetical order (C, A, B) so that sorting has a visible effect.
ser1 = Series(range(3), index=list('CAB'))

# Show
ser1

C    0
A    1
B    2
dtype: int64

In [5]:
# sort_index() returns a new Series ordered by index label (A, B, C);
# with the default arguments ser1 itself is left unchanged.
ser1.sort_index()

A    1
B    2
C    0
dtype: int64

In [6]:
# Sort a Series by its values rather than its index.
# Series.order() was removed from pandas; sort_values() is its replacement
# and, like order(), returns a new sorted Series.
ser1.sort_values()

C    0
A    1
B    2
dtype: int64

In [10]:
# Let's see how ranking works, starting from 10 random draws.
# randn is already imported in the notebook's first cell, so it is not
# re-imported here.
ser2 = Series(randn(10))

# Show
ser2

0    0.524553
1   -1.987343
2   -0.883902
3   -0.875829
4    0.216089
5    0.744837
6   -0.761465
7    0.792798
8   -0.144650
9    0.100972
dtype: float64

In [11]:
# rank() gives each value its position in ascending order (1 = smallest).
# The index is untouched, so this shows where each entry would land if
# the Series were sorted.
ser2.rank()

0     8
1     1
2     2
3     3
4     7
5     9
6     4
7    10
8     5
9     6
dtype: float64

In [13]:
# Let's sort it by value now.
# Series.sort() was removed from pandas, so rebind ser2 to the sorted
# result of sort_values(); later cells keep seeing the sorted Series.
ser2 = ser2.sort_values()

# Show
ser2

1   -1.987343
2   -0.883902
3   -0.875829
6   -0.761465
8   -0.144650
9    0.100972
4    0.216089
0    0.524553
5    0.744837
7    0.792798
dtype: float64

In [15]:
# After sorting, let's check the rank again and see if it makes sense:
# the ranks should now simply count up in the displayed order.
ser2.rank()

1     1
2     2
3     3
6     4
8     5
9     6
4     7
0     8
5     9
7    10
dtype: float64

In [16]:
#On the left column we see the original index value and on the right we see its rank!

# Missing Data

In [2]:
# Missing data is a very common problem when analyzing datasets.
# Start with a small Series that contains one missing entry (np.nan).
data = Series([
    'one',
    'two',
    np.nan,
    'four',
])

In [3]:
# Show the Series; the np.nan entry is displayed as NaN
data

0     one
1     two
2     NaN
3    four
dtype: object

In [5]:
# Find the missing values: isnull() returns a boolean Series that is
# True wherever the entry is missing
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
# We can simply drop the NaN entries; the surviving values keep their
# original index labels (0, 1, 3)
data.dropna()

0     one
1     two
3    four
dtype: object

In [14]:
# In a DataFrame we need to be a little more careful, because a missing
# value can sit in any row/column combination.
dframe = DataFrame([
    [1, 2, 3],
    [np.nan, 5, 6],
    [7, np.nan, 9],
    [np.nan, np.nan, np.nan],
])

In [15]:
# Show the frame; missing entries are rendered as NaN
dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [16]:
# With its default arguments, dropna() removes every row that contains
# at least one NaN; the result is stored under a new name.
clean_dframe = dframe.dropna()

In [17]:
# Show the cleaned frame: only the fully populated row remains
clean_dframe

Unnamed: 0,0,1,2
0,1,2,3


In [12]:
#Note that every row in which an NA occurred was dropped entirely

In [18]:
# We can also choose to drop only the rows in which every value is missing
dframe.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3
1,,5.0,6
2,7.0,,9


In [22]:
# Or we can drop columns (axis=1) instead of rows when they contain missing data
dframe.dropna(axis=1)

# This drops every column, since each column contains at least one NaN

0
1
2
3


In [26]:
# dropna can also apply a threshold on how much data a row must contain.

# Build an example frame whose rows hold 3, 3, 2 and 1 non-NaN values
dframe2 = DataFrame([
    [1, 2, 3, np.nan],
    [2, np.nan, 5, 6],
    [np.nan, 7, np.nan, 9],
    [1, np.nan, np.nan, np.nan],
])

# Show
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [28]:
# Drop any rows that don't have at least 2 non-NaN data points
dframe2.dropna(thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0


In [29]:
# Drop rows that don't have at least 3 non-NaN data points
dframe2.dropna(thresh=3)

Unnamed: 0,0,1,2,3
0,1,2.0,3,
1,2,,5,6.0


In [30]:
# We can also fill every NaN with a single replacement value (here 1)
dframe2.fillna(1)

Unnamed: 0,0,1,2,3
0,1,2,3,1
1,2,1,5,6
2,1,7,1,9
3,1,1,1,1


In [33]:
# Can also fill in different values for different columns:
# the dict maps column label -> fill value for that column
dframe2.fillna({0:0,1:1,2:2,3:3})

Unnamed: 0,0,1,2,3
0,1,2,3,3
1,2,1,5,6
2,0,7,2,9
3,1,1,2,3


In [34]:
# Note that the original dframe2 is untouched: the fillna calls above
# returned new objects and did not modify it
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [35]:
# If we want to modify the existing object instead, pass inplace=True.
# After this cell dframe2 itself no longer contains any NaN.
dframe2.fillna(0,inplace=True)

In [36]:
# Now let's see the frame: the former NaN positions hold 0
dframe2

Unnamed: 0,0,1,2,3
0,1,2,3,0
1,2,0,5,6
2,0,7,0,9
3,1,0,0,0


# Index Hierarchy

In [2]:
# Index hierarchy: pandas lets an axis carry several index levels at once.
# Passing a list of two label arrays builds a two-level index:
# outer level 1/2, inner level a/b/c.
ser = Series(
    np.random.randn(6),
    index=[[1] * 3 + [2] * 3, list('abc') * 2],
)

In [3]:
# Show the Series with multiple index levels; repeated outer labels
# are printed only once per group
ser

1  a    0.596760
   b    0.348350
   c    0.708594
2  a    0.757104
   b   -1.933524
   c   -0.432283
dtype: float64

In [4]:
# We can check the multiple levels: the index is a MultiIndex made of
# (outer, inner) label tuples
ser.index

MultiIndex([(1, 'a'),
            (1, 'b'),
            (1, 'c'),
            (2, 'a'),
            (2, 'b'),
            (2, 'c')],
           )

In [5]:
# Now we can select specific subsets: picking outer label 1 returns the
# inner-level Series (a, b, c) that sits under it
ser.loc[1]

a    0.596760
b    0.348350
c    0.708594
dtype: float64

In [6]:
# We can also select from an internal index level: take every outer
# label (:) and only the inner label 'a'
ser.loc[:, 'a']

1    0.596760
2    0.757104
dtype: float64

In [7]:
# We can also create DataFrames from a Series with multiple levels:
# unstack() turns the inner index level (a, b, c) into columns and keeps
# the outer level (1, 2) as the row index.
# Note: this rebinds the name `dframe` used earlier in the notebook.
dframe = ser.unstack()

# Show
dframe

Unnamed: 0,a,b,c
1,0.59676,0.34835,0.708594
2,0.757104,-1.933524,-0.432283


In [8]:
# We can also go back to a Series: unstack() on this DataFrame moves the
# column labels into the outer index level. Note the level order
# (a/b/c outside, 1/2 inside) is swapped relative to the original `ser`.
# NOTE(review): dframe.stack() looks like the exact inverse of ser.unstack() — confirm.
dframe.unstack()

a  1    0.596760
   2    0.757104
b  1    0.348350
   2   -1.933524
c  1    0.708594
   2   -0.432283
dtype: float64

In [28]:
# Multiple index levels work on both axes of a DataFrame as well:
# rows are keyed by (letter, number), columns by (city, temperature)
dframe2 = DataFrame(
    np.arange(16).reshape(4, 4),
    index=[list('aabb'), [1, 2] * 2],
    columns=[['NY', 'NY', 'LA', 'SF'], ['cold', 'hot', 'hot', 'cold']],
)

dframe2

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,SF
Unnamed: 0_level_1,Unnamed: 1_level_1,cold,hot,hot,cold
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [31]:
# We can also give these index levels names, which makes later
# level-based operations easier to read

# Name the row index levels (outer, inner)
dframe2.index.names = ['INDEX_1','INDEX_2']

# Name the column levels (outer, inner)
dframe2.columns.names = ['Cities','Temp']

dframe2

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [33]:
# We can also interchange level orders (note the axis=1 for columns):
# 'Temp' becomes the outer column level and 'Cities' the inner one
dframe2.swaplevel('Cities','Temp',axis=1)

Unnamed: 0_level_0,Temp,cold,hot,hot,cold
Unnamed: 0_level_1,Cities,NY,NY,LA,SF
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [34]:
# We can also sort by a particular index level.
# DataFrame.sortlevel() was removed from pandas; sort_index(level=...) is
# the replacement. level=1 is the second row level, so the rows are
# ordered by it first and the remaining level breaks ties.
dframe2.sort_index(level=1)

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
b,1,8,9,10,11
a,2,4,5,6,7
b,2,12,13,14,15


In [35]:
#Note the change in sorting: the DataFrame's rows are now ordered by the INDEX_2 level

In [37]:
# We can also perform operations on particular levels.
# sum(level=...) was removed in pandas 2.0; grouping on the level is the
# replacement. Transposing first lets us group on the *column* level
# 'Temp' (adding up the 'cold' and the 'hot' columns for each row), and
# the final transpose restores the original row/column orientation.
dframe2.T.groupby(level='Temp').sum().T

Unnamed: 0_level_0,Temp,cold,hot
INDEX_1,INDEX_2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,3,3
a,2,11,11
b,1,19,19
b,2,27,27
