## Pandas

### Series

A Series is very similar to a NumPy array; in fact, it is built on top of one. Unlike an array, however, a Series can have axis labels, so it can be indexed by those labels instead of only by integer position, and it can hold any Python object.

In [42]:
import numpy as np
import pandas as pd

In [43]:
# NumPy array of sample values
arr = np.array([1, 2, 3])

# One label per value
labels = 'a b c'.split()

# Plain Python list of sample values
# NOTE(review): this name shadows the builtin `list` for the rest of the notebook
list = [10, 20, 30]

# Dictionary mapping each label to a value
# NOTE(review): this name shadows the builtin `dir` for the rest of the notebook
dir = dict(a=10, b=20, c=30)


In [44]:
# Build one Series with the default integer index and one with explicit labels
ser1 = pd.Series(arr)
ser2 = pd.Series(data=arr, index=labels)
print(ser1, ser2, sep='\n')

# Look up single values: by the default integer index, then by label
print(ser1[0], ser2['a'], sep='\n')

0    1
1    2
2    3
dtype: int32
a    1
b    2
c    3
dtype: int32
1
1


In [45]:
# Creating a Series from a plain Python list; the default integer index is used
pd.Series(list)

0    10
1    20
2    30
dtype: int64

In [46]:
# Creating a Series from a dictionary; the keys become the index labels
pd.Series(dir)

a    10
b    20
c    30
dtype: int64

In [91]:
# Index labels do not have to be unique
ser3 = pd.Series(data=[1, 2, 3, 4], index='A B D D'.split())
# Equivalent: pass the index positionally as an explicit list
ser4 = pd.Series([1, 2, 3, 4], ['A', 'B', 'D', 'D'])
print(ser3, ser4, sep='\n')
# Selecting a repeated label returns every matching entry
ser3['D']

A    1
B    2
D    3
D    4
dtype: int64
A    1
B    2
D    3
D    4
dtype: int64


D    3
D    4
dtype: int64

# DataFrames

A DataFrame is a two-dimensional data structure with the following features:
1. Columns can be of different data types.
2. Size is mutable.
3. Labeled axes (both rows and columns).
4. Arithmetic operations can be performed on rows and columns.

In [48]:
# Seed NumPy's global random generator so the random frames below are reproducible
np.random.seed(101)

In [49]:
# Build a 5x4 frame of standard-normal samples with labelled rows (A-E) and columns (W-Z)
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Selection and Indexing

In [50]:
# Selecting a single column by name returns a Series
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [51]:
# Passing a list of column names returns a DataFrame with just those columns
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [52]:
# Each column of a DataFrame is a Series
print (type(df['W']))

<class 'pandas.core.series.Series'>


In [53]:
# Creating a new column as the element-wise sum of three existing columns
df['new'] = df['Y']+df['Z']+df['W']

In [54]:
# Display the frame to confirm the 'new' column was added
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,4.118645
B,0.651118,-0.319318,-0.848077,0.605965,0.409006
C,-2.018168,0.740122,0.528813,-0.589001,-2.078355
D,0.188695,-0.758872,-0.933237,0.955057,0.210515
E,0.190794,1.978757,2.605967,0.683509,3.48027


In [55]:
# Removing the new column. By default, drop() returns a new frame and leaves df itself unchanged.
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [56]:
# df still has the 'new' column, because the drop above was not in place
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,4.118645
B,0.651118,-0.319318,-0.848077,0.605965,0.409006
C,-2.018168,0.740122,0.528813,-0.589001,-2.078355
D,0.188695,-0.758872,-0.933237,0.955057,0.210515
E,0.190794,1.978757,2.605967,0.683509,3.48027


In [57]:
# To remove the column from df itself, inplace=True has to be passed explicitly.
# NOTE: this mutates df, so re-running this cell raises KeyError once 'new' is gone.
df.drop('new',axis=1,inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [58]:
# Locating a row by its index label; the row is returned as a Series
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [59]:
# Locating a row by its integer position (0 is the first row)
df.iloc[0]

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [60]:
# Selecting a single element from the data frame
# Note- df.loc['A','Z'] selects by row and column label in one step. df.loc['A'] on its own returns the
#       row as a Series, which can then be indexed again to reach the element.
print (df.loc['A','Z'])
print ('or')
print (df.loc['A']['Z'])

0.5038257538223936
or
0.5038257538223936


In [61]:
# Selecting a subset of the data frame: a list of row labels and a list of column labels
df.loc[['A','C'],['X','Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
C,0.740122,0.528813


### Conditional Selection

In [62]:
# Show the current frame before applying conditions to it
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [63]:
# An element-wise comparison returns a frame of booleans with the same shape
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [64]:
# When a condition is applied to the whole data frame, the shape is kept and the cells where the condition is False become NaN
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [65]:
# Step 1: boolean mask telling which rows have a positive X
boolseries = df['X'] > 0
# Step 2: keep only the rows where the mask is True
booldf = df[boolseries]
# Step 3: keep only columns X and Z of the filtered rows
resdf = booldf[['X', 'Z']]

# Report each intermediate result, separated by a ruler line
print('BoolSeries:', boolseries, "========================================", sep='\n')
print('Booldf:', booldf, "========================================", sep='\n')
print('Resdf:', resdf, sep='\n')

BoolSeries:
A     True
B    False
C     True
D    False
E     True
Name: X, dtype: bool
Booldf:
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
C -2.018168  0.740122  0.528813 -0.589001
E  0.190794  1.978757  2.605967  0.683509
Resdf:
          X         Z
A  0.628133  0.503826
C  0.740122 -0.589001
E  1.978757  0.683509


In [66]:
# The same filter-rows-then-select-columns operation written as a single expression
df[df['X']>0][['X','Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
C,0.740122,-0.589001
E,1.978757,0.683509


In [67]:
# Combining two conditions
# Note- when combining boolean Series, '&' has to be used instead of 'and', with each condition in parentheses

df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [68]:
# Reset to the default integer index; the old index becomes an ordinary 'index' column.
# This returns a new frame; to change df itself, pass inplace=True.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [69]:
# Add a column of string labels, one per row
newcol = ['AA', 'BB', 'CC', 'DD', 'EE']
df['New'] = newcol
df

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,AA
B,0.651118,-0.319318,-0.848077,0.605965,BB
C,-2.018168,0.740122,0.528813,-0.589001,CC
D,0.188695,-0.758872,-0.933237,0.955057,DD
E,0.190794,1.978757,2.605967,0.683509,EE


In [70]:
# Use the 'New' column as the index. This returns a new frame; to change df itself, pass inplace=True.
df.set_index('New')

Unnamed: 0_level_0,W,X,Y,Z
New,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,2.70685,0.628133,0.907969,0.503826
BB,0.651118,-0.319318,-0.848077,0.605965
CC,-2.018168,0.740122,0.528813,-0.589001
DD,0.188695,-0.758872,-0.933237,0.955057
EE,0.190794,1.978757,2.605967,0.683509


In [71]:
# df is unchanged: it keeps its original index, and 'New' is still an ordinary column
df

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,AA
B,0.651118,-0.319318,-0.848077,0.605965,BB
C,-2.018168,0.740122,0.528813,-0.589001,CC
D,0.188695,-0.758872,-0.933237,0.955057,DD
E,0.190794,1.978757,2.605967,0.683509,EE


### Multi-Indexing and Index Hierarchy

In [77]:
# Index levels: the outer level names the group, the inner level numbers the rows within it
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
# Pair the two levels into (group, number) tuples. Unpacking is used instead of
# list(zip(...)) because the name `list` was rebound to a data list in an earlier
# cell, so calling list(...) raises TypeError on a fresh Restart & Run All.
index_tuples = [*zip(outside, inside)]
hier_index = pd.MultiIndex.from_tuples(index_tuples)
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [78]:
# 6x2 frame of random values whose rows are labelled by the two-level index
mdf = pd.DataFrame(
    data=np.random.randn(6, 2),
    index=hier_index,
    columns='A B'.split(),
)
mdf

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [79]:
# Accessing a group in the data frame; the result is indexed by the inner level only
mdf.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [81]:
# Accessing a particular row in the group: select the outer group first, then the inner label
mdf.loc['G1'].loc[1]

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [83]:
# By default the index levels have no names
mdf.index.names

FrozenList([None, None])

In [85]:
# Assigning names to the two index levels (outer level first, then inner)
mdf.index.names=['Group','Num']
mdf

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [86]:
# The xs (cross-section) function is a convenient way to select from a multi-level (hierarchical) data frame.
mdf.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [88]:
# A tuple key selects on both levels at once and returns that single row
mdf.xs(('G1',1))

A    0.302665
B    1.693723
Name: (G1, 1), dtype: float64

In [89]:
# Just accessing the Num 1 rows from every group in the data frame
mdf.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502
