In [1]:
import pandas
pandas.__version__

'0.22.0'

## 3.1 Introducing Pandas Objects

#### 1. Series object 2. DataFrame object 3. Index object

## i. The Pandas Series Object

In [2]:
import numpy as np
import pandas as pd

In [5]:
# A Series pairs a one-dimensional sequence of values with an index; when no
# index is supplied, pandas assigns the default integer positions 0..n-1.
data = pd.Series(data=[12, 22, 23, 42])
data

0    12
1    22
2    23
3    42
dtype: int64

In [6]:
data.values

array([12, 22, 23, 42])

In [9]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
data[1]

22

In [11]:
data[1:3]

1    22
2    23
dtype: int64

In [12]:
data[0:3]

0    12
1    22
2    23
dtype: int64

In [18]:
data = pd.Series([1,2,3,4], index=['a','b','c','d'])
data

a    1
b    2
c    3
d    4
dtype: int64

In [19]:
data['b']

2

In [21]:
# Index labels need not be contiguous or in order: any integers can be used.
# Integer labels are used here (instead of the strings '6', '2', ...) so the
# example really demonstrates a non-sequential *integer* index; the displayed
# output is the same.
data = pd.Series([1, 2, 3, 4], index=[6, 2, 4, 9])
data

6    1
2    2
4    3
9    4
dtype: int64

### Series as specialized dictionary

In [144]:
# A Series built from a dict: keys become the index labels, values become the data.
# In the output below the labels appear in sorted key order, not insertion order.
population_dict = {'Delhi': 15000, 'Mumbai':25000, 'Banglore':20000, 'Chennai':10000}
# NOTE(review): `pouplation` is a misspelling of "population"; the name is kept
# unchanged because later cells reference it.
pouplation = pd.Series(population_dict)
pouplation

Banglore    20000
Chennai     10000
Delhi       15000
Mumbai      25000
dtype: int64

In [44]:
pouplation['Delhi']

15000

In [45]:
# The result is empty because label slices follow the order of the index, and in
# the index shown above (Banglore, Chennai, Delhi, Mumbai) 'Delhi' comes AFTER
# 'Chennai'. Slicing in index order, i.e. pouplation['Chennai':'Delhi'], would
# return both rows.
pouplation['Delhi':'Chennai']

Series([], dtype: int64)

### Constructing Series objects

In [36]:
pd.Series([11,12,11,14]) #data can be a list or NumPy array #integer sequence is default index

0    11
1    12
2    11
3    14
dtype: int64

In [38]:
pd.Series(5, index=[10,20,30]) #data can be a scalar 

10    5
20    5
30    5
dtype: int64

In [40]:
# data can be a dictionary: the keys become the index. The output below shows the
# keys in sorted order. NOTE(review): this ordering may depend on the pandas /
# Python version in use (newer releases may keep insertion order) — verify.
pd.Series({2:'a', 3:'b', 1:'c'})

1    c
2    a
3    b
dtype: object

## ii. The Pandas DataFrame Object

DataFrame can be thought of either as a generalization of a NumPy array, or as a specialization of a Python dictionary. 

### DataFrame as a generalized NumPy array

In [48]:
area_dict = {'Delhi':100, 'Mumbai':150, 'Banglore':130, 'Chennai':200}
area = pd.Series(area_dict)
area

Banglore    130
Chennai     200
Delhi       100
Mumbai      150
dtype: int64

In [49]:
states = pd.DataFrame({'Area':area, 'Population':pouplation})
states

Unnamed: 0,Area,Population
Banglore,130,20000
Chennai,200,10000
Delhi,100,15000
Mumbai,150,25000


In [50]:
states.index

Index(['Banglore', 'Chennai', 'Delhi', 'Mumbai'], dtype='object')

In [51]:
states.columns

Index(['Area', 'Population'], dtype='object')

### DataFrame as specialized dictionary

In [153]:
states['Area']

Banglore    130
Chennai     200
Delhi       100
Mumbai      150
Name: Area, dtype: int64

### Constructing DataFrame objects

A Pandas DataFrame can be constructed in a variety of ways

#### From a single Series object



In [143]:
pd.DataFrame(pouplation, columns=['Population'])

Unnamed: 0,Population
Banglore,20000
Chennai,10000
Delhi,15000
Mumbai,25000


#### From a list of dicts

In [63]:
# Each dict is one row: its keys become column labels, and any column a row
# does not mention is left empty (NaN) for that row.
data = [
    {1: 'a', 2: 'b', 3: 'c'},
    {4: 'd', 5: 'e', 6: 'f'},
    {5: 'g', 6: 'h'},
]
pd.DataFrame(data)

Unnamed: 0,1,2,3,4,5,6
0,a,b,c,,,
1,,,,d,e,f
2,,,,,g,h


#### From a dictionary of Series objects

In [64]:
pd.DataFrame({"population":pouplation, "area":area})

Unnamed: 0,area,population
Banglore,130,20000
Chennai,200,10000
Delhi,100,15000
Mumbai,150,25000


#### From a two-dimensional NumPy array

In [65]:
pd.DataFrame(np.random.rand(3,2), columns=['foo','oof'])

Unnamed: 0,foo,oof
0,0.71238,0.578571
1,0.236502,0.959396
2,0.427786,0.277955


In [66]:
pd.DataFrame(np.random.rand(3,2), columns=['foo','oof'], index=[5,6,7])

Unnamed: 0,foo,oof
5,0.158725,0.651052
6,0.587097,0.115849
7,0.170712,0.030182


#### From a NumPy structured array

In [67]:
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [68]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


## iii. The Pandas Index Object

In [70]:
ind = pd.Index([2,3,5,7,11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

### Index as immutable array

In [71]:
ind[1]

3

In [72]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [73]:
ind[:2]

Int64Index([2, 3], dtype='int64')

In [74]:
# Index objects are immutable: item assignment raises TypeError. The error is
# caught and printed so the demonstration does not abort a "Run All".
try:
    ind[1] = 99
except TypeError as err:
    print("TypeError:", err)

TypeError: Index does not support mutable operations

### Index as ordered set

In [79]:
indA = pd.Index([1,3,5,8])
indB = pd.Index([2,3,5,7])

In [80]:
# NOTE(review): using &, | and ^ for Index set operations is deprecated in later
# pandas releases — prefer the method forms shown further down
# (indA.intersection(indB), ...); confirm against the installed version.
indA & indB #intersection

Int64Index([3, 5], dtype='int64')

In [81]:
indA | indB #union (method form: indA.union(indB))

Int64Index([1, 2, 3, 5, 7, 8], dtype='int64')

In [82]:
indA ^ indB #symmetric difference (method form: indA.symmetric_difference(indB))

Int64Index([1, 2, 7, 8], dtype='int64')

In [83]:
#also can be used
indA.intersection(indB)

Int64Index([3, 5], dtype='int64')

In [84]:
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 8], dtype='int64')

In [85]:
indA.symmetric_difference(indB)

Int64Index([1, 2, 7, 8], dtype='int64')

## 3.2 Data Indexing and Selection

### Data Selection in Series

### Series as dictionary

In [86]:
import pandas as pd
data = pd.Series([0.25,0.50,0.75,1.0], index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [87]:
data['b']

0.5

In [88]:
'a' in data

True

In [89]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [99]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [100]:
data['e'] = 1.25

In [101]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [102]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0), ('e', 1.25)]

### Series as one-dimensional array

In [103]:
data['a':'c'] #slicing by explicit index

a    0.25
b    0.50
c    0.75
dtype: float64

In [105]:
data[1:3] #slicing by implicit integer index

b    0.50
c    0.75
dtype: float64

In [106]:
(data>0.5) & (data<1.0)

a    False
b    False
c     True
d    False
e    False
dtype: bool

In [112]:
data[(data>0.5) & (data<1.0)] #masking

c    0.75
dtype: float64

In [114]:
data[['a','e']] #fancy indexing

a    0.25
e    1.25
dtype: float64

Among these, slicing may be the source of the most confusion. Notice that when slicing with an explicit index (e.g., `data['a':'c']`), the final index is included in the slice, while when slicing with an implicit integer index (e.g., `data[1:3]`), the final index is excluded from the slice.

### Indexers: loc and iloc (ix is deprecated)

In [131]:
data = pd.Series(['a','b','c'], index=[1,2,3])
data

1    a
2    b
3    c
dtype: object

In [134]:
data.loc[1:3]

1    a
2    b
3    c
dtype: object

In [135]:
data.iloc[1:3]

2    b
3    c
dtype: object

## Data Selection in DataFrame

## DataFrame as a dictionary

In [136]:
states

Unnamed: 0,Area,Population
Banglore,130,20000
Chennai,200,10000
Delhi,100,15000
Mumbai,150,25000


In [138]:
states['Area']

Banglore    130
Chennai     200
Delhi       100
Mumbai      150
Name: Area, dtype: int64

In [139]:
states.Area

Banglore    130
Chennai     200
Delhi       100
Mumbai      150
Name: Area, dtype: int64

In [163]:
states['Density'] = states['Population'] / states['Area']
states

Unnamed: 0,Area,Population,Density
Banglore,130,20000,153.846154
Chennai,200,10000,50.0
Delhi,100,15000,150.0
Mumbai,150,25000,166.666667


### DataFrame as two-dimensional array

In [165]:
states.loc[states.Density>100, ['Population', 'Density']]

Unnamed: 0,Population,Density
Banglore,20000,153.846154
Delhi,15000,150.0
Mumbai,25000,166.666667


In [166]:
states.T #transpose the matrix

Unnamed: 0,Banglore,Chennai,Delhi,Mumbai
Area,130.0,200.0,100.0,150.0
Population,20000.0,10000.0,15000.0,25000.0
Density,153.846154,50.0,150.0,166.666667


In [167]:
states.loc[states.Density>100, ['Population', 'Density']]

Unnamed: 0,Population,Density
Banglore,20000,153.846154
Delhi,15000,150.0
Mumbai,25000,166.666667


In [170]:
# Positional assignment: row 0 ('Banglore'), column 2 ('Density') is overwritten
# in place, so every later cell sees Density == 99 for that row.
states.iloc[0,2] =99
states

Unnamed: 0,Area,Population,Density
Banglore,130,20000,99.0
Chennai,200,10000,50.0
Delhi,100,15000,150.0
Mumbai,150,25000,166.666667


### Additional indexing conventions

Indexing refers to columns, while slicing refers to rows:

In [172]:
states['Banglore':'Delhi']

Unnamed: 0,Area,Population,Density
Banglore,130,20000,99.0
Chennai,200,10000,50.0
Delhi,100,15000,150.0


In [174]:
states[0:3] #also refer to rows by number

Unnamed: 0,Area,Population,Density
Banglore,130,20000,99.0
Chennai,200,10000,50.0
Delhi,100,15000,150.0


In [175]:
states[states.Density>100] #direct masking operations are also interpreted row-wise rather than column-wise

Unnamed: 0,Area,Population,Density
Delhi,100,15000,150.0
Mumbai,150,25000,166.666667


## 3.3 Operating on Data in Pandas

### Ufuncs: Index Preservation

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seeded generator so the random integers used in this section are reproducible.
rng = np.random.RandomState(seed=42)
# Four integers drawn from the half-open range [0, 10).
ser = pd.Series(rng.randint(0, 10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [3]:
df = pd.DataFrame(rng.randint(10, size=(3,4)), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [4]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [5]:
np.log(df)

Unnamed: 0,A,B,C,D
0,1.791759,2.197225,0.693147,1.791759
1,1.94591,1.386294,1.098612,1.94591
2,1.94591,0.693147,1.609438,1.386294


### UFuncs: Index Alignment

### Index alignment in Series

In [9]:
# Two Series keyed by the same city labels, so arithmetic between them is
# matched up label by label.
area = pd.Series(dict(delhi=200, mumbai=300, chennai=500))
pop = pd.Series(dict(delhi=1200, mumbai=1300, chennai=1500))

In [10]:
area

chennai    500
delhi      200
mumbai     300
dtype: int64

In [11]:
pop

chennai    1500
delhi      1200
mumbai     1300
dtype: int64

In [12]:
pop/area #density

chennai    3.000000
delhi      6.000000
mumbai     4.333333
dtype: float64

In [13]:
# Union of the two indexes: the set of labels the aligned result will carry.
# The .union() method is used instead of the `|` operator, whose set-operation
# meaning on Index objects is deprecated in later pandas releases.
area.index.union(pop.index)

Index(['chennai', 'delhi', 'mumbai'], dtype='object')

In [29]:
a = pd.Series([2,4,6], index=[0,1,2])
b = pd.Series([3,5,7], index=[1,2,3])

In [30]:
a+b

0     NaN
1     7.0
2    11.0
3     NaN
dtype: float64

In [17]:
# Element-wise logical OR of the two Series' values after index alignment (the
# result below is boolean). This is NOT a union of the indexes; the index union
# is computed in the next cell.
a | b

0     True
1     True
2     True
3    False
dtype: bool

In [19]:
# Union of the two indexes: the labels of the aligned result of a + b.
# The .union() method is used instead of the `|` operator, whose set-operation
# meaning on Index objects is deprecated in later pandas releases.
a.index.union(b.index)

Int64Index([0, 1, 2, 3], dtype='int64')

In [20]:
a.add(b) #same as a+b

0     NaN
1     7.0
2    11.0
3     NaN
dtype: float64

In [21]:
a.add(b, fill_value=0) #fill the NaN value

0     2.0
1     7.0
2    11.0
3     7.0
dtype: float64

### Index alignment in DataFrame

In [35]:
a = pd.DataFrame(rng.randint(20, size=(3,2)), columns=list('AB'))
a

Unnamed: 0,A,B
0,13,16
1,3,17
2,7,3


In [36]:
b = pd.DataFrame(rng.randint(10, size=(3,4)), columns=['B','C','D','E'])
b

Unnamed: 0,B,C,D,E
0,1,5,5,9
1,3,5,1,9
2,1,9,3,7


In [37]:
a+b

Unnamed: 0,A,B,C,D,E
0,,17,,,
1,,20,,,
2,,4,,,


In [38]:
a.add(b, fill_value=0)

Unnamed: 0,A,B,C,D,E
0,13.0,17,5.0,5.0,9.0
1,3.0,20,5.0,1.0,9.0
2,7.0,4,9.0,3.0,7.0


### Ufuncs: Operations Between DataFrame and Series

In [39]:
# NOTE(review): this subtracts a DataFrame from itself (all zeros). A
# DataFrame-with-Series operation, as in the section title, would look like
# a - a.iloc[0] — confirm which was intended.
a-a

Unnamed: 0,A,B
0,0,0
1,0,0
2,0,0


In [43]:
b-b

Unnamed: 0,B,C,D,E
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0


## 3.4 Handling Missing Data

Pandas chose to use sentinels for missing data, and further chose to use two already-existing Python null values: the special floating-point NaN value and the Python None object.

### None: Pythonic missing data

In [45]:
import numpy as np
import pandas as pd

In [46]:
val1 = np.array([1, None, 3, 4])
val1

array([1, None, 3, 4], dtype=object)

In [47]:
val1.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

Addition between an integer and None is undefined, which is why the sum above raises a TypeError.

### NaN: Missing numerical data

In [48]:
# NaN is a float, so the whole array is promoted to float64.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
val2 = np.array([1, np.nan, 2, 3])
val2

array([ 1., nan,  2.,  3.])

In [49]:
val2.sum() #NaN is a bit like a data virus–it infects any other object it touches

nan

In [50]:
# Any arithmetic involving NaN yields NaN (np.nan replaces the removed np.NaN alias).
np.nan + 1

nan

In [51]:
# Even multiplying by zero does not clear a NaN (np.nan replaces the removed np.NaN alias).
np.nan * 0

nan

In [52]:
# Canonical spelling of the NaN constant; the np.NaN alias was removed in NumPy 2.0.
np.nan

nan

In [53]:
# NOTE(review): np.NAN is a legacy alias of np.nan that only exists on
# NumPy < 2.0; this cell fails on newer NumPy — confirm the installed version.
np.NAN

nan

In [56]:
val2.sum() ,val2.min() ,val2.max()

(nan, nan, nan)

In [58]:
np.nansum(val2) #NumPy does provide some special aggregations that will ignore these missing values:

6.0

In [59]:
np.nanmax(val2), np.nanmin(val2)

(3.0, 1.0)

NaN is specifically a floating-point value; there is no equivalent NaN value for integers, strings, or other types

### NaN and None in Pandas

In [62]:
# pandas treats NaN and None nearly interchangeably: in this numeric Series both
# are stored as NaN and the values are upcast to float64.
# (np.nan replaces the np.NaN alias removed in NumPy 2.0.)
pd.Series([1, 2, np.nan, 4, None])

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
dtype: float64

### Operating on Null Values

### Detecting null values

In [63]:
# Mixed-type Series (dtype object) holding both kinds of missing value.
# (np.nan replaces the np.NaN alias removed in NumPy 2.0.)
data = pd.Series([1, 2, np.nan, 'hello', None, 9])
data

0        1
1        2
2      NaN
3    hello
4     None
5        9
dtype: object

In [65]:
data.isnull()

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [66]:
data.notnull()

0     True
1     True
2    False
3     True
4    False
5     True
dtype: bool

In [67]:
temp = data.notnull()
data[temp]

0        1
1        2
3    hello
5        9
dtype: object

In [69]:
data[data.notnull()]

0        1
1        2
3    hello
5        9
dtype: object

isnull() and notnull() methods produce similar Boolean results for DataFrames.

### Dropping null values

In [70]:
data.dropna()

0        1
1        2
3    hello
5        9
dtype: object

In [86]:
# 4x5 frame with missing values: column 3 is complete, column 4 is entirely NaN,
# and columns 0-2 each have exactly one hole.
df = pd.DataFrame(
    [
        [1,      2,      np.nan, 0, np.nan],
        [np.nan, 3,      4,      0, np.nan],
        [6,      np.nan, 9,      0, np.nan],
        [3,      1,      7,      0, np.nan],
    ]
)
df

Unnamed: 0,0,1,2,3,4
0,1.0,2.0,,0,
1,,3.0,4.0,0,
2,6.0,,9.0,0,
3,3.0,1.0,7.0,0,


We cannot drop single values from a DataFrame; we can only drop full rows or full columns. By default, dropna() will drop all rows in which any null value is present

In [87]:
df.dropna()

Unnamed: 0,0,1,2,3,4


In [88]:
df.dropna(axis='columns')

Unnamed: 0,3
0,0
1,0
2,0
3,0


In [89]:
df.dropna(axis=1)

Unnamed: 0,3
0,0
1,0
2,0
3,0


In [90]:
df.dropna(how='all')

Unnamed: 0,0,1,2,3,4
0,1.0,2.0,,0,
1,,3.0,4.0,0,
2,6.0,,9.0,0,
3,3.0,1.0,7.0,0,


In [91]:
df.dropna(how='any') #any is default

Unnamed: 0,0,1,2,3,4


In [92]:
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2,3
0,1.0,2.0,,0
1,,3.0,4.0,0
2,6.0,,9.0,0
3,3.0,1.0,7.0,0


The thresh parameter lets you specify a minimum number of non-null values for the row/column to be kept.

In [95]:
df.dropna(thresh=4)

Unnamed: 0,0,1,2,3,4
3,3.0,1.0,7.0,0,


In [96]:
df.dropna(axis=1, thresh=4)

Unnamed: 0,3
0,0
1,0
2,0
3,0


### Filling null values

In [101]:
# Labelled, mixed-type Series with three missing entries ('c', 'e', 'h').
# (np.nan replaces the np.NaN alias removed in NumPy 2.0.)
data = pd.Series([1, 2, np.nan, 'hello', np.nan, 5, 6, np.nan], index=list('abcdefgh'))
data

a        1
b        2
c      NaN
d    hello
e      NaN
f        5
g        6
h      NaN
dtype: object

In [102]:
data.fillna(0)

a        1
b        2
c        0
d    hello
e        0
f        5
g        6
h        0
dtype: object

In [103]:
# Forward fill: each missing entry takes the last valid value before it.
# .ffill() is the dedicated method; fillna(method=...) is deprecated in later pandas.
data.ffill()

a        1
b        2
c        2
d    hello
e    hello
f        5
g        6
h        6
dtype: object

In [104]:
# Back fill: each missing entry takes the next valid value after it; a trailing
# NaN stays NaN because nothing follows it.
# .bfill() is the dedicated method; fillna(method=...) is deprecated in later pandas.
data.bfill()

a        1
b        2
c    hello
d    hello
e        5
f        5
g        6
h      NaN
dtype: object

In [105]:
df

Unnamed: 0,0,1,2,3,4
0,1.0,2.0,,0,
1,,3.0,4.0,0,
2,6.0,,9.0,0,
3,3.0,1.0,7.0,0,


In [107]:
# Forward fill along each row (left to right); a NaN in the first column has
# nothing to its left and therefore stays NaN.
# .ffill() is the dedicated method; fillna(method=...) is deprecated in later pandas.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3,4
0,1.0,2.0,2.0,0.0,0.0
1,,3.0,4.0,0.0,0.0
2,6.0,6.0,9.0,0.0,0.0
3,3.0,1.0,7.0,0.0,0.0


In [108]:
# Forward fill down each column (top to bottom); NaNs in the first row and the
# all-NaN column have no earlier value to copy and stay NaN.
# .ffill() is the dedicated method; fillna(method=...) is deprecated in later pandas.
df.ffill(axis=0)

Unnamed: 0,0,1,2,3,4
0,1.0,2.0,,0,
1,1.0,3.0,4.0,0,
2,6.0,3.0,9.0,0,
3,3.0,1.0,7.0,0,


In [109]:
# With no axis given the fill runs down each column, i.e. the same as axis=0.
# .ffill() is the dedicated method; fillna(method=...) is deprecated in later pandas.
df.ffill()

Unnamed: 0,0,1,2,3,4
0,1.0,2.0,,0,
1,1.0,3.0,4.0,0,
2,6.0,3.0,9.0,0,
3,3.0,1.0,7.0,0,


## 3.5 Hierarchical indexing

In [None]:
# Reference: Python Data Science Handbook by Jake VanderPlas (O'Reilly), Chapter 3