In [1]:
import pandas as pd

In [4]:
from pandas import DataFrame,Series

To get started with pandas, you will need to get comfortable with its two workhorse
data structures: Series and DataFrame. While they are not a universal solution for
every problem, they provide a solid, easy-to-use basis for most applications.

In [5]:
obj=pd.Series([0.7,3,4,1,0.78])

In [7]:
obj

0    0.70
1    3.00
2    4.00
3    1.00
4    0.78
dtype: float64

In [8]:
obj.values

array([0.7 , 3.  , 4.  , 1.  , 0.78])

In [9]:
obj.index

RangeIndex(start=0, stop=5, step=1)

In [10]:
obj=pd.Series([2,3,4,5],index=['d','a','c','b'])

In [11]:
obj

d    2
a    3
c    4
b    5
dtype: int64

In [12]:
obj.index

Index(['d', 'a', 'c', 'b'], dtype='object')

In [13]:
obj.values

array([2, 3, 4, 5])

In [14]:
obj['a']

3

In [15]:
obj['d']

2

In [16]:
obj[['a','b']]

a    3
b    5
dtype: int64

In [17]:
obj[obj>3]

c    4
b    5
dtype: int64

In [18]:
obj*3

d     6
a     9
c    12
b    15
dtype: int64

In [21]:
import numpy as np

In [22]:
np.exp(obj)

d      7.389056
a     20.085537
c     54.598150
b    148.413159
dtype: float64

In [23]:
'd' in obj

True

In [26]:
2 in obj

False

In [27]:
2 in obj.values

True

In [28]:
d={'a':12,'b':900,'c':10.4}

In [29]:
obj1=pd.Series(d)

In [30]:
obj1

a     12.0
b    900.0
c     10.4
dtype: float64

In [32]:
s=['a','e','r','b','c']
obj2=pd.Series(d,index=s)

In [33]:
obj2

a     12.0
e      NaN
r      NaN
b    900.0
c     10.4
dtype: float64

In [34]:
pd.isnull(obj2)

a    False
e     True
r     True
b    False
c    False
dtype: bool

In [36]:
obj2[obj2.isnull()]

e   NaN
r   NaN
dtype: float64

In [37]:
obj2[obj2>12]

b    900.0
dtype: float64

In [38]:
pd.notnull(obj2)

a     True
e    False
r    False
b     True
c     True
dtype: bool

In [39]:
obj1.notnull()

a    True
b    True
c    True
dtype: bool

In [40]:
obj1+obj

a     15.0
b    905.0
c     14.4
d      NaN
dtype: float64

In [42]:
obj2.name='data3'
obj2.index.name='data index'

In [43]:
obj2

data index
a     12.0
e      NaN
r      NaN
b    900.0
c     10.4
Name: data3, dtype: float64

In [45]:
obj.index=['puchinki','nipovka','mansion','base']

In [46]:
obj

puchinki    2
nipovka     3
mansion     4
base        5
dtype: int64

## DataFrame
A DataFrame represents a rectangular table of data and contains an ordered collec‐
tion of columns, each of which can be a different value type (numeric, string,
boolean, etc.). The DataFrame has both a row and column index; it can be thought of
as a dict of Series all sharing the same index.

In [47]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [48]:
df=pd.DataFrame(data)

In [49]:
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [50]:
df.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [51]:
pd.DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [52]:
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [54]:
frame1=pd.DataFrame(data,columns=['year','pop','state','city'])

In [55]:
frame1

Unnamed: 0,year,pop,state,city
0,2000,1.5,Ohio,
1,2001,1.7,Ohio,
2,2002,3.6,Ohio,
3,2001,2.4,Nevada,
4,2002,2.9,Nevada,
5,2003,3.2,Nevada,


In [71]:
frame1=pd.DataFrame(data,columns=['year','pop','state','city'],index=['one','two','three','four','five','six'])

In [72]:
frame1

Unnamed: 0,year,pop,state,city
one,2000,1.5,Ohio,
two,2001,1.7,Ohio,
three,2002,3.6,Ohio,
four,2001,2.4,Nevada,
five,2002,2.9,Nevada,
six,2003,3.2,Nevada,


In [60]:
frame1.columns

Index(['year', 'pop', 'state', 'city'], dtype='object')

In [62]:
frame1['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [68]:
frame1.year='y'

In [69]:
frame1.year

one      y
two      y
three    y
four     y
five     y
six      y
Name: year, dtype: object

In [70]:
frame1

Unnamed: 0,year,pop,state,city
one,y,1.5,Ohio,
two,y,1.7,Ohio,
three,y,3.6,Ohio,
four,y,2.4,Nevada,
five,y,2.9,Nevada,
six,y,3.2,Nevada,


In [73]:
frame1

Unnamed: 0,year,pop,state,city
one,2000,1.5,Ohio,
two,2001,1.7,Ohio,
three,2002,3.6,Ohio,
four,2001,2.4,Nevada,
five,2002,2.9,Nevada,
six,2003,3.2,Nevada,


Rows can also be retrieved by position or name with the special loc attribute (much
more on this later):

In [74]:

frame1.loc['two']

year     2001
pop       1.7
state    Ohio
city      NaN
Name: two, dtype: object

In [76]:
frame1['city']='pune'

In [77]:
frame1

Unnamed: 0,year,pop,state,city
one,2000,1.5,Ohio,pune
two,2001,1.7,Ohio,pune
three,2002,3.6,Ohio,pune
four,2001,2.4,Nevada,pune
five,2002,2.9,Nevada,pune
six,2003,3.2,Nevada,pune


In [78]:
val=pd.Series([1.2,0.5,5],index=['two','three','one'])

In [79]:
frame1['value']=val

In [80]:
frame1

Unnamed: 0,year,pop,state,city,value
one,2000,1.5,Ohio,pune,5.0
two,2001,1.7,Ohio,pune,1.2
three,2002,3.6,Ohio,pune,0.5
four,2001,2.4,Nevada,pune,
five,2002,2.9,Nevada,pune,
six,2003,3.2,Nevada,pune,


The del method can then be used to remove this column:

In [82]:
del frame1['value']

In [83]:
frame1

Unnamed: 0,year,pop,state,city
one,2000,1.5,Ohio,pune
two,2001,1.7,Ohio,pune
three,2002,3.6,Ohio,pune
four,2001,2.4,Nevada,pune
five,2002,2.9,Nevada,pune
six,2003,3.2,Nevada,pune


In [93]:
pop={'state':{'UP':1,'MP':2,'Delhi':3,'HP':4}, 'city':{'UP':5,'Ind':5,'dwk':7,'Shim':8}}

In [94]:
frame2=pd.DataFrame(pop)

In [95]:
frame2

Unnamed: 0,state,city
Delhi,3.0,
HP,4.0,
Ind,,5.0
MP,2.0,
Shim,,8.0
UP,1.0,5.0
dwk,,7.0


You can transpose the DataFrame (swap rows and columns) with similar syntax to a
NumPy array:

In [96]:
frame2.T

Unnamed: 0,Delhi,HP,Ind,MP,Shim,UP,dwk
state,3.0,4.0,,2.0,,1.0,
city,,,5.0,,8.0,5.0,7.0


In [97]:
frame1.T

Unnamed: 0,one,two,three,four,five,six
year,2000,2001,2002,2001,2002,2003
pop,1.5,1.7,3.6,2.4,2.9,3.2
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
city,pune,pune,pune,pune,pune,pune


In [99]:
pd.DataFrame(pop,index=['a','b','c','MP'])

Unnamed: 0,state,city
a,,
b,,
c,,
MP,2.0,


In [103]:
pdata={'nevada':frame1['Nevada'][:2]}

KeyError: 'Nevada'

In [104]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [105]:
frame3=pd.DataFrame(pop)

In [106]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [107]:
pdata={'Ohio':frame3['Ohio'][:-1],'Nevada':frame3['Nevada'][:2]}

In [108]:
frame4=pd.DataFrame(pdata)

In [109]:
frame4

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [110]:
frame4.index.name='year'

In [111]:
frmae4

NameError: name 'frmae4' is not defined

In [112]:
frame4

Unnamed: 0_level_0,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4


In [114]:
frame4.columns.name='state'

In [115]:
frame4

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4


In [116]:
frame2.values

array([[ 3., nan],
       [ 4., nan],
       [nan,  5.],
       [ 2., nan],
       [nan,  8.],
       [ 1.,  5.],
       [nan,  7.]])

In [117]:
frame4.values

array([[1.5, nan],
       [1.7, 2.4]])

In [121]:
frame2.values

array([[ 3., nan],
       [ 4., nan],
       [nan,  5.],
       [ 2., nan],
       [nan,  8.],
       [ 1.,  5.],
       [nan,  7.]])

In [122]:
pd.DataFrame(np.arange(9).reshape(3,3))


Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [124]:
obj=pd.Series(range(3),index=['a','b','c'])

In [125]:
index=obj.index

In [126]:
index

Index(['a', 'b', 'c'], dtype='object')

In [127]:
index[:2]

Index(['a', 'b'], dtype='object')

In [128]:
index[1]='d'

TypeError: Index does not support mutable operations

index objects are immutable and thus can't be modified by the user

In [131]:
labels=pd.Index(np.arange(10))

In [133]:
labels

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [136]:
pd.DataFrame(np.arange(9).reshape(3,3),index=['a','a','c'],columns=['val1','val2','val3'])

Unnamed: 0,val1,val2,val3
a,0,1,2
a,3,4,5
c,6,7,8


In [137]:
frame4=pd.DataFrame(np.arange(9).reshape(3,3),index=['a','a','c'],columns=['val1','val2','val3'])

In [139]:
frame4.index

Index(['a', 'a', 'c'], dtype='object')

In [140]:
lables=pd.Index(['aa','bb','cc'])

In [141]:
type(lables)

pandas.core.indexes.base.Index

#### Essential FUnctionality


###### Reindexing

An important method on pandas objects is reindex , which means to create a new
object with the data conformed to a new index. Consider an example:

In [142]:
frame1=pd.DataFrame([1,2,3,4],index=['a','b','c','d'])

In [143]:
frame1

Unnamed: 0,0
a,1
b,2
c,3
d,4


In [144]:
frame1.reindex(['b','d','f','a','c'])

Unnamed: 0,0
b,2.0
d,4.0
f,
a,1.0
c,3.0


In [145]:
frame1

Unnamed: 0,0
a,1
b,2
c,3
d,4


In [148]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[2,0, 4])

In [149]:
obj3

2      blue
0    purple
4    yellow
dtype: object

In [152]:
obj3.reindex(range(4),method='ffill')

ValueError: index must be monotonic increasing or decreasing

In [153]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0,2, 4])

In [154]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [156]:
obj3.reindex(range(9),method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
6    yellow
7    yellow
8    yellow
dtype: object

In [158]:
obj3.reindex(['a',0,2])

a       NaN
0      blue
2    purple
dtype: object

In [162]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'])

In [163]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [164]:
frame2=frame.reindex(['a','b','c','d'])

In [165]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [166]:
states = ['Texas', 'Utah', 'California']

In [172]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [171]:
frame.reindex(columns=['u','T'])

Unnamed: 0,u,T
a,,
c,,
d,,


drop for deleting index(rows)

In [173]:
data=pd.DataFrame(np.arange(16).reshape(4,4),index=['Ohio', 'Colorado', 'Utah', 'New York'],columns=['one', 'two', 'three', 'four'])

In [174]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [175]:
data.drop('Ohio')

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [176]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [180]:
data.drop(['Ohio','Utah'])

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
New York,12,13,14,15


In [188]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [189]:
data.drop('two',axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [191]:
data.drop(['one','three'],axis='columns')

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [192]:
obj

a    0
b    1
c    2
dtype: int64

In [193]:
obj.drop('c',inplace=True)

In [194]:
obj

a    0
b    1
dtype: int64

#### Indexing, Selection, and Filtering

Series indexing ( obj[...] ) works analogously to NumPy array indexing, except you
can use the Series’s index values instead of only integers. Here are some examples of
this:

In [195]:
s1=pd.Series(np.arange(4.),index=['a','b','c','d'])

In [196]:
s1

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [197]:
s1[:2]

a    0.0
b    1.0
dtype: float64

In [198]:
s1['d']

3.0

In [199]:
s1[[1,2]]

b    1.0
c    2.0
dtype: float64

In [200]:
s1[[3]]

d    3.0
dtype: float64

In [201]:
s1[[1,3]]

b    1.0
d    3.0
dtype: float64

In [202]:
s1[s1<3]

a    0.0
b    1.0
c    2.0
dtype: float64

In [203]:
s1[0]

0.0

In [205]:
s1[2]

2.0

In [206]:
s1[-1]

3.0

In [207]:
s1[:-1]

a    0.0
b    1.0
c    2.0
dtype: float64

In [208]:
s1['a':'c']

a    0.0
b    1.0
c    2.0
dtype: float64

In [209]:
s1['b':'d']=5

In [210]:
s1

a    0.0
b    5.0
c    5.0
d    5.0
dtype: float64

In [211]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [212]:
frame['Ohio']

a    0
c    3
d    6
Name: Ohio, dtype: int64

In [213]:
frame[['Ohio','Texas']]

Unnamed: 0,Ohio,Texas
a,0,1
c,3,4
d,6,7


In [214]:
frame[:2]

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5


In [215]:
frame[frame['Ohio']>2]

Unnamed: 0,Ohio,Texas,California
c,3,4,5
d,6,7,8


In [216]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [217]:
frame.reindex(index=['a','b','c','d'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [218]:
s1.loc['b']

5.0

In [219]:
s1.loc['a']

0.0

In [221]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [222]:
frame<5

Unnamed: 0,Ohio,Texas,California
a,True,True,True
c,True,True,False
d,False,False,False


In [223]:
frame[frame<5]=0

In [224]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,0,0
c,0,0,5
d,6,7,8


In [225]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,0,0
c,0,0,5
d,6,7,8


In [228]:
frame.loc[['a','d']]

Unnamed: 0,Ohio,Texas,California
a,0,0,0
d,6,7,8


In [229]:
frame.loc['c',['Texas','Ohio']]

Texas    0
Ohio     0
Name: c, dtype: int64

In [231]:
ser=pd.Series(np.arange(6.))

In [237]:
ser

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
5    5.0
dtype: float64

In [245]:
ser=pd.Series(np.arange(3),index=['a','b','c'])

In [246]:
ser[-1]

2

In [248]:
ser=pd.Series(np.random.randn(3),index=[0,1,2])

In [249]:
ser[-1]

KeyError: -1

In [250]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),index=['Ohio', 'Texas', 'Colorado'])

In [251]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [253]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [254]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [255]:
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [264]:
df2.loc[0,'e']=np.nan

In [265]:
df2

Unnamed: 0,b,d,e,"(0, e)","(Utah, e)"
Utah,0.0,1.0,,,
Ohio,3.0,4.0,5.0,,
Texas,6.0,7.0,8.0,,
Oregon,9.0,10.0,11.0,,
3,,,,,
0,,,,,


In [270]:
df1.add(df2,fill_value=0)

  other.columns, how=join, level=level, return_indexers=True)


Unnamed: 0,b,c,d,e,"(0, e)","(Utah, e)"
0,,,,,,
3,,,,,,
Colorado,6.0,7.0,8.0,,,
Ohio,3.0,1.0,6.0,5.0,,
Oregon,9.0,,10.0,11.0,,
Texas,9.0,4.0,12.0,8.0,,
Utah,0.0,,1.0,,,


In [276]:
df1*10

Unnamed: 0,b,c,d
Ohio,0.0,10.0,20.0
Texas,30.0,40.0,50.0
Colorado,60.0,70.0,80.0


#### Function Application and Mapping

NumPy ufuncs (element-wise array methods) also work with pandas objects:

In [277]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [278]:
frame

Unnamed: 0,b,d,e
Utah,0.033526,0.219741,-0.402631
Ohio,-1.12038,0.014851,-0.318537
Texas,0.54468,-0.749793,-0.797319
Oregon,-0.36241,-0.779975,1.252411


In [279]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.033526,0.219741,0.402631
Ohio,1.12038,0.014851,0.318537
Texas,0.54468,0.749793,0.797319
Oregon,0.36241,0.779975,1.252411


In [280]:
f=lambda x:x.max()-x.min()

In [291]:
frame.apply(f,axis=1)

Utah      0.622372
Ohio      1.135231
Texas     1.342000
Oregon    2.032386
dtype: float64

In [284]:
frame.apply(f,axis=0)

b    1.665061
d    0.999717
e    2.049730
dtype: float64

In [287]:
def func(x):
    return pd.Series([x.min(),x.max()],index=['min','max'])

In [288]:
frame.apply(func)

Unnamed: 0,b,d,e
min,-1.12038,-0.779975,-0.797319
max,0.54468,0.219741,1.252411


In [289]:
frame.apply(func,axis='columns')

Unnamed: 0,min,max
Utah,-0.402631,0.219741
Ohio,-1.12038,0.014851
Texas,-0.797319,0.54468
Oregon,-0.779975,1.252411


In [294]:
format=lambda x: '%.2f'%x

In [295]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.03,0.22,-0.4
Ohio,-1.12,0.01,-0.32
Texas,0.54,-0.75,-0.8
Oregon,-0.36,-0.78,1.25


The reason for the name applymap is that Series has a map method for applying an
element-wise function

In [300]:
frame['e'].apply(format)

Utah      -0.40
Ohio      -0.32
Texas     -0.80
Oregon     1.25
Name: e, dtype: object

In [301]:
frame['e'].map(format)

Utah      -0.40
Ohio      -0.32
Texas     -0.80
Oregon     1.25
Name: e, dtype: object

#### Sorting and Ranking

In [302]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

In [303]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [305]:
obj.sort_values()

d    0
a    1
b    2
c    3
dtype: int64

In [306]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),index=['three', 'one'],columns=['d', 'a', 'b', 'c'])

In [307]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [308]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [311]:
frame.sort_values(['a','b'])

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [313]:
frame['b']['three']=9

In [314]:
frame

Unnamed: 0,d,a,b,c
three,0,1,9,3
one,4,5,6,7


In [315]:
frame.sort_values(['b','a'])

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,9,3


In [316]:
frame.sort_index(axis='columns')

Unnamed: 0,a,b,c,d
three,1,9,3,0
one,5,6,7,4


In [317]:
frame.sort_index(axis='columns',ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,9,1
one,4,7,6,5


Any missing values are sorted to the end of the Series by default:

In [319]:
obj=pd.Series([3.2,np.nan,4,np.nan,7,-3,-2])

In [320]:
obj

0    3.2
1    NaN
2    4.0
3    NaN
4    7.0
5   -3.0
6   -2.0
dtype: float64

In [321]:
obj.sort_values()

5   -3.0
6   -2.0
0    3.2
2    4.0
4    7.0
1    NaN
3    NaN
dtype: float64

In [322]:
frame.sort_values(by='b')

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,9,3


In [323]:
frame.sort_values(by=['a','b'])

Unnamed: 0,d,a,b,c
three,0,1,9,3
one,4,5,6,7


#### Ranking

Ranking assigns ranks from one through the number of valid data points in an array.
The rank methods for Series and DataFrame are the place to look; by default rank
breaks ties by assigning each group the mean rank:

In [324]:
obj=pd.Series(C)

In [325]:
obj

0   -4
1   -2
2    7
3    3
4    2
5    5
6    1
7    5
dtype: int64

In [326]:
obj.rank()

0    1.0
1    2.0
2    8.0
3    5.0
4    4.0
5    6.5
6    3.0
7    6.5
dtype: float64

In [327]:
obj.rank(method='first')

0    1.0
1    2.0
2    8.0
3    5.0
4    4.0
5    6.0
6    3.0
7    7.0
dtype: float64

In [329]:
obj.rank(method='first',ascending=False)

0    8.0
1    7.0
2    1.0
3    4.0
4    5.0
5    2.0
6    6.0
7    3.0
dtype: float64

In [330]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],'c': [-2, 5, 8, -2.5]})

In [331]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [333]:
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [334]:
frame.rank(axis='index')

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,1.5,4.0
3,2.0,3.5,1.0


In [335]:
obj.rank(method='dense')

0    1.0
1    2.0
2    7.0
3    5.0
4    4.0
5    6.0
6    3.0
7    6.0
dtype: float64

In [336]:
obj.index.is_unique

True

In [338]:
obj.index.is_monotonic_increasing

True

Data selection is one of the main things that behaves differently with duplicates.
Indexing a label with multiple entries returns a Series, while single entries return a
scalar value:

In [352]:
obj=pd.Series(list('12345'),index=list('abade'))

In [353]:
obj

a    1
b    2
a    3
d    4
e    5
dtype: object

In [354]:
obj.values

array(['1', '2', '3', '4', '5'], dtype=object)

In [355]:
obj.index

Index(['a', 'b', 'a', 'd', 'e'], dtype='object')

In [356]:
obj['a']

a    1
a    3
dtype: object

In [358]:
obj['d']

'4'

In [359]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],index=['a', 'b', 'c', 'd'],columns=['one', 'two'])

In [360]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [361]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [362]:
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [363]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [364]:
df.sum(axis=1,skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [367]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [368]:
df.idxmax()

one    b
two    d
dtype: object

In [369]:
df.idxmin()

one    d
two    b
dtype: object

Some methods, like idxmin and idxmax , return indirect statistics like the index value
where the minimum or maximum values are attained:

In [370]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [371]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [372]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


describe produces multiple summary statistics

In [373]:
obj=pd.Series(list("abcd")*4)

In [374]:
obj

0     a
1     b
2     c
3     d
4     a
5     b
6     c
7     d
8     a
9     b
10    c
11    d
12    a
13    b
14    c
15    d
dtype: object

In [375]:
obj.describe()

count     16
unique     4
top        b
freq       4
dtype: object

##### Corelation and covariance

Some summary statistics, like correlation and covariance, are computed from pairs of
arguments. Let’s consider some DataFrames of stock prices and volumes obtained
from Yahoo! Finance using the add-on pandas-datareader package. If you don’t
have it installed already, it can be obtained via conda or pip:

In [377]:
conda install pandas-datareader

Collecting package metadata: done
Solving environment: done

## Package Plan ##

  environment location: /home/icpl13698/anaconda3

  added / updated specs:
    - pandas-datareader


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.6.14               |           py37_0         2.1 MB
    pandas-datareader-0.7.0    |           py37_0         142 KB
    ------------------------------------------------------------
                                           Total:         2.3 MB

The following NEW packages will be INSTALLED:

  pandas-datareader  pkgs/main/linux-64::pandas-datareader-0.7.0-py37_0

The following packages will be UPDATED:

  conda                                       4.6.11-py37_0 --> 4.6.14-py37_0



Downloading and Extracting Packages
pandas-datareader-0. | 142 KB    | ##################################### | 100% 
conda-4.6.14         | 2.1 MB    | ##################

In [1]:
import pandas_datareader.data as web

In [3]:
all_data={ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [4]:
all_data

{'AAPL':                   High         Low        Open       Close       Volume  \
 Date                                                                      
 2009-12-31   30.478571   30.080000   30.447144   30.104286   88102700.0   
 2010-01-04   30.642857   30.340000   30.490000   30.572857  123432400.0   
 2010-01-05   30.798571   30.464285   30.657143   30.625713  150476200.0   
 2010-01-06   30.747143   30.107143   30.625713   30.138571  138040000.0   
 2010-01-07   30.285715   29.864286   30.250000   30.082857  119282800.0   
 2010-01-08   30.285715   29.865715   30.042856   30.282858  111902700.0   
 2010-01-11   30.428572   29.778572   30.400000   30.015715  115557400.0   
 2010-01-12   29.967142   29.488571   29.884285   29.674286  148614900.0   
 2010-01-13   30.132856   29.157143   29.695715   30.092857  151473000.0   
 2010-01-14   30.065714   29.860001   30.015715   29.918571  108223500.0   
 2010-01-15   30.228571   29.410000   30.132856   29.418571  148516900.0   
 201

In [6]:
import pandas as pd
price = pd.DataFrame({ticker: data['Adj Close'] for ticker, data in all_data.items()})

In [8]:
price.head(10)

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-31,20.073631,99.303009,24.241983,307.986847
2010-01-04,20.386072,100.478867,24.615801,311.349976
2010-01-05,20.421322,99.265091,24.623755,309.978882
2010-01-06,20.096491,98.620255,24.472631,302.164703
2010-01-07,20.059338,98.27887,24.218124,295.130463
2010-01-08,20.192701,99.265091,24.385145,299.06488
2010-01-11,20.014568,98.225777,24.074966,298.612823
2010-01-12,19.786903,99.007156,23.915895,293.332153
2010-01-13,20.066006,98.794746,24.138592,291.648102
2010-01-14,19.949797,100.372673,24.623755,293.019196


In [12]:
price.head(100).to_csv('Yahoo_stock_data.csv')

In [13]:
volume = pd.DataFrame({ticker: data['Volume'] for ticker, data in all_data.items()})

In [14]:
returns=price.pct_change()

In [15]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-01,0.049086,0.002067,-0.020827,-0.017165
2019-05-02,-0.006508,-0.006901,-0.013059,-0.004683
2019-05-03,0.012431,0.004728,0.021314,0.019602
2019-05-06,-0.015443,0.000927,-0.005818,0.003366
2019-05-07,-0.026957,-0.019518,-0.020523,-0.012855


In [18]:
returns['MSFT'].corr(returns['IBM'])

0.4857714017978803

In [19]:
returns.MSFT.corr(returns.IBM)

0.4857714017978803

In [20]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.370625,0.448731,0.45592
IBM,0.370625,1.0,0.485771,0.403484
MSFT,0.448731,0.485771,1.0,0.53443
GOOG,0.45592,0.403484,0.53443,1.0


In [21]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000269,7.5e-05,0.000107,0.000116
IBM,7.5e-05,0.000152,8.7e-05,7.7e-05
MSFT,0.000107,8.7e-05,0.000209,0.000119
GOOG,0.000116,7.7e-05,0.000119,0.000238


Using DataFrame’s corrwith method, you can compute pairwise correlations
between a DataFrame’s columns or rows with another Series or DataFrame. Passing a
Series returns a Series with the correlation value computed for each column:

In [22]:
returns.corrwith(returns.IBM)

AAPL    0.370625
IBM     1.000000
MSFT    0.485771
GOOG    0.403484
dtype: float64

##### Unique Values, Value Counts, and Membership

In [23]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [24]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [31]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [32]:
pd.value_counts(obj.values,sort=False)

b    2
d    1
c    3
a    3
dtype: int64

In [33]:
mask=obj.isin(['b','d'])

In [34]:
mask

0    False
1    False
2     True
3    False
4    False
5     True
6     True
7    False
8    False
dtype: bool

In [35]:
obj[mask]

2    d
5    b
6    b
dtype: object

In [36]:
to_match=pd.Series(list('cabbca'))

In [37]:
unique_vals=pd.Series(list('cba'))

In [40]:
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [41]:
df1=pd.DataFrame({'price':[1,2,3,4,5],'value':[12,20,30,40,50]},index=list('abcde'))

In [42]:
df1

Unnamed: 0,price,value
a,1,12
b,2,20
c,3,30
d,4,40
e,5,50


In [43]:
df1.describe()

Unnamed: 0,price,value
count,5.0,5.0
mean,3.0,30.4
std,1.581139,15.192103
min,1.0,12.0
25%,2.0,20.0
50%,3.0,30.0
75%,4.0,40.0
max,5.0,50.0
