In [1]:
# numpy supplies the array functions (np.exp, np.arange, np.random, np.abs) used throughout
import numpy as np
# we are going to use Series and DataFrame from pandas a lot, import them directly into the namespace
from pandas import Series, DataFrame
import pandas as pd  # and the rest of pandas in pd space

In [2]:
#
# S E R I E S
#

In [3]:
# A Series is a one-dimensional array with more functionality defined on top;
# it can also be seen as a fixed-length, ordered dictionary.
# With no explicit index, the labels default to 0..n-1.
obj = Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
# notice that a default integer index is shown on the left
# some methods are predefined; the `?` prefix asks IPython for help on an object
# (`?obj` is IPython-only syntax, it is not valid in a plain Python script)
?obj

In [5]:
# .values exposes the data as a NumPy array, without the index labels
obj.values

array([ 4,  7, -5,  3])

In [6]:
# .index returns the index object that holds the row labels
obj.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [7]:
# a Series can carry any labels we choose as its index:
# here the values 4, 7, -5, 3 are labelled d, b, a, c (in that order)
obj2 = Series(data=[4, 7, -5, 3], index=list('dbac'))
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [8]:
# unlike a plain array, a Series can be indexed by label:
# this looks up the value stored under the index label 'a'
obj2['a']

-5

In [9]:
# a list of labels returns a Series with those entries, in the requested order
obj2[['c','a','d']] # list of indices

c    3
a   -5
d    4
dtype: int64

In [10]:
# index-value links are maintained across operations:
# boolean filtering keeps each surviving value attached to its label
obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

In [11]:
# scalar arithmetic is applied element-wise; the labels are unchanged
obj2*2

d     8
b    14
a   -10
c     6
dtype: int64

In [12]:
# NumPy universal functions accept a Series and return a Series with the same index
# (numpy is imported as np in the import cell at the top of the notebook)
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [13]:
# check for index membership: `in` tests the index labels, not the values
'b' in obj2

True

In [14]:
# a dictionary can be used to create a Series: keys become the index, values the data
# the recorded output lists the index in sorted order
# NOTE(review): newer pandas releases may keep the dict's insertion order instead -- confirm
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [15]:
# Series can also merge index sets properly: an explicit index selects and orders
# the dict entries by label ('Utah' is left out because it is not in `states`)
# note that NaN (not a number) indicates null values: 'California' has no entry in sdata
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [16]:
# the top-level isnull and notnull functions flag missing values element-wise
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [17]:
# notnull is the element-wise inverse of isnull
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [18]:
# there is also an isnull instance method on Series
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [19]:
# one of the cool features of Series is data alignment based on index:
# values are added label by label, and a label missing from either operand gives NaN
obj3 + obj4

California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64

In [20]:
# you can also name the values (Series.name) and the index (index.name);
# both names show up in the printed representation
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
Name: population, dtype: float64

In [21]:
#
# D A T A F R A M E 
#

In [22]:
# DataFrame is like data.frame in R, but more powerful.
# Build one from a dictionary that maps each column name to its list of values;
# all lists have the same length (one entry per row).
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [23]:
# len() of a DataFrame counts its rows, not its columns
len(frame)   # number of rows

5

In [24]:
# the column labels live in frame.columns; their count is the number of columns
len(frame.columns)  # number of columns

3

In [25]:
# .shape is a (number of rows, number of columns) tuple
frame.shape # rows and columns

(5, 3)

In [26]:
# describe() reports count, mean, std, min, quartiles and max per numeric column;
# the string column 'state' is left out of the summary
frame.describe()  # summarize numeric variable

Unnamed: 0,pop,year
count,5.0,5.0
mean,2.42,2001.2
std,0.864292,0.83666
min,1.5,2000.0
25%,1.7,2001.0
50%,2.4,2001.0
75%,2.9,2002.0
max,3.6,2002.0


In [27]:
# redisplay the frame for reference
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [28]:
# notice that the columns above are in sorted order of name
# NOTE(review): that is what the recorded output shows; newer pandas may keep dict order -- confirm
# now let us make the DataFrame with an explicit column order via `columns`
DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [29]:
# can retrieve a column by name; the result is a Series named after the column
frame['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In [30]:
# or using the dot attribute notation  (like $ in R)
# caveat: this only works when the column name does not clash with a DataFrame attribute,
# e.g. frame.pop is the DataFrame.pop method, not the 'pop' column -- use frame['pop'] there
frame.state

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In [31]:
# rows can be retrieved by index label using the loc indexer
# (.ix is deprecated and was removed in pandas 1.0; .loc is the label-based replacement)
frame.loc[2]

pop       3.6
state    Ohio
year     2002
Name: 2, dtype: object

In [32]:
# loc also selects a subset of rows and columns in one step
# label slices include both endpoints, so 0:2 returns rows 0, 1 and 2
# (.ix is deprecated and was removed in pandas 1.0; .loc is the label-based replacement)
frame.loc[0:2, ["state", "pop"]]

Unnamed: 0,state,pop
0,Ohio,1.5
1,Ohio,1.7
2,Ohio,3.6


In [33]:
# select on year: the boolean mask picks the rows, the list picks the columns
# (.ix is deprecated and was removed in pandas 1.0; .loc accepts the same mask)
frame.loc[frame.year == 2001, ["state", "pop"]]

Unnamed: 0,state,pop
1,Ohio,1.7
3,Nevada,2.4


In [34]:
# IPython help lookup for np.arange (`?` is IPython-only syntax, not plain Python)
?np.arange  # only the stop value is mandatory

In [35]:
# can modify DataFrame column by assignment; a column that does not exist yet is created
# np.arange(5) supplies one value per row of the 5-row frame (see the frame displayed next)
frame['debt'] = np.arange(5)

In [36]:
# display the frame to see the new 'debt' column
frame

Unnamed: 0,pop,state,year,debt
0,1.5,Ohio,2000,0
1,1.7,Ohio,2001,1
2,3.6,Ohio,2002,2
3,2.4,Nevada,2001,3
4,2.9,Nevada,2002,4


In [37]:
# can also create columns from an expression: the comparison yields one boolean per row
# NOTE(review): 'easter' is presumably meant to be 'eastern' -- confirm before renaming,
# because the column is deleted by this exact name further down
frame['easter'] = frame.state == 'Ohio'
frame

Unnamed: 0,pop,state,year,debt,easter
0,1.5,Ohio,2000,0,True
1,1.7,Ohio,2001,1,True
2,3.6,Ohio,2002,2,True
3,2.4,Nevada,2001,3,False
4,2.9,Nevada,2002,4,False


In [38]:
# del removes the column from the existing frame (no copy is made)
del frame['easter']  # delete a column from DataFrame

In [39]:
# display the frame to confirm the 'easter' column is gone
frame

Unnamed: 0,pop,state,year,debt
0,1.5,Ohio,2000,0
1,1.7,Ohio,2001,1
2,3.6,Ohio,2002,2
3,2.4,Nevada,2001,3
4,2.9,Nevada,2002,4


In [40]:
# reindexing = rearrange the data to follow a new index (change the order, add labels)
# start from obj2, whose labels are in the order d, b, a, c
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [41]:
# reindex returns a Series ordered by the given labels;
# 'e' is not a label of obj2, so it receives fill_value (0) instead of NaN
obj5 = obj2.reindex(['a', 'b','c','d','e'], fill_value=0)
obj5

a   -5
b    7
c    3
d    4
e    0
dtype: int64

In [42]:
# obj still has the default integer index 0..3
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [43]:
# can also do forward fill for numeric indices
#  ffill is same as pad: a new label takes the value of the last label before it
#  (here the new label 4 receives the value stored at label 3)
#  bfill fills backwards
obj6 = obj.reindex(range(5), method='ffill')
obj6

0    4
1    7
2   -5
3    3
4    3
dtype: int64

In [44]:
#
# FUNCTIONS AND MAPPING
#

In [45]:
# functions and mapping
# first make a numeric frame
# seed NumPy's global random generator so that re-running the notebook reproduces
# the same numbers (the seed value itself is arbitrary)
np.random.seed(42)
frame2 = DataFrame(np.random.randn(4, 3),  # four rows and three columns of random normals
                   columns=['b', 'd', 'e'],  # names for the three columns are b, d, e
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])  # the row names, i.e. the index
frame2

Unnamed: 0,b,d,e
Utah,1.607534,0.137393,-0.929921
Ohio,-0.974814,-0.210417,-0.908454
Texas,0.484713,-0.341808,0.615813
Oregon,-0.028019,-0.793875,-1.068275


In [46]:
# NumPy element-wise functions work on a whole DataFrame and keep its row/column labels
np.abs(frame2)  # take absolute value of all
# other functions: http://docs.scipy.org/doc/numpy-1.10.0/reference/routines.math.html

Unnamed: 0,b,d,e
Utah,1.607534,0.137393,0.929921
Ohio,0.974814,0.210417,0.908454
Texas,0.484713,0.341808,0.615813
Oregon,0.028019,0.793875,1.068275


In [47]:
# some instance methods are built in
# max, min, mean, mode, median, count, sum, abs, std, var, skew, kurt, quantile(q)
# see http://pandas.pydata.org/pandas-docs/stable/basics.html for more
# called without arguments, max() reduces down each column: one value per column
frame2.max()

b    1.607534
d    0.137393
e    0.615813
dtype: float64

In [48]:
# column-wise minimum: one value per column
frame2.min()

b   -0.974814
d   -0.793875
e   -1.068275
dtype: float64

In [49]:
# now we create a function for the range (largest value minus smallest value)
def r(x):
    """Return the range of `x`, i.e. ``x.max() - x.min()``.

    `x` can be any object with ``max()`` and ``min()`` methods, such as a
    Series or a NumPy array.
    """
    return x.max() - x.min()
# instance method apply is useful to map these
# each column is handed to r in turn, giving one result per column
frame2.apply(r) # default applied to columns


b    2.582348
d    0.931268
e    1.684088
dtype: float64

In [50]:
# to apply it to rows: axis=1 hands each row to r, giving one result per row label
frame2.apply(r, axis=1)

Utah      2.537455
Ohio      0.764397
Texas     0.957621
Oregon    1.040256
dtype: float64

In [51]:
# apply can also return series
def f(x):
    """Return the smallest and largest value of `x` as a Series labelled 'min' and 'max'."""
    labels = ['min', 'max']
    extremes = [x.min(), x.max()]
    return Series(extremes, index=labels)
# each column's two-entry result becomes one column of the resulting frame
frame2.apply(f)

Unnamed: 0,b,d,e
min,-0.974814,-0.793875,-1.068275
max,1.607534,0.137393,0.615813


In [52]:
# format operator %
# 'format pattern' % what to format
name = "John"
# print is written as a function call: required on Python 3, and with a single
# argument it behaves the same on Python 2
print('Hello %s!' % name)


Hello John!


In [53]:
year = 23
# a tuple supplies several values: %s takes the name, %d the integer
# (the literal 'rd' suffix in the pattern only reads correctly for numbers such as 23)
print("Greetings %s on your %drd birthday!" % (name, year))

Greetings John on your 23rd birthday!


In [54]:
# NOTE(review): this literal does not match pi (3.14159265...); it is left unchanged so
# that the recorded outputs stay valid -- confirm whether math.pi was intended
pi = 3.14159623
print("Eat the %f" % pi)  # %f prints six digits after the decimal point by default

Eat the 3.141596


In [55]:
# %.3f keeps three digits after the decimal point
print("Eat the %.3f" % pi)   # notice the rounding

Eat the 3.142


In [56]:
# now suppose we want to print frame2 up to 2 decimal places
# applymap applies a function to each element of the dataframe (rather than aggregating)
def myf(x):
    """Format the number `x` as a string with two digits after the decimal point."""
    return '%.2f' % x
# every element of frame2 is replaced by the string that myf returns for it
# NOTE(review): newer pandas deprecates applymap in favour of DataFrame.map -- confirm for your version
frame2.applymap(myf)

Unnamed: 0,b,d,e
Utah,1.61,0.14,-0.93
Ohio,-0.97,-0.21,-0.91
Texas,0.48,-0.34,0.62
Oregon,-0.03,-0.79,-1.07


In [57]:
#
# SORTING
#

In [58]:
# obj2 again, with its labels in their original order d, b, a, c
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [59]:
# sort by index: the result is ordered by label (a, b, c, d)
obj2.sort_index()

a   -5
b    7
c    3
d    4
dtype: int64

In [60]:
# sort by value, in descending order (largest value first)
obj2.sort_values(ascending=False)   # F is not a valid shortcut for False in Python

b    7
d    4
c    3
a   -5
dtype: int64