In [1]:
import pandas as pd;

In [2]:
# A Series built from a plain list; with no index given, pandas labels the
# entries 0, 1, 2.
s = pd.Series(data=["Central", "Chatswood", "Redfern"])

In [3]:
# Bare name as the last expression of the cell: the notebook echoes the Series
# (index on the left, values on the right, dtype underneath).
s

0      Central
1    Chatswood
2      Redfern
dtype: object

In [4]:
# City populations, labelled by city name instead of the default 0..n-1 index.
population = pd.Series(
    data=[5500000, 5400000, 1000000],
    index=["Sydney", "Melbourne", "Adelaide"],
)

In [5]:
# Show the Series: the city names now appear as the index labels.
population

Sydney       5500000
Melbourne    5400000
Adelaide     1000000
dtype: int64

In [6]:
# Adult head-counts per city. The cities are listed in a different order than in
# `population`, and Perth appears here only.
adult_population = pd.Series(
    data=[600000, 3500000, 2400000, 700000],
    index=["Adelaide", "Sydney", "Melbourne", "Perth"],
)

In [7]:
# Show the Series in the order the cities were given.
adult_population

Adelaide      600000
Sydney       3500000
Melbourne    2400000
Perth         700000
dtype: int64

In [8]:
# Subtraction pairs values up by index label, not by position.
# A label present on only one side (Perth) has no partner and becomes NaN,
# which is also why the result is float rather than int.
child_population = population.sub(adult_population)

In [9]:
# Show the difference; note the NaN for Perth and the float64 dtype.
child_population

Adelaide      400000.0
Melbourne    3000000.0
Perth              NaN
Sydney       2000000.0
dtype: float64

In [10]:
# Assigning to a label that does not exist yet appends a new entry to the Series.
adult_population.loc["Wollongong"] = 100

In [11]:
# A positional slice is still a Series, even when it holds a single element.
type(adult_population.iloc[:1])

pandas.core.series.Series

In [12]:
# Element-wise comparison: the result is a boolean Series with the same index.
# Wollongong (set to 100 above) is the only False.
adult_population > 200

Adelaide       True
Sydney         True
Melbourne      True
Perth          True
Wollongong    False
dtype: bool

In [13]:
# Boolean indexing: keep only the entries where the mask is True.
adult_population.loc[adult_population > 200]

Adelaide      600000
Sydney       3500000
Melbourne    2400000
Perth         700000
dtype: int64

In [14]:
# Two reusable boolean masks over the same index:
# more than 200 adults, and fewer than 1,000,000 adults.
not_tiny_cities = adult_population.gt(200)
not_big_cities = adult_population.lt(1000000)

In [15]:
# Element-wise logical AND of the two masks, matched label by label.
# Use | for OR.
not_big_cities & not_tiny_cities

Adelaide       True
Sydney        False
Melbourne     False
Perth          True
Wollongong    False
dtype: bool

In [16]:
# The combined mask can be used directly as an indexer:
# only cities that are neither tiny nor big remain.
adult_population.loc[not_tiny_cities & not_big_cities]

Adelaide    600000
Perth       700000
dtype: int64

## Note: boolean indexing like this is very similar to a WHERE clause in SQL

In [17]:
# Re-display population as a reminder before the summary statistics below.
population

Sydney       5500000
Melbourne    5400000
Adelaide     1000000
dtype: int64

In [18]:
# Arithmetic mean of the three city populations.
population.mean()

3966666.6666666665

In [19]:
# Median (middle value) of the city populations; returned as a float.
population.median()

5400000.0

In [20]:
# idxmax returns the index label of the largest value (idxmin does the same for the smallest).
population.idxmax()

'Sydney'

In [21]:
# Largest city among those with fewer than 3,000,000 adults.
# The mask is derived from adult_population, whose index is not the same as
# population's. Align it to population's labels explicitly rather than relying
# on implicit alignment inside []: that implicit path raises an IndexingError
# as soon as population contains a city that adult_population lacks.
# After reindex, such a city is NaN, the comparison is False, and it is
# simply left out of the selection.
population[adult_population.reindex(population.index) < 3000000].idxmax()

'Melbourne'

In [22]:
# Counts how often each distinct value occurs. Every population here is unique,
# so each count is 1; this is more useful on data with repeated values.
population.value_counts()

1000000    1
5400000    1
5500000    1
dtype: int64

## DataFrames

In [23]:
# Append two more cities by assigning to labels that are not in the index yet.
population.loc['Canberra'] = 100000
population.loc['Wingello'] = 300

In [24]:
# Combine the two Series into one table, one column per Series.
# The rows are the union of both indexes; a city missing from one of the
# Series gets NaN in that column.
df = pd.DataFrame({
    'Population': population,
    'Adults': adult_population,
})

In [25]:
# Show the whole frame (small enough here to display in full).
df

Unnamed: 0,Adults,Population
Adelaide,600000.0,1000000.0
Canberra,,100000.0
Melbourne,2400000.0,5400000.0
Perth,700000.0,
Sydney,3500000.0,5500000.0
Wingello,,300.0
Wollongong,100.0,


In [26]:
# First rows of the frame; with no argument, five rows are shown.
df.head()

Unnamed: 0,Adults,Population
Adelaide,600000.0,1000000.0
Canberra,,100000.0
Melbourne,2400000.0,5400000.0
Perth,700000.0,
Sydney,3500000.0,5500000.0


In [27]:
# Last two rows of the frame.
df.tail(2)

Unnamed: 0,Adults,Population
Wingello,,300.0
Wollongong,100.0,


In [28]:
# Random sample of three rows; handy as a quick check that the data looks reasonable.
# NOTE(review): no random_state is passed, so the rows shown will differ between runs.
df.sample(3)

Unnamed: 0,Adults,Population
Canberra,,100000.0
Melbourne,2400000.0,5400000.0
Wingello,,300.0


In [29]:
# The column labels of the frame.
df.columns

Index([u'Adults', u'Population'], dtype='object')

In [30]:
# The column labels are stored in a pandas Index object.
type(df.columns)

pandas.indexes.base.Index

In [31]:
# Attribute-style access to a single column; the result carries the column name (Name: Adults).
df.Adults

Adelaide       600000.0
Canberra            NaN
Melbourne     2400000.0
Perth          700000.0
Sydney        3500000.0
Wingello            NaN
Wollongong        100.0
Name: Adults, dtype: float64

In [32]:
# A single column taken from a DataFrame is itself a Series.
type(df.Adults)

pandas.core.series.Series

In [33]:
# Two lookups in a row: select the Population column, then the Sydney label within it.
df['Population']['Sydney']

5500000.0

In [34]:
# Same value as the bracket form, using attribute access for the row label.
df['Population'].Sydney

5500000.0

In [35]:
# Derived columns are computed row by row from existing ones.
# Kids is NaN wherever either input is missing.
df['Kids'] = df['Population'].sub(df['Adults'])
# True only for cities with more than 2,000,000 adults; a missing Adults value compares as False.
df['Mature Population'] = df['Adults'].gt(2000000)

In [36]:
# Show the frame with the two new columns, Kids and Mature Population.
df

Unnamed: 0,Adults,Population,Kids,Mature Population
Adelaide,600000.0,1000000.0,400000.0,False
Canberra,,100000.0,,False
Melbourne,2400000.0,5400000.0,3000000.0,True
Perth,700000.0,,,False
Sydney,3500000.0,5500000.0,2000000.0,True
Wingello,,300.0,,False
Wollongong,100.0,,,False


In [37]:
# State for some (not all) of the cities, keyed by city name.
states = pd.Series(
    data=["NSW", "NSW", "VIC", "WA"],
    index=["Sydney", "Wollongong", "Melbourne", "Perth"],
)

In [38]:
# Assigning a Series as a column lines it up with the frame's row labels;
# cities that are not in `states` are left without a State value.
df['State'] = states

In [39]:
# Show the frame with the new State column (empty for Adelaide, Canberra and Wingello).
df

Unnamed: 0,Adults,Population,Kids,Mature Population,State
Adelaide,600000.0,1000000.0,400000.0,False,
Canberra,,100000.0,,False,
Melbourne,2400000.0,5400000.0,3000000.0,True,VIC
Perth,700000.0,,,False,WA
Sydney,3500000.0,5500000.0,2000000.0,True,NSW
Wingello,,300.0,,False,
Wollongong,100.0,,,False,NSW


In [40]:
# Total adults per state. Rows with no State do not belong to any group,
# so they are absent from the result.
df.groupby('State')['Adults'].sum()

State
NSW    3500100.0
VIC    2400000.0
WA      700000.0
Name: Adults, dtype: float64

In [41]:
# Quantile-based binning: split Population into 3 bins holding roughly equal
# numbers of cities. Rows with a missing Population stay NaN.
pd.qcut(df['Population'], q=3)

Adelaide       (400000, 3933333.333]
Canberra               [300, 400000]
Melbourne     (3933333.333, 5500000]
Perth                            NaN
Sydney        (3933333.333, 5500000]
Wingello               [300, 400000]
Wollongong                       NaN
Name: Population, dtype: category
Categories (3, object): [[300, 400000] < (400000, 3933333.333] < (3933333.333, 5500000]]

In [42]:
# How many cities fall into each of the three quantile bins.
pd.qcut(df['Population'], q=3).value_counts()

(3933333.333, 5500000]    2
[300, 400000]             2
(400000, 3933333.333]     1
Name: Population, dtype: int64

In [43]:
# IPython help syntax (not plain Python): shows the documentation for pd.qcut.
# NOTE(review): exploratory leftover with no saved output; consider removing it from the finished notebook.
pd.qcut?

In [44]:
# Equal-width binning: split the Population range into 10 intervals of the same
# width, regardless of how many cities land in each. Missing values stay NaN.
pd.cut(df['Population'], bins=10)

Adelaide       (550270, 1100240]
Canberra       (-5199.7, 550270]
Melbourne     (4950030, 5500000]
Perth                        NaN
Sydney        (4950030, 5500000]
Wingello       (-5199.7, 550270]
Wollongong                   NaN
Name: Population, dtype: category
Categories (10, object): [(-5199.7, 550270] < (550270, 1100240] < (1100240, 1650210] < (1650210, 2200180] ... (3300120, 3850090] < (3850090, 4400060] < (4400060, 4950030] < (4950030, 5500000]]

In [45]:
# Cities per equal-width bin; most of the ten bins are empty for this data.
pd.cut(df['Population'], bins=10).value_counts()

(4950030, 5500000]    2
(-5199.7, 550270]     2
(550270, 1100240]     1
(4400060, 4950030]    0
(3850090, 4400060]    0
(3300120, 3850090]    0
(2750150, 3300120]    0
(2200180, 2750150]    0
(1650210, 2200180]    0
(1100240, 1650210]    0
Name: Population, dtype: int64

In [46]:
# Population totals keyed by state code.
state_populations = pd.Series(
    data=[30000000, 20000000],
    index=['NSW', 'VIC'],
)

In [47]:
# The index (row labels) of the Series, here the state codes.
state_populations.index

Index([u'NSW', u'VIC'], dtype='object')

In [49]:
# Turn the Series into a one-column DataFrame. The Series has no name, so the
# column is labelled 0.
# (The variable name is spelled "dateframe"; it is kept because later cells use it.)
state_dateframe = state_populations.to_frame()

In [50]:
# Show the resulting frame: state codes as the index and one column labelled 0.
state_dateframe

Unnamed: 0,0
NSW,30000000
VIC,20000000
