# Ch6. Working with Data Series and Frames

<div id="toc"></div>

## Unit31_Getting Used to Pandas Data Structures

### Series

In [27]:
### import pandas as pd
# The last value is wrong, we will fix it later!
inflation = pd.Series((2.2, 3.4, 2.8, 1.6, 2.3, 2.7, 3.4, 3.2, 2.8, 3.8,
                       -0.4, 1.6, 3.2, 2.1, 1.5, 1.5))

In [22]:
inflation

0     2.2
1     3.4
2     2.8
3     1.6
4     2.3
5     2.7
6     3.4
7     3.2
8     2.8
9     3.8
10   -0.4
11    1.6
12    3.2
13    2.1
14    1.5
15    1.5
dtype: float64

In [3]:
len(inflation)

16

In [4]:
inflation.index

RangeIndex(start=0, stop=16, step=1)

In [5]:
inflation.values[-1] = 1.6

In [8]:
import numpy as np

In [10]:
inflation = pd.Series({1999 : 2.2, 2014 : 1.6, 2015 : np.nan})
inflation

1999    2.2
2014    1.6
2015    NaN
dtype: float64

In [23]:
inflation.index = pd.Index(range(1999, 2015))
inflation[2015] = np.nan

In [24]:
inflation

1999    2.2
2000    3.4
2001    2.8
2002    1.6
2003    2.3
2004    2.7
2005    3.4
2006    3.2
2007    2.8
2008    3.8
2009   -0.4
2010    1.6
2011    3.2
2012    2.1
2013    1.5
2014    1.5
2015    NaN
dtype: float64

In [16]:
inflation.index.name = "Year"
inflation.name = "%"

In [17]:
inflation

Year
1999    2.2
2000    3.4
2001    2.8
2002    1.6
2003    2.3
2004    2.7
2005    3.4
2006    3.2
2007    2.8
2008    3.8
2009   -0.4
2010    1.6
2011    3.2
2012    2.1
2013    1.5
2014    1.5
2015    NaN
Name: %, dtype: float64

In [25]:
inflation.head()

1999    2.2
2000    3.4
2001    2.8
2002    1.6
2003    2.3
dtype: float64

In [26]:
inflation.tail()

2011    3.2
2012    2.1
2013    1.5
2014    1.5
2015    NaN
dtype: float64

### Frames

In [36]:
alco2009 = pd.DataFrame([(1.20, 0.22, 0.58),
                         (1.31, 0.54, 1.16),
                         (1.19, 0.38, 0.74)],
                         columns=( "Beer" , "Wine" , "Spirits" ),
                         index=( "Alabama" , "Alaska", "Arizona"))

In [37]:
alco2009

Unnamed: 0,Beer,Wine,Spirits
Alabama,1.2,0.22,0.58
Alaska,1.31,0.54,1.16
Arizona,1.19,0.38,0.74


In [None]:
alco2009 = pd.DataFrame({"Beer" : (1.20, 1.31, 1.19, «more rows»),
                         "Wine" : (0.22, 0.54, 0.38, «more rows»),
                         "Spirits" : (0.58, 1.16, 0.74, «more rows»)},
                        index=( "Alabama" , "Alaska" , «more states»))

In [38]:
alco2009[ "Wine" ].head()

Alabama    0.22
Alaska     0.54
Arizona    0.38
Name: Wine, dtype: float64

In [39]:
alco2009.Beer.tail()

Alabama    1.20
Alaska     1.31
Arizona    1.19
Name: Beer, dtype: float64

In [40]:
alco2009[ "Total" ] = 0
alco2009.head()

Unnamed: 0,Beer,Wine,Spirits,Total
Alabama,1.2,0.22,0.58,0
Alaska,1.31,0.54,1.16,0
Arizona,1.19,0.38,0.74,0


## Unit32_Reshaping Data

### Indexing

In [41]:
alco2009.columns.values

array(['Beer', 'Wine', 'Spirits', 'Total'], dtype=object)

In [42]:
alco2009.index.values

array(['Alabama', 'Alaska', 'Arizona'], dtype=object)

In [43]:
alco2009.reset_index().set_index( "Beer" ).head()

Unnamed: 0_level_0,index,Wine,Spirits,Total
Beer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.2,Alabama,0.22,0.58,0
1.31,Alaska,0.54,1.16,0
1.19,Arizona,0.38,0.74,0


In [None]:
alco2009.ix["Nebraska"]

In [None]:
"Samoa" in alco2009.index

### Reindexing

In [45]:
s_states = [state for state in alco2009.index if state[0] == 'S' ] + [ "Samoa" ]
drinks = list(alco2009.columns) + [ "Water" ]
nan_alco = alco2009.reindex(s_states, columns=drinks)

### Hierarchical Indexing

In [None]:
multi = pd.MultiIndex.from_tuples((
    ( "Alabama" , 1977), ( "Alabama" , 1978), ( "Alabama" , 1979), ...,
    ( "Wyoming" , 2009)),
    names=[ "State" , "Year" ])

In [None]:
alco.index = multi

In [None]:
alco.ix['Wyoming'].head()

In [None]:
alco.ix['Wyoming', 1999]

### Stacking and Pivoting

In [None]:
tall_alco = alco.stack()
tall_alco.index.names += [ "Drink" ]
tall_alco.head(10)

In [None]:
wide_alco = alco.unstack()
wide_alco.head()

In [None]:
alco.pivot( "Year" , "State" , "Wine" )

## Unit33_Handling Missing Data

### Deleting Missing Data

In [None]:
nan_alco.dropna(how="all" )

In [None]:
nan_alco.dropna(how="all" , axis=1)

In [None]:
nan_alco.dropna()

### Imputing Missing Data

In [None]:
nan_alco.isnull()

In [None]:
nan_alco.notnull()

In [None]:
sp = nan_alco[ 'Spirits' ] # Selected a column with dirty rows
clean = sp.notnull() # The clean rows
sp[-clean] = sp[clean].mean() # Impute the clean mean into the dirty rows
nan_alco

In [None]:
nan_alco.fillna(0)

In [None]:
nan_alco.fillna(method="ffill" )

### Replacing Values

## Unit34_Combining Data

### Merging

In [None]:
df = pd.merge(df1, df2, on="key" )
df = pd.merge(df1, df2, left_on="key1" , right_on="key2" )

In [None]:
df = pd.merge(alco2009.reset_index(),
              population.reset_index()).set_index( "State" )
df.head()

In [None]:
df = pd.merge(alco2009, population, left_index=True, right_index=True)
df.head()

In [None]:
population.join(alco2009).tail(10)

### Concatenating

In [None]:
pd.concat([alco2009, population], axis=1).tail()

In [None]:
pop_na = pd.concat([population, pop_ca], keys=[ "US" , "CA" ])
pop_na.index.names = ( "Country" , "State" )

### Deleting Duplicates

## Unit35_Ordering and Describing Data

### Sorting and Ranking

In [None]:
population.sort_index().head()

In [None]:
population.sort_values( "Population" ).head()

In [None]:
pop_by_state = population.sort_index()
pop_by_state.rank().head()

### Descriptive Statistics

In [None]:
alco2009.max()

In [None]:
alco2009.min(axis=1)

In [None]:
alco2009.sum()

In [None]:
alco.ix[ 'Hawaii' ].cumsum().head()

In [None]:
alco.ix[ 'Hawaii' ].diff().head()

### Uniqueness, Counting, Membership

In [None]:
dna = "AGTCCGCGAATACAGGCTCGGT"
dna_as_series = pd.Series(list(dna), name="genes" )
dna_as_series.head()

In [None]:
dna_as_series.unique()

In [None]:
dna_as_series.value_counts().sort_index()

In [None]:
valid_nucs = list( "ACGT" )
dna_as_series.isin(valid_nucs).all()

## Unit36_Transforming Data

### Arithmetic Operations

In [None]:
alco[ "Total" ] = alco.Wine + alco.Spirits + alco.Beer
alco.head()

In [None]:
np.log10(alco.Total).head()

In [None]:
dna = "AGTCCGCGAATACAGGCTCGGT"
dna1 = dna.replace( "C" , "" )
dna2 = dna.replace( "T" , "" )
dna_as_series1 = pd.Series(list(dna1), name="genes" ) # No C's
dna_as_series2 = pd.Series(list(dna2), name="genes" ) # No T's
dna_as_series1.value_counts() + dna_as_series2.value_counts()

### Data Aggregation

In [None]:
# We want to group by the "Year" column
alco_noidx = alco.reset_index()
sum_alco = alco_noidx.groupby( "Year" ).sum()
sum_alco.tail()

In [None]:
for year, year_frame in alco_noidx.groupby( "Year" ):
    «do_something(year, year_frame)»

In [None]:
state2reg

In [None]:
alco2009.groupby(state2reg).mean()

### Discretization

In [None]:
cats = pd.cut(alco2009[ 'Wine' ], 3).head()

In [None]:
cats = pd.cut(alco2009[ 'Wine' ], 3, labels=( "Low" , "Moderate" , "Heavy"'))
cats.head()

In [None]:
cats = pd.cut(alco2009[ 'Wine' ], 3, labels=False).head()

In [None]:
quants = pd.qcut(alco2009[ 'Wine' ], 3, labels=( "Low" , "Moderate" , "Heavy" ))
quants.head()

In [None]:
pd.get_dummies(state2reg).sort_index().head()

### Mapping

In [None]:
with_state = alco2009.reset_index()
abbrevs = with_state[ "State" ].map( lambda x: x[:3].upper())
abbrevs.head()

### Cross-Tabulation

In [None]:
wine_state = alco2009[ "Wine" ] > alco2009[ "Wine" ].mean()
beer_state = alco2009[ "Beer" ] > alco2009[ "Beer" ].mean()
pd.crosstab(wine_state, beer_state)

## Unit37_Taming Pandas File I/O

### Reading CSV and Tabular Files

In [None]:
Northeast,New England,Connecticut
,,Maine
,,Massachusetts
,,New Hampshire
,,Rhode Island
,,Vermont
Northeast,Mid-Atlantic,New Jersey
,,New York
,,Pennsylvania
«more states»

In [None]:
regions = pd.read_csv( "code/regions.csv" ,
                      header=None,
                      names=( "region" , "division" , "state" ))
state2reg_series = regions.ffill().set_index( "state" )[ "region" ]
state2reg_series.head()

In [None]:
tate2reg = state2reg_series.to_dict()

### Chunking

In [None]:
chunker = pd.read_csv( "code/regions_clean.csv" , chunksize=5,
                      header=None, names=( "region" , "division" , "state" ))
accum = pd.Series()

for piece in chunker:
    counts = piece[ "region" ].value_counts()
    accum = accum.add(counts, fill_value=0)
    
accum

### Reading Other Files

## Your Turn

* http://vincentarelbundock.github.io/Rdatasets/csv/datasets/lynx.csv  
* http://en.wikipedia.org/wiki/List_of_countries_by_alcohol_consumption_per_capita  
* http://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita  
* http://www.ncdc.noaa.gov/cdo-web/  