In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture for Week 3 ##

Note that you can always review the [datascience library documentation](http://data8.org/datascience/) to see what the options are for using the methods described here and in the book. 

## Minard's Visualization

[Charles Minard](https://en.wikipedia.org/wiki/Charles_Joseph_Minard) created the following visualization of Napoleon's Russian campaign.

<img src="minard.png">


In [None]:
# Load the Minard campaign data from the CSV file, then display the table.
minard = Table.read_table('minard.csv')
minard

### Selecting data from a column

What's the difference between the following two calls? What do they do? What is the resulting type?

In [None]:
# Per the datascience docs, select returns a new Table holding only the named column.
minard.select('Survivors')

In [None]:
# Per the datascience docs, column returns the column's values as an array rather than a Table.
minard.column('Survivors')

How can we add a column to the table with the percentage of survivors remaining?

## Working with Census Data

U.S. Constitution. Article I, Section 2
> Representatives and direct Taxes shall be apportioned among the several States which may be included within this Union, according to their respective Numbers . . . The actual Enumeration shall be made within three Years after the first Meeting of the Congress of the United States, and within every subsequent Term of ten Years, in such Manner as they shall by Law direct.

The Census Bureau estimates the population in intervening years.

In [None]:
# Read the complete age/sex estimates file; later cells select the columns they need from `full`.
full = Table.read_table('nc-est2015-agesex-res.csv')
full

In [None]:
# Keep only the columns needed here and give the two population columns short labels.
t = (
    full.select('SEX', 'AGE', 'CENSUS2010POP', 'POPESTIMATE2015')
        .relabeled('POPESTIMATE2015', '2015')
        .relabeled('CENSUS2010POP', '2010')
)
# Format the '2015' column for display (set_format modifies `t` in place).
t.set_format('2015', NumberFormatter)

# Split by SEX code (1 -> males, 2 -> females), keeping only rows with AGE below 999,
# then drop the SEX column, which is constant within each split.
males = t.where('AGE', are.below(999)).where('SEX', 1).drop('SEX')
females = t.where('AGE', are.below(999)).where('SEX', 2).drop('SEX')

# The two tables are combined column-by-column below, so their rows must line up by age.
assert np.array_equal(males.column('AGE'), females.column('AGE')), \
    'male and female rows are not aligned by AGE'

# with_columns (plural) is the method that accepts several label/values pairs;
# with_column adds a single column and rejects the extra arguments in current releases.
pop_2015 = Table().with_columns(
    'Age', males.column('AGE'),
    'Male', males.column('2015'),
    'Female', females.column('2015'),
)
pop_2015

## Categorical Distribution ##

In [None]:
# Read the top-movies data from CSV and display the table.
top = Table.read_table('top_movies_2017.csv')
top

How many of the top-grossing movies did each studio release?

In [None]:
# Keep only the 'Studio' column; this one-column table is what gets grouped next.
studios = top.select('Studio')
studios

In [None]:
# Group rows by 'Studio'; per the datascience docs, group without a collect function counts the rows per distinct value.
studio_distribution = studios.group('Studio')

In [None]:
# Display the grouped table.
studio_distribution

## Bar Charts ##

In [None]:
# Horizontal bar chart of the grouped table, using the 'Studio' column for the bar labels.
studio_distribution.barh('Studio')

## Numerical Distribution ##

In [None]:
# Age of each movie in years, measured from 2018 (element-wise arithmetic on the 'Year' column).
ages = 2018 - top.column('Year')
# Rebind `top` to a table that also carries the 'Age' column.
top = top.with_column('Age', ages)

In [None]:
# Display the table, now including the 'Age' column.
top

## Binning ##

In [None]:
# Smallest and largest movie age, shown as a (min, max) pair.
(min(ages), max(ages))

In [None]:
# Hand-picked bin edges for 'Age'; note that the bins have unequal widths.
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 100)

In [None]:
# Tabulate the 'Age' values using the custom bin edges in my_bins.
top.bin('Age', bins=my_bins)

## Histograms ##

In [None]:
# Show the bin edges again before using them for the histograms.
my_bins

In [None]:
# Histogram of 'Age' over the custom bins; unit names the units of the plotted column.
top.hist('Age', bins=my_bins, unit='Year')

In [None]:
# Same bins, with normed=False passed through to hist; presumably this plots raw counts
# instead of normalized heights. NOTE(review): confirm with the installed datascience/matplotlib versions.
top.hist('Age', bins=my_bins, unit='Year', normed=False)

In [None]:
# Histogram with equal-width bins: np.arange yields edges 0, 10, ..., 100 (the stop value 110 is excluded).
top.hist('Age', bins=np.arange(0, 110, 10), unit='Year')