In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lab 4

## Table Review: Welcome Survey

In [None]:
welcome = Table.read_table('welcome_survey_v1.csv')
welcome.show(5)

In [None]:
# Average hours of sleep among respondents whose 'Sleep position' contains 'side'.
side_sleepers = welcome.where('Sleep position', are.containing('side'))
side_hours = side_sleepers.column('Hours of sleep')
np.average(side_hours)

In [None]:
# How many students get at least 8 hours of sleep each night (on average)?
# First way:
welcome.where('Hours of sleep', are.above_or_equal_to(8)).num_rows

In [None]:
# Second way
np.count_nonzero(welcome.column('Hours of sleep') >= 8)

In [None]:
# Third way
np.sum(welcome.column('Hours of sleep') >= 8)

In [None]:
# Build a table holding only the two sleep-related columns,
# renamed to 'Hours' and 'Position'.
two_col = welcome.select('Hours of sleep', 'Sleep position')
sleep = (
    two_col
    .relabeled('Hours of sleep', 'Hours')
    .relabeled('Sleep position', 'Position')
)
sleep.show(3)

In [None]:
# Second way: drop the columns we don't want instead of selecting the ones we do.
# NOTE(review): the index-based relabeling below assumes that, once these three
# columns are dropped, 'Hours of sleep' is column 0 and 'Sleep position' is
# column 1 -- confirm against the column order of the CSV.
two_col = welcome.drop('Handedness', 'Extraversion', 'Pant leg')
sleep = two_col.relabeled(0, 'Hours').relabeled(1, 'Position')
sleep.show(3)

## Discussion Question: NBA Salaries

In [None]:
nba = Table.read_table('nba_salaries.csv')
nba = nba.relabeled(3, 'SALARY').drop('TEAM')
nba.show(3)

In [None]:
# Question (a)
# Keep the rows whose POSITION is 'PG', then those with SALARY above 15,
# and return the PLAYER column of what is left.
guards = nba.where('POSITION', are.equal_to('PG'))
well_paid_guards = guards.where('SALARY', are.above(15))
well_paid_guards.column('PLAYER')

In [None]:
# Question (b)
# NOTE: the result of drop() is not assigned to a name here. Table.drop
# returns a new table, so `nba` on the next line is the table as it was
# before this cell ran.
nba.drop('POSITION')
nba.num_columns

## Census ##

In [None]:
full = Table.read_table('nc-est2014-agesex-res.csv')
full

In [None]:
partial = full.select('SEX', 'AGE', 'CENSUS2010POP', 'POPESTIMATE2014')
partial.show(4)

In [None]:
simple = partial.relabeled(2, '2010').relabeled(3, '2014')
simple.show(4)

In [None]:
simple.sort('AGE')

In [None]:
simple.sort('AGE', descending=True)

## Visualization ##

In [None]:
no_999 = simple.where('AGE', are.below(999))
everyone = no_999.where('SEX', 0).drop('SEX')

In [None]:
everyone

In [None]:
everyone.plot('AGE', '2010')

## Census ##

In [None]:
full = Table.read_table('nc-est2014-agesex-res.csv')
full

In [None]:
# Keep only the columns we care about
partial = full.select('SEX', 'AGE', 'POPESTIMATE2010', 'POPESTIMATE2014')
partial

In [None]:
# Make things easier to read
simple = partial.relabeled(2, '2010').relabeled(3, '2014')
simple

In [None]:
# Sort by age
simple.sort('AGE')

In [None]:
# Sort by age in descending order (largest AGE values first)
simple.sort('AGE', descending=True)

## Line Plots ##

In [None]:
# Remove the age totals
no_999 = simple.where('AGE', are.below(999))

In [None]:
# Remove male and female (keep only combined)
everyone = no_999.where('SEX', 0).drop('SEX')

In [None]:
everyone

In [None]:
everyone.plot('AGE', '2010')

In [None]:
# ^^ That plot should be labeled! Here are 3 ways to label it:

In [None]:
# US Population  <--- Just add a comment

everyone.plot('AGE', '2010')

In [None]:
everyone.plot('AGE', '2010')
print('US Population')  # <--- Print out what it is

In [None]:
everyone.plot('AGE', '2010')
plots.title('US Population');    # <--- OPTIONAL; not needed for Data 8

In [None]:
# Age distribution for two different years
everyone.plot('AGE')

## Males and Females in 2014 ##

In [None]:
# Let's compare male and female counts per age
males = no_999.where('SEX', 1).drop('SEX')
females = no_999.where('SEX', 2).drop('SEX')

In [None]:
# Combine the AGE column with the 2014 counts from the male and female tables.
pop_2014 = (
    Table()
    .with_column('Age', males.column('AGE'))
    .with_column('Males', males.column('2014'))
    .with_column('Females', females.column('2014'))
)
pop_2014

In [None]:
pop_2014.plot('Age')

In [None]:
# Percent female at each age: females as a share of (males + females).
male_counts = pop_2014.column('Males')
female_counts = pop_2014.column('Females')
total = male_counts + female_counts
pct_female = (female_counts / total) * 100
pct_female

In [None]:
# Round to 3 decimal places so that it's easier to read
pct_female = np.round(pct_female, 3)
pct_female

In [None]:
# Add female percent to our table
pop_2014 = pop_2014.with_column('Percent female', pct_female)
pop_2014

In [None]:
pop_2014.plot('Age', 'Percent female')

In [None]:
# ^^ Look at the y-axis! Trend is not as dramatic as you might think
pop_2014.plot('Age', 'Percent female')
plots.ylim(0, 100);  # Optional for Data 8

## Scatter Plots ##

In [None]:
# Actors and their highest grossing movies
actors = Table.read_table('actors.csv')
actors

In [None]:
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
actors.scatter('Number of Movies', 'Average per Movie')

In [None]:
actors.where('Average per Movie', are.above(400))

## Bar Charts ##

In [None]:
# Highest grossing movies as of 2017
top_movies = Table.read_table('top_movies_2017.csv')
top_movies

In [None]:
top10_adjusted = top_movies.take(np.arange(10))
top10_adjusted

In [None]:
# Convert to millions of dollars for readability
millions = np.round(top10_adjusted.column('Gross (Adjusted)') / 1000000, 3)
top10_adjusted = top10_adjusted.with_column('Millions', millions)
top10_adjusted

In [None]:
# A line plot doesn't make sense here: don't do this!
top10_adjusted.plot('Year', 'Millions')

In [None]:
top10_adjusted.barh('Title', 'Millions')

In [None]:
# Generate the chart shown in the slides:
# bar chart of age (# years since release) for the 10 highest grossing movies (non-adjusted)

## Categorical Distribution ##

In [None]:
top_movies = Table.read_table('top_movies_2017.csv')
top_movies

In [None]:
# Add the 'Gross' column expressed in millions (rounded to 3 decimal places),
# then draw a bar chart of the first ten rows of the table.
gross_millions = np.round(top_movies.column('Gross') / 1000000, 3)
top_movies = top_movies.with_column('Millions', gross_millions)
first_ten = top_movies.take(np.arange(10))
first_ten.barh('Title', 'Millions')

In [None]:
studios = top_movies.select('Studio')
studios

In [None]:
studio_distribution = studios.group('Studio')

In [None]:
studio_distribution

In [None]:
sum(studio_distribution.column('count'))

## Bar Charts ##

In [None]:
studio_distribution.barh('Studio')

In [None]:
studio_distribution.sort('count', descending=True).barh('Studio')

## Numerical Distribution ##

In [None]:
# Years elapsed between each movie's release 'Year' and 2019.
# NOTE(review): 2019 is hard-coded; presumably the year this notebook was
# written -- update it (or compute the current year) if the ages should be current.
ages = 2019 - top_movies.column('Year')
top_movies = top_movies.with_column('Age', ages)

In [None]:
top_movies

## Binning ##

In [None]:
min(ages), max(ages)

In [None]:
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 100)

In [None]:
binned_data = top_movies.bin('Age', bins = my_bins)
binned_data

In [None]:
sum(binned_data.column('Age count'))

In [None]:
top_movies.bin('Age', bins = np.arange(0, 101, 25))

In [None]:
top_movies.bin('Age', bins = np.arange(0, 60, 25))

In [None]:
top_movies.where('Age', 50)

## Histograms ##

In [None]:
my_bins

In [None]:
binned_data

In [None]:
# Let's make our first histogram!
top_movies.hist('Age', bins = my_bins, unit = 'Year')

In [None]:
# Let's try equally spaced bins instead.
top_movies.hist('Age', bins = np.arange(0, 110, 10), unit = 'Year')

In [None]:
# Let's try not specifying any bins!
top_movies.hist('Age', unit='Year')

In [None]:
# Add a column containing what percent of the binned movies are in each bin.
# The denominator is the total of the bin counts rather than a hard-coded 200,
# so the percentages still add up to 100 if the data file or the bins change.
age_counts = binned_data.column('Age count')
binned_data = binned_data.with_column(
    'Percent', 100 * age_counts / sum(age_counts))

In [None]:
binned_data

## Height ##

### Question: What is the height of the [40, 65) bin?

In [None]:
# Step 1: Calculate % of movies in the [40, 65) bin
percent = binned_data.where('bin', 40).column('Percent').item(0)

In [None]:
# Step 2: Calculate the width of the 40-65 bin
width = 65 - 40

In [None]:
# Step 3: The area of a histogram bar is the percent of entries in its bin, so
#         area of rectangle = height * width
#         --> height = percent / width
height = percent / width
height

### What are the heights of the rest of the bins?

In [None]:
# Get the bin lefts: keep every row of binned_data except the last one.
# NOTE(review): presumably the last row only marks the right end of the final
# bin (and so is not the left end of any bin) -- confirm against Table.bin.
bin_lefts = binned_data.take(np.arange(binned_data.num_rows - 1))

In [None]:
# Get the bin widths
bin_widths = np.diff(binned_data.column('bin'))
bin_lefts = bin_lefts.with_column('Width', bin_widths)

In [None]:
# Get the bin heights
bin_heights = bin_lefts.column('Percent') / bin_widths
bin_lefts = bin_lefts.with_column('Height', bin_heights)

In [None]:
bin_lefts

In [None]:
top_movies.hist('Age', bins = my_bins, unit = 'Year')

## Visualization Review: Welcome Survey ##

In [None]:
survey = Table.read_table('welcome_survey_v1.csv')
survey

### Categorical Bar Charts

In [None]:
handedness = survey.group('Handedness')
handedness

In [None]:
handedness.barh('Handedness')

### Numerical Histograms

In [None]:
survey.hist('Extraversion')

In [None]:
survey.hist('Hours of sleep')

In [None]:
max(survey.column('Hours of sleep'))

In [None]:
sleep_bins = np.arange(4,12,0.5)

In [None]:
# One 'Hours of sleep' histogram per 'Pant leg' group, drawn with the same
# bins so the two charts can be compared, each with its own title.
for leg, title in (('Right', 'Right Leg First'), ('Left', 'Left Leg First')):
    leg_group = survey.where('Pant leg', are.containing(leg))
    leg_group.hist('Hours of sleep', bins=sleep_bins)
    plots.title(title)

In [None]:
survey.hist('Hours of sleep', bins=sleep_bins)

In [None]:
survey.bin('Hours of sleep', bins=make_array(0,8,15))

In [None]:
# NOTE(review): 417 and 915 look like counts copied by hand from the table
# produced by the previous cell -- confirm, and consider reading them from
# that table so the percentage stays correct if the survey data changes.
417/(417 + 915) * 100

## Histogram Review: Heights and Heredity ##

In [None]:
galton = Table.read_table('galton.csv')
galton

In [None]:
galton.hist('midparentHeight')

In [None]:
galton.hist('childHeight')

In [None]:
galton.hist('midparentHeight', 'childHeight')

## Functions ##

In [None]:
def triple(x):
    """Return 3 * x, whatever multiplying by 3 means for the type of x."""
    tripled = 3 * x
    return tripled

In [None]:
triple(3)

In [None]:
num = 4

In [None]:
triple(num)

In [None]:
triple(num * 5)

### Note About Scopes

In [None]:
# `x` is only the parameter name inside triple(); no `x` has been assigned in
# the notebook so far, so looking it up here raises NameError. Catch and print
# the error so the demonstration does not halt a "Restart & Run All".
try:
    x
except NameError as error:
    print('NameError:', error)

In [None]:
x = 5

In [None]:
triple(2 * x)

In [None]:
x

### Type Agnostic

In [None]:
triple('ha')

In [None]:
triple(np.arange(4))

### Discussion Question

In [None]:
def percent_of_total(s):
    """Return each entry of s as a percentage of the sum of s.

    s may be a NumPy array or a plain sequence of numbers (list, tuple); it is
    converted with np.asarray first so that the division works for plain
    sequences as well as arrays. The result is an array of percentages rounded
    to 2 decimal places.
    """
    values = np.asarray(s)
    return np.round(values / sum(values) * 100, 2)

In [None]:
percent_of_total(make_array(1,2,3,4))

In [None]:
percent_of_total(make_array(1, 213, 38))

### Multiple Arguments

$ h^2 = x^2 + y^2 \hspace{20pt} \Rightarrow \hspace{20pt} h = \sqrt{ x^2 + y^2 } $

In [None]:
def hypotenuse(x, y):
    """Return the hypotenuse length of a right triangle with legs x and y.

    Computed as the square root of x squared plus y squared.
    """
    return (x ** 2 + y ** 2) ** 0.5

In [None]:
hypotenuse(9, 12)

In [None]:
hypotenuse(2, 2)

## Apply ##

In [None]:
ages = Table().with_columns(
    'Person', make_array('Jim', 'Pam', 'Michael', 'Creed'),
    'Birth Year', make_array(1985, 1988, 1967, 1904)
)
ages

In [None]:
def cap_at_1980(x):
    """Return x unless it is greater than 1980, in which case return 1980."""
    if 1980 < x:
        return 1980
    return x

In [None]:
cap_at_1980(1975)

In [None]:
cap_at_1980(1991)

In [None]:
ages.apply(cap_at_1980, 'Birth Year')

In [None]:
def name_and_age(name, year, current_year=2019):
    """Return a string of the form '<name> is <age>'.

    name: the person's name (a string).
    year: the person's birth year.
    current_year: the year the age is measured from. Defaults to 2019, the
        value that used to be hard-coded, so existing two-argument calls
        (including Table.apply with two columns) behave exactly as before.
    """
    age = current_year - year
    return name + ' is ' + str(age)

In [None]:
ages.apply(name_and_age, 'Person', 'Birth Year')