In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Apply

In [None]:
staff = Table().with_columns(
    'Employee', make_array('Jim', 'Dwight', 'Michael', 'Creed'),
    'Birth Year', make_array(1985, 1988, 1967, 1904)
)
staff

In [None]:
def greeting(person):
    """Return the phone greeting for `person`.

    person: the name to append to the fixed greeting prefix (a string).
    """
    prefix = 'Dunder Mifflin, this is '
    return prefix + person

In [None]:
greeting('Pam')

In [None]:
greeting('Erin')

In [None]:
staff.apply(greeting, 'Employee')

In [None]:
def name_and_age(name, year, current_year=2019):
    """Return a sentence stating how old `name` is in `current_year`.

    name: the person's name (a string).
    year: the person's birth year.
    current_year: the year in which the age is computed. Defaults to 2019,
        the value that was previously hard-coded, so existing two-argument
        calls (including Table.apply with two columns) behave as before.
    """
    age = current_year - year
    return name + ' is ' + str(age)

In [None]:
staff.apply(name_and_age, 'Employee', 'Birth Year')

## Prediction ##

In [None]:
galton = Table.read_table('galton.csv')
galton

In [None]:
galton.scatter('midparentHeight', 'childHeight')

In [None]:
galton.scatter('midparentHeight', 'childHeight')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);

In [None]:
nearby = galton.where('midparentHeight', are.between(67.5, 68.5))
nearby_mean = nearby.column('childHeight').mean()
nearby_mean

In [None]:
galton.scatter('midparentHeight', 'childHeight')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
plots.scatter(68, nearby_mean, color='red', s=50);

In [None]:
def predict(h):
    """Predict a child's height from a midparent height of `h`.

    The prediction is the mean 'childHeight' over the rows of the global
    `galton` table whose 'midparentHeight' satisfies
    are.between(h - 1/2, h + 1/2).
    """
    lower, upper = h - 1/2, h + 1/2
    in_window = are.between(lower, upper)
    return galton.where('midparentHeight', in_window).column('childHeight').mean()

In [None]:
predict(68)

In [None]:
predict(70)

In [None]:
predict(73)

In [None]:
predicted_heights = galton.apply(predict, 'midparentHeight')
predicted_heights

In [None]:
galton = galton.with_column('predictedHeight', predicted_heights)

In [None]:
galton.select(
    'midparentHeight', 'childHeight', 'predictedHeight').scatter('midparentHeight')

## Prediction Accuracy ##

In [None]:
def difference(x, y):
    """Return `x` minus `y` (the order of the arguments sets the sign)."""
    delta = x - y
    return delta

In [None]:
pred_errs = galton.apply(difference, 'predictedHeight', 'childHeight')
pred_errs

In [None]:
galton = galton.with_column('errors',pred_errs)
galton

In [None]:
galton.hist('errors')

In [None]:
galton.hist('errors', group='gender')

## Discussion Question ##

In [None]:
def predict_smarter(h, g):
    """Predict a child's height from midparent height `h` and gender `g`.

    Restricts the global `galton` table to rows whose 'midparentHeight'
    satisfies are.between(h - 1/2, h + 1/2) and whose 'gender' equals `g`,
    then returns the mean 'childHeight' of those rows.
    """
    height_window = are.between(h - 1/2, h + 1/2)
    matches = galton.where('midparentHeight', height_window).where('gender', g)
    return matches.column('childHeight').mean()

In [None]:
predict_smarter(68, 'female')

In [None]:
predict_smarter(68, 'male')

In [None]:
smarter_predicted_heights = galton.apply(predict_smarter, 'midparentHeight', 'gender')
galton = galton.with_column('smartPredictedHeight', smarter_predicted_heights)

In [None]:
# Use the same sign convention as the 'errors' column (predicted - actual)
# so the 'errors' and 'smartErrors' histograms are directly comparable.
smarter_pred_errs = galton.apply(difference, 'smartPredictedHeight', 'childHeight')
galton = galton.with_column('smartErrors', smarter_pred_errs)

In [None]:
galton.hist('smartErrors', group='gender')

## Grouping by One Column ##

In [None]:
cones = Table.read_table('cones.csv')

In [None]:
cones

In [None]:
cones.group('Flavor')

In [None]:
cones.drop('Color').group('Flavor', np.average)

In [None]:
cones.drop('Color').group('Flavor', min)

## Grouping By One Column: Welcome Survey ##

In [None]:
survey = Table.read_table('welcome_survey_v2.csv')

In [None]:
survey.group('Year', np.average)

In [None]:
by_extra = survey.group('Extraversion', np.average)
by_extra

In [None]:
by_extra.select(0,2,3).plot('Extraversion') # Drop the 'Year average' column (index 1)

In [None]:
by_extra.select(0,3).plot('Extraversion')

## Lists

In [None]:
[1, 5, 'hello', 5.0]

In [None]:
[1, 5, 'hello', 5.0, make_array(1,2,3)]

## Grouping by Two Columns ##

In [None]:
survey = Table.read_table('welcome_survey_v3.csv')

In [None]:
survey.group(['Handedness','Sleep position']).show()

## Pivot Tables

In [None]:
survey.pivot('Sleep position', 'Handedness')

In [None]:
survey.pivot('Sleep position', 'Handedness', values='Extraversion', collect=np.average)

In [None]:
survey.group('Handedness', np.average)

## Lists

In [None]:
simple_list = ['hello', 7, 3.14, True]
simple_list

In [None]:
my_array = make_array(1, 2, 3)

crowded_list = [my_array, 'what is going on', -10]
crowded_list

In [None]:
Table().with_columns('Numbers', [1, 2, 3])

In [None]:
drinks = Table(['Drink', 'Cafe', 'Price'])
drinks

In [None]:
drinks = drinks.with_rows([
    ['Milk Tea', 'Asha', 5.5],
    ['Espresso', 'Strada',  1.75],
    ['Latte',    'Strada',  3.25],
    ['Espresso', "FSM",   2]
])
drinks

## Grouping by one column

In [None]:
survey = Table.read_table('welcome_survey_v4.csv')
survey.show(3)

In [None]:
survey.group('Sleep position').show()

In [None]:
survey.group('Sleep position', np.average)

In [None]:
survey.select('Sleep position', 'Hours of sleep').group('Sleep position', np.average)

## Cross-classification: grouping by two columns

In [None]:
survey.group(['Handedness','Sleep position']).show()

In [None]:
survey.pivot('Sleep position', 'Handedness')

In [None]:
survey.pivot('Sleep position', 'Handedness', 'Hours of sleep', np.average)

In [None]:
(survey.select('Handedness', 'Sleep position', 'Hours of sleep')
       .group(['Handedness','Sleep position'], np.average)).show()

In [None]:
# Here, pivot doesn't know how to combine all the hours of sleep
# for each subgroup of students: `values` is given without `collect`.
# The error is the point of this cell, so catch and show it instead of
# letting it stop a "Restart & Run All" before the later sections.
try:
    survey.pivot('Sleep position', 'Handedness', 'Hours of sleep')
except TypeError as err:
    print('TypeError:', err)

## Challenge Question ##

In [None]:
sky = Table.read_table('skyscrapers_v2.csv')
sky = (sky.with_column('age', 2020 - sky.column('completed'))
          .drop('completed'))
sky.show(3)

In [None]:
# 1. For each city, what’s the tallest building for each material?










In [None]:
# 2. For each city, what’s the height difference between the tallest 
#    steel building and the tallest concrete building?












Don't read ahead until you try the challenge questions yourself first!

In [None]:
sky.select('material', 'city', 'height').group(['city', 'material'], max)

In [None]:
sky_p = sky.pivot('material', 'city', 'height', max)
sky_p.show()

In [None]:
# Add a 'difference' column: the absolute gap between each row's 'steel'
# and 'concrete' entries in the pivoted table (abs() discards which is larger).
# NOTE(review): if a city has no building of one material, the gap presumably
# reflects whatever pivot fills in for that cell — confirm before interpreting.
sky_p = sky_p.with_column(
    'difference', 
    abs(sky_p.column('steel') - sky_p.column('concrete'))
)
sky_p

In [None]:
# Largest steel-vs-concrete gap first.
sky_p.sort('difference', descending=True)

### Take-home question: try it here!

In [None]:
# Generate a table of the names of the oldest buildings for each 
# material for each city:



## Joins ##

In [None]:
drinks

In [None]:
discounts = Table().with_columns(
    'Coupon % off', make_array(10, 25, 5),
    'Location', make_array('Asha', 'Strada', 'Asha')
)
discounts

In [None]:
combined = drinks.join('Cafe', discounts, 'Location')
combined

In [None]:
discounted_frac = 1 - combined.column('Coupon % off') / 100
combined.with_column(
    'Discounted Price', 
    combined.column('Price') * discounted_frac
)

In [None]:
drinks.join('Cafe', drinks, 'Cafe')