In [1]:
# HIDDEN
import warnings
# Ignore numpy dtype warnings. These warnings are caused by an interaction
# between numpy and Cython and can be safely ignored.
# Reference: https://stackoverflow.com/a/40846742
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import nbinteract as nbi

# Apply seaborn's default theme, then select its 'talk' plotting context.
sns.set()
sns.set_context('talk')
# Keep printed numpy arrays compact: summarize arrays with more than 20
# elements, print 2 decimal places, and suppress scientific notation.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Truncate rendered DataFrames to at most 7 rows and 8 columns.
pd.options.display.max_rows = 7
pd.options.display.max_columns = 8
# Use the fully qualified option name. The bare 'precision' pattern is
# ambiguous on newer pandas versions (it also matches
# 'styler.format.precision') and raises OptionError there.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
def df_interact(df, nrows=7, ncols=7):
    '''
    Outputs sliders that page through the rows and columns of df.

    A window of at most `nrows` rows and `ncols` columns is shown. One
    slider moves the window's starting row and another its starting
    column, each in steps of the window size. A dimension that already
    fits inside the window gets no slider and stays pinned at offset 0.

    Parameters:
        df: DataFrame to browse.
        nrows: number of rows shown at once (also the row slider step).
        ncols: number of columns shown at once (also the column slider step).

    Returns nothing; displays the widget and prints the total shape of df.
    '''
    def peek(row=0, col=0):
        return df.iloc[row:row + nrows, col:col + ncols]

    total_rows, total_cols = df.shape

    # The slider's upper bound is the last valid offset (size - 1), not the
    # size itself. Otherwise, when the size is an exact multiple of the
    # step, the final slider stop would start past the end of the data and
    # show an empty frame.
    row_arg = (0, total_rows - 1, nrows) if total_rows > nrows else fixed(0)
    col_arg = (0, total_cols - 1, ncols) if total_cols > ncols else fixed(0)

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(total_rows, total_cols))

## Data Types

In [14]:
# Load data/SFBusinesses/inspections.csv; the bare name on the last line
# renders the frame as the cell's output.
# NOTE(review): the rendered output has an 'Unnamed: 0' column, presumably a
# row index that was saved into the CSV. Consider index_col=0 after
# confirming nothing depends on that column.
scores = pd.read_csv('data/SFBusinesses/inspections.csv')
scores

Unnamed: 0,business_id,score,date,type
0,19,94,20160513,routine
1,19,94,20171211,routine
2,24,98,20171101,routine
...,...,...,...,...
14219,94142,100,20171220,routine
14220,94189,96,20171130,routine
14221,94231,85,20171214,routine


In [10]:
# Load data/SFHousing.csv; the bare name on the last line renders the frame
# as the cell's output.
# NOTE(review): the rendered output has 'Unnamed: 0.1' and 'Unnamed: 0'
# columns, presumably row indexes that were saved into the CSV. Verify
# before dropping them.
housing = pd.read_csv('data/SFHousing.csv')
housing

Unnamed: 0.1,Unnamed: 0,county,city,zip,...,lat,quality,match,wk
0,1,Alameda County,Alameda,94501.0,...,37.76,gpsvisualizer,Exact,2003-04-21
1,2,Alameda County,Alameda,94501.0,...,37.76,QUALITY_ADDRESS_RANGE_INTERPOLATION,Exact,2003-04-21
2,3,Alameda County,Alameda,94501.0,...,37.77,QUALITY_ADDRESS_RANGE_INTERPOLATION,Exact,2003-04-21
...,...,...,...,...,...,...,...,...,...
281503,348191,Sonoma County,Sonoma,95476.0,...,38.28,QUALITY_CITY_CENTROID,Exact,2006-05-29
281504,348192,Sonoma County,Windsor,95492.0,...,38.55,QUALITY_EXACT_PARCEL_CENTROID,Relaxed; Soundex,2006-05-29
281505,348193,Sonoma County,Windsor,95492.0,...,38.54,QUALITY_CITY_CENTROID,Exact,2006-05-29


In [8]:
# The file is whitespace-delimited, so split fields on runs of whitespace.
# The pattern is a raw string because '\s' in a plain string literal is an
# invalid escape sequence that Python warns about. The value passed to
# pandas is unchanged.
babies = pd.read_csv('data/babies23.data', delimiter=r'\s+')
babies

Unnamed: 0,id,pluralty,outcome,date,...,inc,smoke,time,number
0,15,5,1,1411,...,1,0,0,0
1,20,5,1,1499,...,4,0,0,0
2,58,5,1,1576,...,2,1,1,1
...,...,...,...,...,...,...,...,...,...
1233,9213,5,1,1672,...,3,1,1,2
1234,9229,5,1,1680,...,1,0,0,0
1235,9263,5,1,1668,...,6,0,0,0


In [4]:
# Print data/babies.readme via a shell escape (relies on a `cat` command
# being available on the system).
!cat data/babies.readme

Variables in data file
1. id - identification number
2. pluralty -  5= single fetus
3. outcome - 1= live birth that survived at least 28 days
4. date - birth date where 1096=January1,1961 
5. gestation - length of gestation in days
6. sex - infant's sex 1=male 2=female 9=unknown
7. wt -  birth weight in ounces (999 unknown)
8. parity - total number of previous pregnancies including
   fetal deaths and still births, 99=unknown
9. race - mother's race 0-5=white 6=mex 7=black 8=asian 9=mixed 99=unknown 
10 age - mother's age in years at termination of pregnancy, 99=unknown
11 ed - mother's education 0= less than 8th grade, 
   1 = 8th -12th grade - did not graduate, 
   2= HS graduate--no other schooling , 3= HS+trade,
  4=HS+some college 5= College graduate, 6&7 Trade school HS unclear, 9=unknown 
12 ht - mother's height in inches to the last completed inch
   99=unknown
13 wt - mother prepregnancy wt in pounds, 999=unknown 
14 drace - father's race, coding same as mot