In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
%matplotlib inline

In [2]:
# Load the Stata extract; the path is relative to the notebook's working directory.
# NOTE(review): the file name looks like an IPUMS USA extract — confirm provenance.
df = pd.read_stata('usa_00021.dta')

In [3]:
# (rows, columns) of the loaded extract.
df.shape

(211012, 16)

In [4]:
# Preview the first rows to check the column names and how the labelled values were decoded.
df.head()

Unnamed: 0,year,datanum,serial,hhwt,statefip,gq,pernum,perwt,sex,age,bpl,bpld,citizen,yrsusa2,educ,educd
0,1980,1,192105,20,california,households under 1970 definition,1,20,male,47,vietnam,vietnam,not a citizen,0-5 years,2 years of college,2 years of college
1,1980,1,192105,20,california,households under 1970 definition,2,20,female,42,vietnam,vietnam,not a citizen,0-5 years,grade 10,grade 10
2,1980,1,192105,20,california,households under 1970 definition,3,20,female,18,vietnam,vietnam,not a citizen,0-5 years,grade 12,grade 12
3,1980,1,192264,20,california,households under 1970 definition,1,20,male,33,south america,venezuela,not a citizen,0-5 years,4 years of college,4 years of college
4,1980,1,192264,20,california,households under 1970 definition,2,20,female,32,south america,venezuela,not a citizen,0-5 years,grade 12,"some college, but less than 1 year"


In [10]:
# Distinct detailed-education labels that occur in the 1980 sample.
# `year` is matched against the string '1980', as in the rest of the notebook.
df.loc[df['year'] == '1980', 'educd'].unique()

[2 years of college, grade 10, grade 12, 4 years of college, some college, but less than 1 year, ..., 8+ years of college, grade 7, 7 years of college, kindergarten, nursery school, preschool]
Length: 24
Categories (24, object): [no schooling completed < nursery school, preschool < kindergarten < grade 1 ... 5+ years of college < 6 years of college (6+ in 1960-1970) < 7 years of college < 8+ years of college]

In [15]:
# Every label `educd` can take, across all years. The string thresholds used in the
# comparisons below must be spelled exactly as they appear in this list.
df['educd'].cat.categories

Index(['no schooling completed', 'nursery school to grade 4',
       'nursery school, preschool', 'kindergarten', 'grade 1, 2, 3, or 4',
       'grade 1', 'grade 2', 'grade 3', 'grade 4', 'grade 5, 6, 7, or 8',
       'grade 5 or 6', 'grade 5', 'grade 6', 'grade 7 or 8', 'grade 7',
       'grade 8', 'grade 9', 'grade 10', 'grade 11', 'grade 12',
       '12th grade, no diploma', 'high school graduate or ged',
       'regular high school diploma', 'ged or alternative credential',
       'some college, but less than 1 year', '1 year of college',
       '1 or more years of college credit, no degree', '2 years of college',
       'associate's degree, type not specified',
       'associate's degree, occupational program',
       'associate's degree, academic program', '3 years of college',
       '4 years of college', 'bachelor's degree', '5+ years of college',
       '6 years of college (6+ in 1960-1970)', '7 years of college',
       '8+ years of college', 'master's degree',
       'profes

In [16]:
# Rows whose detailed education is '4 years of college' or any later category.
# The comparison is against the category order of `educd`, not alphabetical order.
df.loc[df['educd'] >= '4 years of college']

Unnamed: 0,year,datanum,serial,hhwt,statefip,gq,pernum,perwt,sex,age,bpl,bpld,citizen,yrsusa2,educ,educd
3,1980,1,192264,20,california,households under 1970 definition,1,20,male,33,south america,venezuela,not a citizen,0-5 years,4 years of college,4 years of college
7,1980,1,192542,20,california,households under 1970 definition,1,20,male,30,south america,venezuela,not a citizen,0-5 years,4 years of college,4 years of college
8,1980,1,192542,20,california,households under 1970 definition,2,20,female,27,south america,venezuela,not a citizen,0-5 years,4 years of college,4 years of college
21,1980,1,193832,20,california,households under 1970 definition,2,20,male,25,saudi arabia,saudi arabia,not a citizen,0-5 years,5+ years of college,5+ years of college
35,1980,1,195093,20,california,households under 1970 definition,1,20,male,33,canada,canada,not a citizen,0-5 years,5+ years of college,6 years of college (6+ in 1960-1970)
36,1980,1,195111,20,california,households under 1970 definition,1,20,male,49,china,china,not a citizen,0-5 years,4 years of college,4 years of college
38,1980,1,195337,20,california,households under 1970 definition,1,20,male,50,philippines,philippines,not a citizen,0-5 years,4 years of college,4 years of college
39,1980,1,195337,20,california,households under 1970 definition,2,20,female,43,philippines,philippines,not a citizen,0-5 years,4 years of college,4 years of college
68,1980,1,196453,20,california,households under 1970 definition,2,20,male,38,syria,syria,not a citizen,0-5 years,5+ years of college,6 years of college (6+ in 1960-1970)
72,1980,1,196860,20,california,households under 1970 definition,2,20,male,35,india,india,not a citizen,0-5 years,5+ years of college,5+ years of college


In [36]:
def _aggregate_education(educd):
    """Collapse detailed `educd` labels into four attainment bands.

    Parameters
    ----------
    educd : pandas.Series
        Ordered categorical of detailed education labels; the threshold
        comparisons below follow its category order.

    Returns
    -------
    pandas.Series
        Object-dtype Series on the same index holding 'Less than HS', 'HS',
        'Some College' or 'BA or More'. Rows matching no band (for example a
        missing `educd`) stay as real NaN.
    """
    hs_first = 'high school graduate or ged'
    hs_last = 'ged or alternative credential'
    ba_first = '4 years of college'

    # The four bands are mutually exclusive, so assignment order does not matter.
    band_masks = [
        ('Less than HS', educd < hs_first),
        ('HS', (educd >= hs_first) & (educd <= hs_last)),
        ('Some College', (educd > hs_last) & (educd < ba_first)),
        ('BA or More', educd >= ba_first),
    ]

    # Start from an object column of real NaN. The earlier np.where(..., np.nan)
    # chain mixed strings with np.nan, which NumPy promotes to a string array,
    # so unmatched rows were labelled with the text 'nan' and would have shown
    # up as a group of their own in the later groupby.
    bands = pd.Series(np.nan, index=educd.index, dtype=object)
    for label, mask in band_masks:
        bands.loc[mask] = label
    return bands


# Built in one step, so no scratch columns are added to (and dropped from) df.
df['agg educd'] = _aggregate_education(df['educd'])

In [38]:
# Summed person weights per (year, detailed birthplace, education band).
immig_educ = (
    df.groupby(by=['year', 'bpld', 'agg educd'])['perwt']
    .sum()
    .to_frame()
)

# Same total without the education split: one row per (year, detailed birthplace).
immig = (
    df.groupby(by=['year', 'bpld'])['perwt']
    .sum()
    .to_frame()
)

In [64]:
# Five birthplaces with the largest summed perwt in the 2016 sample.
(
    immig
    .xs('2016', level='year')
    .sort_values('perwt', ascending=False)
    .head()
)

Unnamed: 0_level_0,perwt
bpld,Unnamed: 1_level_1
china,146135.0
mexico,145667.0
india,117132.0
philippines,60819.0
vietnam,41446.0


In [102]:
# Flatten the (year, bpld, agg educd) index into ordinary columns for Altair.
data = immig_educ.reset_index()

# One x-interval brush shared by both countries: each overview strip hosts it
# and each detail chart takes its x-domain from it.
brush = alt.selection(type='interval', encodings=['x'])


def _detail_and_overview(country, title):
    """Return ``(detail, overview)`` line charts of `perwt` by year for one `bpld` value.

    Parameters
    ----------
    country : value compared against ``data['bpld']`` to select the rows to plot.
    title : str
        Chart title, used on both panels.

    Returns
    -------
    tuple
        ``(detail, overview)``: the 400px-high chart whose x-domain follows the
        shared `brush`, and the 60px-high strip on which the brush is drawn.
    """
    # Everything the two panels share. The x encoding is added per panel because
    # only the detail chart may take its x-domain from the brush.
    base = alt.Chart(data[data['bpld'] == country]).mark_line().encode(
        y = 'perwt',
        color = 'agg educd',
    ).properties(
        title = title,
        width = 600,
    )

    detail = base.encode(
        x = alt.X('year:T', timeUnit='year', scale={'domain': brush.ref()}),
    ).properties(
        height = 400,
    )

    # The overview keeps its full x-range. It used to be cloned from the detail
    # chart, so it inherited the brush-driven scale and rescaled itself to the
    # very selection being drawn on it.
    overview = base.encode(
        x = alt.X('year:T', timeUnit='year'),
    ).properties(
        selection=brush,
        height=60,
    )
    return detail, overview


upper1, lower1 = _detail_and_overview('china', 'China')
upper2, lower2 = _detail_and_overview('mexico', u'México')

alt.hconcat(alt.vconcat(upper1, lower1), alt.vconcat(upper2, lower2))

<VegaLite 2 object>

In [115]:
# Rebuild the per-(year, birthplace, education band) totals and flatten them
# into the long-format frame that make_altair_chart reads.
immig_educ = (
    df.groupby(by=['year', 'bpld', 'agg educd'])['perwt']
    .sum()
    .to_frame()
)
data = immig_educ.reset_index()
def make_altair_chart(country):
    """Build an overview+detail line chart of `perwt` by year for one birthplace.

    Reads the module-level `data` frame, keeps the rows whose `bpld` equals
    `country`, and draws one line per `agg educd` value. The tall upper chart
    shows only the year range selected by dragging on the short lower chart.

    Parameters
    ----------
    country : value compared against ``data['bpld']``
        Its string form, capitalized, is also used as the chart title.

    Returns
    -------
    The vertically concatenated chart (detail above, overview below).
    """
    brush = alt.selection(type='interval', encodings=['x'])

    # Everything the two panels share. The x encoding is added per panel because
    # only the detail chart may take its x-domain from the brush.
    base = alt.Chart(data[data['bpld'] == country]).mark_line().encode(
        y = 'perwt:Q',
        color = 'agg educd',
        order = alt.Order('agg educd:O'),
    ).properties(
        title = str(country).capitalize(),
        width = 600,
    )

    detail = base.encode(
        x = alt.X('year:T', timeUnit='year', scale={'domain': brush.ref()}),
    ).properties(
        height = 400,
    )

    # The overview keeps its full x-range. It used to be cloned from the detail
    # chart, so it inherited the brush-driven scale and rescaled itself to the
    # very selection being drawn on it.
    overview = base.encode(
        x = alt.X('year:T', timeUnit='year'),
    ).properties(
        selection=brush,
        height=60,
    )

    return alt.vconcat(detail, overview)

In [125]:
# Offer every distinct birthplace in `data` as a dropdown option; each selection
# calls make_altair_chart with the chosen value.
interact(make_altair_chart, country = data['bpld'].unique())

interactive(children=(Dropdown(description='country', options=('canada', 'st. pierre and miquelon', 'bermuda',…

<function __main__.make_altair_chart>