# Data Exploration

Goals: 
1. Distill the data from American FactFinder into a tidy GeoJSON.

In [1]:
import pandas as pd
import geopandas as gpd
import json

In [2]:
# County-level educational attainment counts (interim, wide format).
educ_csv_path = "../data/interim/ca_educ_by_county.csv"
df = pd.read_csv(educ_csv_path)

In [3]:
# Preview the first five rows of the wide table.
df.head(n=5)

Unnamed: 0,countyfips,No schooling completed,"Nursery school, preschool",Kindergarten,Grade 1,Grade 2,Grade 3,Grade 4,Grade 5,Grade 6,...,"12th grade, no diploma",Regular high school diploma,GED or alternative credential,"Some college, but less than 1 year","1 or more years of college credit, no degree","Associate's degree, type not specified",Bachelor's degree,Master's degree,Professional degree beyond a bachelor's degree,Doctoral degree
0,0,2432.0,0,0,34,218,1517.0,193,272,6776.0,...,3837.0,45842.0,6139.0,11214.0,35140.0,16434.0,26705.0,5550.0,1088.0,234.0
1,1,3124.0,0,57,0,0,0.0,565,219,2462.0,...,5373.0,37447.0,3968.0,12313.0,38387.0,13681.0,100160.0,39661.0,6444.0,6436.0
2,7,201.0,0,0,0,0,0.0,0,0,0.0,...,550.0,5240.0,1009.0,2841.0,8947.0,2889.0,5289.0,795.0,579.0,0.0
3,13,1669.0,0,0,0,152,0.0,0,0,982.0,...,4385.0,18731.0,3930.0,10545.0,28716.0,16107.0,36899.0,11680.0,2341.0,1207.0
4,17,0.0,0,0,0,0,0.0,0,0,0.0,...,425.0,3550.0,104.0,972.0,2845.0,2575.0,5796.0,1043.0,170.0,560.0


In [4]:
# Reshape wide -> long: one row per (county, attainment category).
data = pd.melt(
    df,
    id_vars=['countyfips'],
    var_name='education',
    value_name='count',
)

In [5]:
# Strip thousands separators and convert the counts to float.
# Values are coerced to str first so entries that are already numeric are
# parsed rather than turned into NaN by the .str accessor, and so the cell
# can be re-run after `count` has already become a float column.
data['count'] = (
    data['count']
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype(float)
)

In [6]:
# Preview the first five rows of the long table.
data.head(n=5)

Unnamed: 0,countyfips,education,count
0,0,No schooling completed,2432.0
1,1,No schooling completed,3124.0
2,7,No schooling completed,201.0
3,13,No schooling completed,1669.0
4,17,No schooling completed,0.0


In [7]:
# Distinct attainment labels, in order of first appearance.
pd.unique(data['education'])

array(['No schooling completed', 'Nursery school, preschool',
       'Kindergarten', 'Grade 1', 'Grade 2', 'Grade 3', 'Grade 4',
       'Grade 5', 'Grade 6', 'Grade 7', 'Grade 8', 'Grade 9', 'Grade 10',
       'Grade 11', '12th grade, no diploma',
       'Regular high school diploma', 'GED or alternative credential',
       'Some college, but less than 1 year',
       '1 or more years of college credit, no degree',
       "Associate's degree, type not specified", "Bachelor's degree",
       "Master's degree",
       "Professional degree beyond a bachelor's degree",
       'Doctoral degree'], dtype=object)

In [8]:
# Attainment categories listed from lowest to highest; the position in this
# list is what defines the ranking when it is used as ordered categories.
_pre_grade_levels = [
    'No schooling completed',
    'Nursery school, preschool',
    'Kindergarten',
]
_grades_1_to_11 = ['Grade {}'.format(grade) for grade in range(1, 12)]
_diploma_and_beyond = [
    '12th grade, no diploma',
    'Regular high school diploma',
    'GED or alternative credential',
    'Some college, but less than 1 year',
    '1 or more years of college credit, no degree',
    "Associate's degree, type not specified",
    "Bachelor's degree",
    "Master's degree",
    "Professional degree beyond a bachelor's degree",
    'Doctoral degree',
]

education_values = _pre_grade_levels + _grades_1_to_11 + _diploma_and_beyond

In [9]:
# Make `education` an ordered categorical so levels can be compared with
# <, <=, >= using the ranking given by `education_values`.
ordered_education = pd.Categorical(
    data['education'],
    categories=education_values,
    ordered=True,
)
data['education'] = ordered_education

In [10]:
# Read a previously exported GeoJSON into a plain dict.
# NOTE(review): within the visible notebook `ca_data` is reassigned in the
# export cell before it is ever read, so this load looks unused - confirm.
with open('../data/interim/ca_ba.geojson') as geojson_file:
    ca_data = json.load(geojson_file)

In [11]:
# Load the US county boundary shapefile and keep only rows whose state FIPS
# code is '06'. Note that `df` is rebound here: from this point on it is the
# county GeoDataFrame, not the CSV table loaded earlier.
df = gpd.read_file('../data/interim/county_data/cb_2017_us_county_500k.shp')
# .copy() detaches the filtered rows from the full frame, so later column
# assignments on `df` do not raise SettingWithCopyWarning.
df = df[df['STATEFP'] == '06'].copy()

In [12]:
# Rename by name rather than by position: this keeps working if the column
# order changes and if the cell is re-run after more columns have been added
# to `data` (a positional `data.columns = [...]` would then raise).
data = data.rename(columns={'countyfips': 'COUNTYFP', 'education': 'educd'})

# Zero-pad the county code to three characters so it matches the string
# COUNTYFP key used by the county GeoDataFrame in the later merge.
data['COUNTYFP'] = data['COUNTYFP'].astype(str).str.zfill(3)

data.head()

Unnamed: 0,COUNTYFP,educd,count
0,0,No schooling completed,2432.0
1,1,No schooling completed,3124.0
2,7,No schooling completed,201.0
3,13,No schooling completed,1669.0
4,17,No schooling completed,0.0


In [13]:
# Compute each county's centroid once and store its coordinates as columns.
# NOTE(review): centroids are computed in whatever CRS the shapefile carries;
# if that CRS is geographic (lon/lat) geopandas treats them as approximate -
# confirm that is acceptable for how these points are used.
county_centroids = df['geometry'].centroid

# assign() returns a new frame instead of writing into a possibly sliced
# one, which also makes this cell safe to re-run.
df = df.assign(
    centroid_lon=county_centroids.x,
    centroid_lat=county_centroids.y,
)

df.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry,centroid_lon,centroid_lat
18,6,1,1675839,0500000US06001,6001,Alameda,6,1909616630,216916717,"POLYGON ((-122.342253 37.805558, -122.33411840...",-121.88887,37.646895
19,6,5,1675841,0500000US06005,6005,Amador,6,1539933576,29470568,"POLYGON ((-121.027406 38.50354, -121.027472 38...",-120.65109,38.446392
20,6,13,1675903,0500000US06013,6013,Contra Costa,6,1857310903,225193562,"POLYGON ((-122.42976 37.965405, -122.418592 37...",-121.927786,37.919123
21,6,23,1681908,0500000US06023,6023,Humboldt,6,9241251740,1254039383,"POLYGON ((-124.408601 40.44320099999999, -124....",-123.875629,40.699297
22,6,37,277283,0500000US06037,6037,Los Angeles,6,10510588451,1794793532,"(POLYGON ((-118.604415 33.478552, -118.598783 ...",-118.224817,34.320751


In [14]:
# List every distinct attainment label, one per line, in order of appearance.
for level_name in data['educd'].drop_duplicates():
    print(level_name)

No schooling completed
Nursery school, preschool
Kindergarten
Grade 1
Grade 2
Grade 3
Grade 4
Grade 5
Grade 6
Grade 7
Grade 8
Grade 9
Grade 10
Grade 11
12th grade, no diploma
Regular high school diploma
GED or alternative credential
Some college, but less than 1 year
1 or more years of college credit, no degree
Associate's degree, type not specified
Bachelor's degree
Master's degree
Professional degree beyond a bachelor's degree
Doctoral degree


In [15]:
# Collapse the detailed attainment categories into four coarse levels.
# The inequality comparisons rely on `educd` being an ordered categorical.
educd = data['educd']

below_hs = educd < 'Regular high school diploma'
hs_or_equivalent = educd.isin(
    ['Regular high school diploma', 'GED or alternative credential']
)
some_college_or_aa = (
    (educd >= 'Some college, but less than 1 year')
    & (educd <= "Associate's degree, type not specified")
)
ba_or_higher = educd >= "Bachelor's degree"

level_masks = [
    ('No HS', below_hs),
    ('HS', hs_or_equivalent),
    ('Some college/AA', some_college_or_aa),
    ('B.A.+', ba_or_higher),
]
for level_label, level_mask in level_masks:
    data.loc[level_mask, 'education_level'] = level_label

# Alternative split that separates post-bachelor degrees (disabled):
#data.loc[data['educd'] > "Bachelor's degree", 'education_level'] = 'Advanced degree'

In [16]:
# Sanity check: number of rows that were not assigned a coarse level.
data['education_level'].isna().sum()

0

In [17]:
# Total count per (county, coarse education level).
educ_by_county = data.groupby(['COUNTYFP', 'education_level'])[['count']].sum()

# Each level's share of its county total. transform('sum') broadcasts the
# county total back onto every row while keeping the original two-level
# index; groupby.apply can add an extra index level depending on the pandas
# version, which would break the level-based xs() below.
county_totals = educ_by_county.groupby(level='COUNTYFP').transform('sum')
shares = educ_by_county / county_totals

# Keep only the bachelor's-or-higher share, as a flat COUNTYFP / 'Share of BA'
# table. Built as a chain so the xs() result is never mutated in place.
ba_shares = (
    shares
    .xs('B.A.+', level='education_level')
    .rename(columns={'count': 'Share of BA'})
    .reset_index()
)

In [18]:
# Keep only the columns needed for the export, then attach the B.A.+ share
# to every county. The left join keeps counties that have no share value.
df = df[['STATEFP', 'COUNTYFP', 'GEOID', 'NAME', 'centroid_lon', 'centroid_lat', 'geometry']]

geo_data = df.merge(ba_shares, on='COUNTYFP', how='left')

# Label text: the share as a whole-number percentage (truncated), or an empty
# string when the share is missing or truncates to 0.
# The previous string-based version removed *every* "0" character, so for
# example 30% was rendered as "3%" and 10% as "1%".
whole_pct = (geo_data['Share of BA'].fillna(0) * 100).astype(int)
geo_data['value_text'] = (whole_pct.astype(str) + "%").where(whole_pct != 0, "")
# geo_data['value_text'] = geo_data['NAME'] + ", " + geo_data['value_text']

# Placed last so the preview is actually displayed as the cell output.
geo_data.head()

In [19]:
# Serialize the merged GeoDataFrame to GeoJSON, round-tripping through a dict
# so the file on disk is pretty-printed with a 4-space indent.
geojson_text = geo_data.to_json()
ca_data = json.loads(geojson_text)

with open("../data/interim/ACS_ca_BAs2012-2016.geojson", "w") as out_file:
    out_file.write(json.dumps(ca_data, indent=4))