In [1]:
# Programmer: Matt Woodmansee

# Import Dependencies
import pandas as pd


The first step was to read the census dataset and the merged school crime / performance table, and to check whether the two could be joined on the NTA name.

In [2]:
# Location of the census CSV, relative to this notebook.
census_csv_path = "../Resources/NYC_Nhood ACS2008_12.csv"

# Load the raw census table into a DataFrame.
NYC_df = pd.read_csv(census_csv_path)


Before merging, we found that the NTA name column in the census data (`ntaname`) had to be renamed to `NTA` so that the merge would work correctly.

In [3]:
# Rename the census NTA-name column to "NTA" so it can be used as the merge key.
# Reassign instead of using inplace=True: rename() returns a new frame, which
# keeps the cell chainable and avoids mutating the loaded frame in place.
NYC_df = NYC_df.rename(columns={"ntaname": "NTA"})

In [4]:
# Display the column labels of the census frame to see which fields are available.
NYC_df.columns

Index(['UEMPRATE', 'cartodb_id', 'borocode', 'withssi', 'withsocial',
       'withpubass', 'struggling', 'profession', 'popunemplo', 'poptot',
       'popover18', 'popinlabou', 'poororstru', 'poor', 'pacificune',
       'pacificinl', 'pacific', 'otherunemp', 'otherinlab', 'otherethni',
       'onlyprofes', 'onlymaster', 'onlylessth', 'onlyhighsc', 'onlydoctor',
       'onlycolleg', 'onlybachel', 'okay', 'mixedunemp', 'mixedinlab', 'mixed',
       'master', 'maleunempl', 'maleover18', 'male_pro', 'male_mastr',
       'male_lesHS', 'male_HS', 'male_doctr', 'male_collg', 'male_BA',
       'maleinlabo', 'maledrop', 'male16to19', 'male', 'lessthan10',
       'lessthanhi', 'households', 'hispanicun', 'hispanicin', 'hispanic',
       'highschool', 'field_1', 'femaleunem', 'femaleover', 'fem_profes',
       'fem_master', 'fem_lessHS', 'fem_HS', 'fem_doctor', 'fem_colleg',
       'fem_BA', 'femaleinla', 'femaledrop', 'femal16_19', 'female',
       'europeanun', 'europeanin', 'european', 'doctor

When the datasets did not merge correctly on the first try, we found that the safety dataset had trailing spaces in its NTA names. We therefore removed the trailing spaces from the NTA names (update performed by Allen).

In [5]:
# Create a new dataframe using only the columns needed.
census_columns = [
    "poptot",
    "onlylessth",
    "onlyhighsc",
    "onlybachel",
    "households",
    "NTA",
    "medianinco",
]

# Take an explicit copy so NYC_census_clean is an independent frame; adding
# columns to it later then does not raise pandas' SettingWithCopyWarning.
NYC_census_clean = NYC_df[census_columns].copy()
NYC_census_clean.head()

Unnamed: 0,poptot,onlylessth,onlyhighsc,onlybachel,households,NTA,medianinco
0,48351,2734,29374,11362,16432,Georgetown-Marine Park-Bergen Beach-Mill Basin,1520979.0
1,61584,6094,39445,16743,25070,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,1054259.0
2,100130,14357,52523,16187,40818,Crown Heights North,980637.0
3,33155,5964,19097,9312,14888,East Williamsburg,519058.0
4,24199,3630,13179,4001,7749,College Point,354073.0


In [6]:
# To normalize the data, we will need to determine the percentage of the available
# population for each level of educational attainment.
# Because the educational attainment numbers are only from the population 25 years
# or older, we need this total number. It is computed here as the sum of the
# "onlylessth" and "onlyhighsc" counts.
#
# assign() returns a new frame with the extra column rather than writing into a
# frame that may be a slice of NYC_df, so no SettingWithCopyWarning is raised.
# The column name contains spaces, hence the dict-unpacking form.
NYC_census_clean = NYC_census_clean.assign(
    **{
        "Total Pop 25 and over": NYC_census_clean["onlylessth"]
        + NYC_census_clean["onlyhighsc"]
    }
)
NYC_census_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,poptot,onlylessth,onlyhighsc,onlybachel,households,NTA,medianinco,Total Pop 25 and over
0,48351,2734,29374,11362,16432,Georgetown-Marine Park-Bergen Beach-Mill Basin,1520979.0,32108
1,61584,6094,39445,16743,25070,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,1054259.0,45539
2,100130,14357,52523,16187,40818,Crown Heights North,980637.0,66880
3,33155,5964,19097,9312,14888,East Williamsburg,519058.0,25061
4,24199,3630,13179,4001,7749,College Point,354073.0,16809


In [7]:
# Give the abbreviated census columns human-readable labels.
readable_labels = {
    "poptot": "Total Population",
    "onlylessth": "Did Not Grad HS",
    "onlyhighsc": "Grad HS or Higher",
    "onlybachel": "Bachelor Degree or Higher",
    "medianinco": "Median Household Income",
}
NYC_census_clean = NYC_census_clean.rename(columns=readable_labels)
NYC_census_clean.head()

Unnamed: 0,Total Population,Did Not Grad HS,Grad HS or Higher,Bachelor Degree or Higher,households,NTA,Median Household Income,Total Pop 25 and over
0,48351,2734,29374,11362,16432,Georgetown-Marine Park-Bergen Beach-Mill Basin,1520979.0,32108
1,61584,6094,39445,16743,25070,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,1054259.0,45539
2,100130,14357,52523,16187,40818,Crown Heights North,980637.0,66880
3,33155,5964,19097,9312,14888,East Williamsburg,519058.0,25061
4,24199,3630,13179,4001,7749,College Point,354073.0,16809


In [8]:
# Write the cleaned census frame to CSV, leaving out the row index.
clean_census_path = "../Resources/clean_census.csv"
NYC_census_clean.to_csv(clean_census_path, index=False)