# Initial Data Cleaning: Income by Zip Data Set

## Data Dictionary

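TODO: complete this table for the remaining columns.

| Column | Description |
|---|---|
| `zipcode` | Zip (ZCTA) |
| `median_household_income` | Median household income in the past 12 months (in 2018 inflation-adjusted dollars) |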
    

## Import libraries

In [1]:
import numpy as np
import pandas as pd

## Read in data

In [2]:
# Load the raw income-by-zip CSV (relative path) into a DataFrame.
df = pd.read_csv('../data/raw_income_by_zip_nyc.csv')

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Zip (ZCTA),Median household income in the past 12 months (in 2018 inflation-adjusted dollars),Per capita income in the past 12 months (in 2018 Inflation-adjusted dollars),Population,Population Density (square miles),Housing Units,Median Value of Owner-occupied Units
0,10001,"$88,526","$84,765",22924,37306.6,14141,"$343,400"
1,10002,"$35,859","$32,694",74993,85369.8,35724,"$665,200"
2,10003,"$112,131","$92,781",54682,94859.6,30876,"$1,001,200"
3,10004,"$157,645","$122,165",3028,5622.9,2139,"$1,640,600"
4,10005,"$173,333","$106,702",8831,121496.4,5585,"$1,415,400"


In [4]:
# Check the shape of the data as (rows, columns)
df.shape

(177, 7)

In [5]:
# Check data types; `object` columns still hold values that need cleaning
# before they can be used numerically.
df.dtypes

Zip (ZCTA)                                                                             int64
Median household income in the past 12 months (in 2018 inflation-adjusted dollars)    object
Per capita income in the past 12 months (in 2018 Inflation-adjusted dollars)          object
Population                                                                            object
Population Density (square miles)                                                     object
Housing Units                                                                         object
Median Value of Owner-occupied Units                                                  object
dtype: object

In [6]:
# Count null values per column
df.isnull().sum()

Zip (ZCTA)                                                                            0
Median household income in the past 12 months (in 2018 inflation-adjusted dollars)    0
Per capita income in the past 12 months (in 2018 Inflation-adjusted dollars)          0
Population                                                                            0
Population Density (square miles)                                                     0
Housing Units                                                                         0
Median Value of Owner-occupied Units                                                  0
dtype: int64

## Initial Data Cleaning

### Change column names 

In [7]:
# Mapping from the raw CSV headers to short snake_case column names.
# NOTE(review): the 'Population Density (square miles) ' key ends with a
# trailing space. The renamed output shows this column was matched, so the
# raw header presumably carries the same trailing space -- confirm before
# "fixing" the key.
col_dict = {
    'Zip (ZCTA)': 'zipcode',
    'Median household income in the past 12 months (in 2018 inflation-adjusted dollars)': 'median_household_income',
    'Per capita income in the past 12 months (in 2018 Inflation-adjusted dollars)': 'per_capita_income',
    'Population': 'population',
    'Population Density (square miles) ': 'population_density_square_miles',
    'Housing Units': 'housing_units',
    'Median Value of Owner-occupied Units': 'median_home_value',
}

In [8]:
# Change column names using col_dict; rename returns a new frame, which is
# rebound to `df`.
df = df.rename(columns=col_dict)

In [9]:
# Preview the frame again to confirm the new column names.
df.head()

Unnamed: 0,zipcode,median_household_income,per_capita_income,population,population_density_square_miles,housing_units,median_home_value
0,10001,"$88,526","$84,765",22924,37306.6,14141,"$343,400"
1,10002,"$35,859","$32,694",74993,85369.8,35724,"$665,200"
2,10003,"$112,131","$92,781",54682,94859.6,30876,"$1,001,200"
3,10004,"$157,645","$122,165",3028,5622.9,2139,"$1,640,600"
4,10005,"$173,333","$106,702",8831,121496.4,5585,"$1,415,400"


### Change datatypes

Change `object` to `float` or `int`

In [10]:
# Columns to be converted to float.
col_list_float = [
    'median_household_income',
    'per_capita_income',
    'population_density_square_miles',
    'median_home_value',
]

In [11]:
# Columns to be converted to int.
col_list_int = [
    'population',
    'housing_units',
]

In [12]:
# Define function to change data types to float
def change_dtypes_float(df, col_list):
    """Convert currency-formatted columns of ``df`` to ``float``.

    Dollar signs and commas are removed from each listed column before the
    cast, so a value such as ``'$88,526'`` becomes ``88526.0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert. It is modified in place.
    col_list : list of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a name in ``col_list`` is not a column of ``df``.
    ValueError
        If a cleaned value still cannot be parsed as a float.
    """
    # Raw string: in a normal literal '\$' is an invalid escape sequence,
    # which newer Python versions warn about. The pattern value is unchanged.
    pattern = r'[\$,]'
    for col in col_list:
        df[col] = df[col].replace(pattern, '', regex=True).astype(float)
    return df

In [13]:
# Define function to change data types to int
def change_dtypes_int(df, col_list):
    """Convert comma-formatted count columns of ``df`` to ``int``.

    Commas are removed from each listed column before the cast, so a value
    such as ``'22,924'`` becomes ``22924``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert. It is modified in place.
    col_list : list of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a name in ``col_list`` is not a column of ``df``.
    ValueError
        If a cleaned value still cannot be parsed as an integer.
    """
    # Raw string: in a normal literal '\,' is an invalid escape sequence,
    # which newer Python versions warn about. The pattern value is unchanged.
    pattern = r'[\,]'
    for col in col_list:
        df[col] = df[col].replace(pattern, '', regex=True).astype(int)
    return df

In [14]:
# Strip '$' and ',' from the columns in col_list_float and cast them to float.
df = change_dtypes_float(df, col_list_float)

In [15]:
# Strip ',' from the columns in col_list_int and cast them to int.
df = change_dtypes_int(df, col_list_int)

In [16]:
# Check data types again to confirm the float/int conversions took effect
df.dtypes

zipcode                              int64
median_household_income            float64
per_capita_income                  float64
population                           int64
population_density_square_miles    float64
housing_units                        int64
median_home_value                  float64
dtype: object

## Create Target Column 'home_price_to_income_ratios'

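The home-price-to-income ratio divides a market's median home price by its median household income [(Reference)](https://www.jchs.harvard.edu/home-price-income-ratios). In the reference, home prices are the median sale price of existing homes and incomes are the median household income within markets; in this notebook, `median_home_value` (the median value of owner-occupied units) is used as the home price. The ratio is expressed in years of household income.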

In [17]:
# Element-wise ratio: median home value divided by median household income.
# NOTE(review): there is no guard for a zero or missing income here -- such
# rows would presumably yield inf/NaN; confirm none exist in the data.
df['home_price_to_income_ratios'] = df['median_home_value'] / df['median_household_income']

In [18]:
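# Optional (currently disabled): drop 'median_home_value' and 'median_household_income'.
# While this stays commented out, both columns are kept in the exported file.
# df.drop(columns=['median_home_value', 'median_household_income'], inplace=True)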

## Export clean dataset as csv

In [19]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv('../data/clean_income_by_zip_nyc.csv', index=False)