In [152]:
import pandas as pd
import numpy as np

# Customize data folder
data_folder = "C:/Users/Alex H/OneDrive/Documents/BU Fall 2023/DS701/ds-livable-streets-infra/fa23-team/"

# Read demographic and property value data
combined_demo_prop = pd.read_csv(data_folder + "combined_add_jp.csv", index_col=False)
combined_demo_prop.shape

(578, 16)

In [153]:
# Replace tract 813.01 with 804.01 in demographic and property value data
combined_demo_prop = combined_demo_prop[combined_demo_prop.tract != 813.01]
print(combined_demo_prop.tract.nunique())

ct804 = pd.read_csv(data_folder + "demo_804-01.csv", index_col=False)
combined_demo_prop = pd.concat([combined_demo_prop, ct804], ignore_index=True)
print(combined_demo_prop.tract.nunique())

52
53


In [154]:
# Read income and home ownership data
combined_income_owner = pd.read_csv(data_folder+"income_houseownership.csv", index_col=False)
combined_income_owner.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654 entries, 0 to 653
Data columns (total 18 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Unnamed: 0                                   654 non-null    int64  
 1   year                                         654 non-null    int64  
 2   Median income (dollars)                      574 non-null    float64
 3   Mean income (dollars)                        577 non-null    float64
 4   median_income_difference                     524 non-null    float64
 5   mean_income_difference                       528 non-null    float64
 6   census_tract                                 654 non-null    float64
 7   Total population in occupied housing units:  584 non-null    float64
 8   Owner_occupied                               584 non-null    float64
 9   Owner_occupied_Moved in 1999 or earlier      584 non-null    float64
 10  Re

In [155]:
# Drop unnecessary index column
combined_income_owner = combined_income_owner.drop("Unnamed: 0", axis=1)

# Rename columns
combined_income_owner.columns = ["year", "median_income", "mean_income", "median_income_difference", "mean_income_difference", "tract", "housed_population", "owner_occupied", "owner_occupied_pre-2000", "renter_occupied", "renter_occupied_pre-2000", "percent_owners", "percent_renters", "percent_owners_pre-2000", "percent_renters_pre-2000", "renters_owners_ratio", "renters_owners_pre-2000_ratio"]

In [178]:
print(combined_demo_prop.shape)
print(combined_income_owner.shape)

(588, 16)
(654, 17)


In [156]:
# Make tract a string
convert = {'tract': str}

combined_demo_prop = combined_demo_prop.astype(convert)
combined_income_owner = combined_income_owner.astype(convert)

In [157]:
# Combine Census Data
census = pd.merge(combined_demo_prop, combined_income_owner, on=["tract", "year"], how="outer")
census = census.sort_values(['tract', 'year'])
census.shape

(654, 31)

In [158]:
# Remove trailing ".0" in string tracts
census.tract = census.tract.str.replace(r'\.0$', '', regex=True)

In [159]:
# Generate lists of tracts in neighborhoods for later labeling
dorchester = ["901", "902", "903", "904", "906", "907", "909.01",
              "910.01", "911", "912", "913", "914", "915", "916",
              "917", "918", "919", "920", "921.01", "922", "923",
              "924", "1001", "1002", "1003", "1004", "1005", "1006.01",
              "1006.03", "1007", "1008"]
roxbury = ["708", "708.01", "709", "709.01", "801", "803", "804.01",
           "805", "806.01", "813", "813.02", "814", "815", "817", "818",
           "819", "820", "821"]
jp = ["1201.04", "1202.01", "1101.05", "1101.03"]

In [179]:
len(dorchester) + len(roxbury) + len(jp)

53

In [160]:
# Clean bike data
bike_data = pd.read_csv(data_folder + "analysis/tracts_bike_data.csv", index_col=False)
bike_data = bike_data.drop("Unnamed: 0", axis=1)
bike_data = bike_data.astype(convert)

In [161]:
# ID tracts in bike_data that are not in census
rogue_tracts = [i for i in bike_data.tract.unique() if i not in combined_demo_prop.tract.unique()]
rogue_tracts

['1203.01',
 '711.01',
 '9803.0',
 '9811.0',
 '1009.0',
 '1011.01',
 '1011.02',
 '611.01',
 '9812.01']

In [162]:
bike_data = bike_data[~bike_data['tract'].isin(rogue_tracts)]
bike_data.tract.nunique()

46

In [163]:
# Merge census and bike data
census_bike = pd.merge(census, bike_data, on=['tract', 'year'], how="outer")

census_bike.shape

(753, 40)

In [164]:
census_bike

Unnamed: 0,tract,year,total_population,male,female,median_age,race_white,race_black,race_ai_alaskan,race_asian,...,renters_owners_pre-2000_ratio,BFBL,BL,BLSL,SBL,SLM,SUP,total_built_all_lanes,cumulative_length,SLMTC
0,1001,2010,5617.0,0.481,0.519,23.9,0.063,0.638,0.012,0.018,...,0.700662,,,,,,,,,
1,1001,2011,5657.0,0.480,0.520,24.5,0.109,0.619,0.014,0.003,...,0.774003,,,,,,,,,
2,1001,2012,6189.0,0.451,0.549,25.2,0.074,0.701,0.004,0.015,...,1.260708,,,,,,,,,
3,1001,2013,6295.0,0.433,0.567,27.3,0.112,0.727,0.001,0.011,...,1.236902,,,,,,,,,
4,1001,2014,6371.0,0.455,0.545,31.2,0.131,0.750,0.001,0.018,...,0.946850,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748,923.0,2017,,,,,,,,,...,,0.0,0.000000,0.0,0.000000,86.543282,0.0,86.543282,681.940001,0.000000
749,923.0,2020,,,,,,,,,...,,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,681.940001,573.238812
750,924.0,2010,,,,,,,,,...,,0.0,589.598126,0.0,0.000000,164.430161,0.0,754.028287,1273.177398,0.000000
751,924.0,2012,,,,,,,,,...,,0.0,440.510678,0.0,0.000000,998.822231,0.0,1439.332909,2712.510306,0.000000


In [165]:
census_bike.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 753 entries, 0 to 752
Data columns (total 40 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   tract                          753 non-null    object 
 1   year                           753 non-null    int64  
 2   total_population               614 non-null    float64
 3   male                           614 non-null    float64
 4   female                         614 non-null    float64
 5   median_age                     614 non-null    float64
 6   race_white                     614 non-null    float64
 7   race_black                     614 non-null    float64
 8   race_ai_alaskan                614 non-null    float64
 9   race_asian                     614 non-null    float64
 10  race_hawaiian                  614 non-null    float64
 11  race_other                     614 non-null    float64
 12  race_hispanic                  614 non-null    flo

In [173]:
# Set neighborhood labels
def set_neighborhood(tract):
    if tract in dorchester:
        return "Dorchester"
    elif tract in roxbury:
        return "Roxbury"
    elif tract in jp:
        return "Jamaica Plain"
    else:
        return ''

census_bike['neighborhood'] = census_bike['tract'].apply(set_neighborhood)

In [175]:
# Save combined file
census_bike.to_csv(data_folder + "census_bike_merged.csv", index=False)