In [223]:
import os

import numpy as np
import pandas as pd

# Customize data folder.
# Set the DS701_DATA_FOLDER environment variable to point at your own copy of
# the data; when it is unset, the original hard-coded local path is used.
# os.path.join(..., "") guarantees a trailing separator, which the
# `data_folder + "<file>"` concatenations in later cells rely on.
data_folder = os.path.join(
    os.environ.get(
        "DS701_DATA_FOLDER",
        "C:/Users/Alex H/OneDrive/Documents/BU Fall 2023/DS701/ds-livable-streets-infra/fa23-team/",
    ),
    "",
)

# Read demographic and property value data
combined_demo_prop = pd.read_csv(data_folder + "combined_demo_prop.csv", index_col=False)
combined_demo_prop.shape

(564, 16)

In [224]:
# Count rows per tract in the demographic/property table
combined_demo_prop.tract.value_counts()

801.00     12
1002.00    12
917.00     12
918.00     12
919.00     12
920.00     12
921.01     12
922.00     12
923.00     12
924.00     12
1001.00    12
1003.00    12
915.00     12
1004.00    12
1005.00    12
1006.01    12
1006.03    12
1007.00    12
1008.00    12
1101.03    12
1201.04    12
1202.01    12
916.00     12
914.00     12
803.00     12
821.00     12
805.00     12
806.01     12
813.00     12
814.00     12
815.00     12
817.00     12
818.00     12
819.00     12
820.00     12
901.00     12
913.00     12
902.00     12
903.00     12
904.00     12
906.00     12
907.00     12
909.01     12
910.01     12
911.00     12
912.00     12
804.01     12
Name: tract, dtype: int64

In [255]:
# Load the income / home-ownership table and summarise its columns
combined_income_owner = pd.read_csv(
    filepath_or_buffer=data_folder+"income_houseownership.csv",
    index_col=False,
)
combined_income_owner.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606 entries, 0 to 605
Data columns (total 17 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   year                                         606 non-null    int64  
 1   Median income (dollars)                      574 non-null    float64
 2   Mean income (dollars)                        577 non-null    float64
 3   median_income_difference                     524 non-null    float64
 4   mean_income_difference                       528 non-null    float64
 5   census_tract                                 606 non-null    float64
 6   Total population in occupied housing units:  584 non-null    float64
 7   Owner_occupied                               584 non-null    float64
 8   Owner_occupied_Moved in 1999 or earlier      584 non-null    float64
 9   Renter_occupied                              584 non-null    float64
 10  Re

In [256]:
# Discard the two income-difference columns
diff_columns = ["median_income_difference", "mean_income_difference"]
combined_income_owner = combined_income_owner.drop(columns=diff_columns)

# Assign short names positionally; the order must match the remaining columns
combined_income_owner.columns = [
    "year",
    "median_income",
    "mean_income",
    "tract",
    "housed_population",
    "owner_occupied",
    "owner_occupied_pre-2000",
    "renter_occupied",
    "renter_occupied_pre-2000",
    "percent_owners",
    "percent_renters",
    "percent_owners_pre-2000",
    "percent_renters_pre-2000",
    "renters_owners_ratio",
    "renters_owners_pre-2000_ratio",
]

# Move the merge keys to the front, keeping the other columns in their current order
leading_cols = ['tract', 'year']
remaining_cols = [col for col in combined_income_owner.columns if col not in leading_cols]
combined_income_owner = combined_income_owner[leading_cols + remaining_cols]

In [257]:
# Show the frame's (rows, columns) as a quick size check
print(combined_income_owner.shape)

(606, 15)


In [258]:
# Tracts treated as redundant and excluded from the income/ownership table
drop_tracts = [708.00, 708.01, 709.00, 709.01, 813.00, 1103.03]

is_redundant = combined_income_owner['tract'].isin(drop_tracts)
combined_income_owner = combined_income_owner.loc[~is_redundant]
combined_income_owner['tract'].nunique()

47

In [259]:
# Revert to 2010 tract names.
# The replacement is restricted to the `tract` column: a frame-wide
# DataFrame.replace would also rewrite any other cell (income, counts,
# ratios, ...) that happens to equal one of these values.
tract_renames = {813.02: 813.00, 1101.05: 1101.03}
combined_income_owner = combined_income_owner.assign(
    tract=combined_income_owner["tract"].replace(tract_renames)
)

combined_income_owner.tract.value_counts()

901.00     12
814.00     12
1006.03    12
1007.00    12
1008.00    12
801.00     12
803.00     12
804.01     12
805.00     12
806.01     12
813.00     12
815.00     12
1005.00    12
817.00     12
818.00     12
819.00     12
820.00     12
821.00     12
904.00     12
906.00     12
1101.03    12
1201.04    12
1006.01    12
1004.00    12
902.00     12
916.00     12
903.00     12
907.00     12
909.01     12
910.01     12
911.00     12
912.00     12
913.00     12
914.00     12
915.00     12
917.00     12
1003.00    12
918.00     12
919.00     12
920.00     12
921.01     12
922.00     12
923.00     12
924.00     12
1001.00    12
1002.00    12
1202.01    12
Name: tract, dtype: int64

In [260]:
# Cast the tract key to text in both tables
for frame in (combined_income_owner, combined_demo_prop):
    frame["tract"] = frame["tract"].astype(str)

In [261]:
# Join the two census tables on tract/year, then order rows by tract and year
census = (
    combined_demo_prop
    .merge(combined_income_owner, on=["tract", "year"])
    .sort_values(['tract', 'year'])
)
census.shape

(564, 29)

In [262]:
# Strip a trailing ".0" from the string tract labels (e.g. "801.0" -> "801")
census["tract"] = census["tract"].str.replace(r'\.0$', '', regex=True)

In [263]:
# Tract labels belonging to each neighborhood, used later to label rows
dorchester = [
    "901", "902", "903", "904", "906", "907", "909.01",
    "910.01", "911", "912", "913", "914", "915", "916", "917", "918", "919",
    "920", "921.01", "922", "923", "924",
    "1001", "1002", "1003", "1004", "1005", "1006.01", "1006.03", "1007", "1008",
]
roxbury = [
    "801", "803", "804.01", "805", "806.01",
    "813", "814", "815", "817", "818", "819",
    "820", "821",
]
jp = [
    "1201.04",
    "1202.01",
    "1101.03",
]

In [264]:
# # Check successful merge
# census.to_csv(data_folder+"census_test.csv", index=False)

In [265]:
# Load the bike data and discard the "Unnamed: 0" column
bike_data = (
    pd.read_csv(data_folder + "analysis/roxbury_dorchester_bike_data.csv", index_col=False)
    .drop(columns="Unnamed: 0")
)
bike_data["tract"].value_counts()

1001.00    15
821.00     15
902.00     15
903.00     15
904.00     15
907.00     15
909.01     15
910.01     15
911.00     15
912.00     15
913.00     15
914.00     15
915.00     15
916.00     15
917.00     15
918.00     15
919.00     15
920.00     15
921.01     15
922.00     15
923.00     15
901.00     15
820.00     15
1002.00    15
819.00     15
1003.00    15
1004.00    15
1005.00    15
1006.01    15
1006.03    15
1007.00    15
1008.00    15
1201.04    15
1202.01    15
709.00     15
801.00     15
803.00     15
805.00     15
806.01     15
813.00     15
814.00     15
815.00     15
817.00     15
818.00     15
924.00     15
Name: tract, dtype: int64

In [266]:
# Convert the bike tract key to text and strip any trailing ".0"
bike_data["tract"] = (
    bike_data["tract"]
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
)

In [273]:
# ID tracts in bike_data that are not in census.
# The census tracts are collected into a set once, instead of recomputing
# census.tract.unique() (and scanning it linearly) for every bike tract.
census_tracts = set(census["tract"].unique())
rogue_tracts = [tract for tract in bike_data["tract"].unique() if tract not in census_tracts]
rogue_tracts

[]

In [268]:
# Drop bike rows whose tract is listed in rogue_tracts
is_rogue = bike_data['tract'].isin(rogue_tracts)
bike_data = bike_data.loc[~is_rogue]

In [269]:
# Show the (rows, columns) of bike_data at this point
bike_data.shape

(660, 11)

In [270]:
# Outer-join census and bike data on tract/year, keeping rows present in only one table
census_bike = census.merge(bike_data, how="outer", on=['tract', 'year'])

census_bike.shape

(696, 38)

In [271]:
# # Check successful merge
# census_bike.to_csv(data_folder+"census_bike_test.csv", index=False)

In [274]:
# Summarise column dtypes and non-null counts of the merged frame
census_bike.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 696 entries, 0 to 695
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   tract                          696 non-null    object 
 1   year                           696 non-null    int64  
 2   total_population               564 non-null    float64
 3   male                           564 non-null    float64
 4   female                         564 non-null    float64
 5   median_age                     564 non-null    float64
 6   race_white                     564 non-null    float64
 7   race_black                     564 non-null    float64
 8   race_ai_alaskan                564 non-null    float64
 9   race_asian                     564 non-null    float64
 10  race_hawaiian                  564 non-null    float64
 11  race_other                     564 non-null    float64
 12  race_hispanic                  564 non-null    flo

In [275]:
# Set neighborhood labels
def set_neighborhood(tract):
    """Return the neighborhood name for a tract label.

    The tract is looked up, in order, in the module-level `dorchester`,
    `roxbury` and `jp` collections. An empty string is returned when the
    tract appears in none of them.
    """
    if tract in dorchester:
        return "Dorchester"
    if tract in roxbury:
        return "Roxbury"
    if tract in jp:
        return "Jamaica Plain"
    return ''

# Label every row with the neighborhood that its tract maps to
census_bike = census_bike.assign(
    neighborhood=census_bike['tract'].apply(set_neighborhood)
)

In [277]:
# Put the identifying columns first, keeping the rest in their current order
leading_cols = ['tract', 'year', 'neighborhood']
trailing_cols = [col for col in census_bike.columns if col not in leading_cols]
census_bike = census_bike[leading_cols + trailing_cols]

In [278]:
# Write the merged table to CSV without the index column
census_bike.to_csv(
    path_or_buf=data_folder + "census_bike_merged.csv",
    index=False,
)