# Livable Streets: Bike Infrastructure & Displacement
## Project Deliverable 1

#### Siddharth Bookinkere, Alexander Heger, Kwan Wing Tuet, Raviv Zait

## Clone repository

In [22]:
!git clone https://github.com/BU-Spark/ds-livable-streets-infra.git

fatal: destination path 'ds-livable-streets-infra' already exists and is not an empty directory.


## Collect and Pre-process Census and Bike Data

### Read data

In [23]:
import pandas as pd
import numpy as np
import os

# Locate the repository root without a machine-specific absolute path.
# Set the LIVABLE_STREETS_REPO environment variable to point at a checkout elsewhere;
# the default is the folder that `git clone` creates in the notebook's working directory.
repo_root = os.environ.get("LIVABLE_STREETS_REPO", "ds-livable-streets-infra")

# Set the current working directory to the root of the repository.
# Skipped when the kernel is already there (e.g. when this cell is re-run), so the
# relative paths below keep resolving and the cell stays idempotent.
if not os.path.isdir("fa23-team"):
    os.chdir(repo_root)

# Customize data folder (relative to the repository root)
data_folder = "fa23-team/data/"

# Read demographic and property value data
combined_demo_prop = pd.read_csv(data_folder + "combined_demo_prop.csv", index_col=False)
print("Demographic and property value data shape: ", combined_demo_prop.shape)

# Read income and house ownership data
combined_income_owner = pd.read_csv(data_folder+"income_houseownership.csv", index_col=False)
print("Income and house ownership data shape: ",combined_income_owner.shape)

# Read bike infrastructure data
bike_data = pd.read_csv(data_folder + "roxbury_dorchester_bike_data.csv", index_col=False)
print("Bike infrastructure data shape: ", bike_data.shape)

Demographic and property value data shape:  (564, 16)
Income and house ownership data shape:  (606, 17)
Bike infrastructure data shape:  (675, 12)


### Clean Income and House Ownership Data

In [24]:
# Remove the two income-difference columns (rebinding the name instead of mutating in place)
combined_income_owner = combined_income_owner.drop(
    columns=["median_income_difference", "mean_income_difference"]
)

# Assign the cleaned column labels; they are applied positionally, one per remaining column
combined_income_owner.columns = [
    "year",
    "median_income",
    "mean_income",
    "tract",
    "housed_population",
    "owner_occupied",
    "owner_occupied_pre-2000",
    "renter_occupied",
    "renter_occupied_pre-2000",
    "percent_owners",
    "percent_renters",
    "percent_owners_pre-2000",
    "percent_renters_pre-2000",
    "owners_renters_ratio",
    "renters_owners_pre-2000_ratio",
]

# Move the key columns to the front; all other columns keep their existing order
leading_cols = ['tract', 'year']
trailing_cols = [name for name in combined_income_owner.columns if name not in leading_cols]
combined_income_owner = combined_income_owner[leading_cols + trailing_cols]

print(combined_income_owner.shape)

(606, 15)


In [25]:
# Tracts treated as redundant and excluded from the income/ownership table
drop_tracts = [708.00, 708.01, 709.00, 709.01, 813.00, 1103.03]

# Keep only the rows whose tract is not on the exclusion list,
# then show how many distinct tracts remain
is_redundant = combined_income_owner['tract'].isin(drop_tracts)
combined_income_owner = combined_income_owner[~is_redundant]
combined_income_owner['tract'].nunique()

47

In [26]:
# Revert to 2010 tract names.
# The replacement is restricted to the tract column: a frame-wide replace would also
# rewrite any other numeric cell (income, population, ratios, ...) that happens to
# equal 813.02 or 1101.05.
tract_renames = {813.02: 813.00, 1101.05: 1101.03}
combined_income_owner = combined_income_owner.assign(
    tract=combined_income_owner["tract"].replace(tract_renames)
)

### Clean Bike Data

In [27]:
# Discard the leftover index column
bike_data = bike_data.drop(columns="Unnamed: 0")

# Store tract as text and strip a trailing ".0" (e.g. "901.0" -> "901")
bike_data['tract'] = bike_data['tract'].astype(str).str.replace(r'\.0$', '', regex=True)

### Merge Census Data

In [28]:
# Cast the tract column to text in both census tables
for census_table in (combined_income_owner, combined_demo_prop):
    census_table['tract'] = census_table['tract'].astype(str)

In [29]:
# Join the two census tables on tract and year (pd.merge's default inner join),
# then order the rows by the same keys
merge_keys = ["tract", "year"]
census = pd.merge(combined_demo_prop, combined_income_owner, on=merge_keys).sort_values(merge_keys)
census.shape

(564, 29)

In [30]:
# Strip a trailing ".0" from the tract strings (e.g. "901.0" -> "901")
census['tract'] = census['tract'].str.replace(r'\.0$', '', regex=True)

In [31]:
# Tract identifiers (as strings) grouped by neighborhood, used later for labeling
dorchester = """
    901 902 903 904 906 907 909.01
    910.01 911 912 913 914 915 916
    917 918 919 920 921.01 922 923
    924 1001 1002 1003 1004 1005 1006.01
    1006.03 1007 1008
""".split()
roxbury = """
    801 803 804.01 805 806.01 813 814
    815 817 818 819 820 821
""".split()
jp = "1201.04 1202.01 1101.03".split()

### Merge Census and Bike Infrastructure Data

In [32]:
# ID tracts in bike_data that are not in census.
# The census tracts are collected once into a set, instead of recomputing
# census.tract.unique() for every bike tract inside the comprehension.
census_tracts = set(census['tract'].unique())
rogue_tracts = [tract for tract in bike_data['tract'].unique() if tract not in census_tracts]
rogue_tracts

['709']

In [33]:
# Exclude the bike rows that belong to one of the rogue tracts
in_rogue_tract = bike_data['tract'].isin(rogue_tracts)
bike_data = bike_data[~in_rogue_tract]
bike_data.shape

(660, 11)

In [34]:
# Outer-join census and bike data on tract and year, so rows present in only one
# of the two tables are kept (with missing values for the other table's columns)
census_bike = pd.merge(
    left=census,
    right=bike_data,
    how="outer",
    on=['tract', 'year'],
)

census_bike.shape

(696, 38)

In [35]:
# Print the column names, non-null counts and dtypes of the merged frame
census_bike.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 696 entries, 0 to 695
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   tract                          696 non-null    object 
 1   year                           696 non-null    int64  
 2   total_population               564 non-null    float64
 3   male                           564 non-null    float64
 4   female                         564 non-null    float64
 5   median_age                     564 non-null    float64
 6   race_white                     564 non-null    float64
 7   race_black                     564 non-null    float64
 8   race_ai_alaskan                564 non-null    float64
 9   race_asian                     564 non-null    float64
 10  race_hawaiian                  564 non-null    float64
 11  race_other                     564 non-null    float64
 12  race_hispanic                  564 non-null    flo

### Preprocess Merged Census and Bike Infrastructure Data

In [36]:
# Set neighborhood labels
def set_neighborhood(tract):
    """Return the neighborhood label for a tract identifier.

    The tract is looked up in the module-level lists ``dorchester``, ``roxbury``
    and ``jp``, in that order; the first list containing it determines the label.

    Parameters
    ----------
    tract
        Tract identifier, compared against the entries of the three lists.

    Returns
    -------
    str
        "Dorchester", "Roxbury" or "Jamaica Plain"; an empty string when the
        tract appears in none of the lists.
    """
    # Order matters: it mirrors the precedence Dorchester > Roxbury > Jamaica Plain
    labelled_groups = (
        ("Dorchester", dorchester),
        ("Roxbury", roxbury),
        ("Jamaica Plain", jp),
    )
    for label, members in labelled_groups:
        if tract in members:
            return label
    return ''

# Label every row with its neighborhood; set_neighborhood returns '' for tracts it does not recognize
census_bike['neighborhood'] = census_bike['tract'].apply(set_neighborhood)

In [37]:
# Put the identifying columns first; every other column keeps its current relative order
leading_cols = ['tract', 'year', 'neighborhood']
remaining_cols = [name for name in census_bike.columns if name not in leading_cols]
census_bike = census_bike[leading_cols + remaining_cols]

In [39]:
# Save combined file
# census_bike.to_csv(data_folder + "census_bike_merged.csv", index=False)

In [40]:
# Preview the first rows of the merged census and bike frame
census_bike.head()

Unnamed: 0,tract,year,neighborhood,total_population,male,female,median_age,race_white,race_black,race_ai_alaskan,...,renters_owners_pre-2000_ratio,BFBL,BL,BLSL,SBL,SLM,SLMTC,SUP,total_built_all_lanes,cumulative_length
0,1001,2010,Dorchester,5617.0,0.481,0.519,23.9,0.063,0.638,0.012,...,0.700662,0.0,2493.815844,0.0,0.0,563.192706,0.0,0.0,3057.00855,3167.046929
1,1001,2011,Dorchester,5657.0,0.48,0.52,24.5,0.109,0.619,0.014,...,0.774003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3167.046929
2,1001,2012,Dorchester,6189.0,0.451,0.549,25.2,0.074,0.701,0.004,...,1.260708,0.0,851.779668,0.0,0.0,684.141817,0.0,0.0,1535.921485,4702.968414
3,1001,2013,Dorchester,6295.0,0.433,0.567,27.3,0.112,0.727,0.001,...,1.236902,0.0,36.450628,0.0,0.0,0.0,0.0,0.0,36.450628,4739.419042
4,1001,2014,Dorchester,6371.0,0.455,0.545,31.2,0.131,0.75,0.001,...,0.94685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4739.419042


## Preliminary Analysis

## Key Questions

## Scope Refinement and Limitations