In [1]:
import pandas as pd

# Load clean datasets

In [2]:
# Read the all-housing table from the clean data directory.
# NOTE(review): the "Unnamed: 0" column in the displayed output looks like a
# row index that was written to the CSV — confirm, and consider index_col=0.
all_housing_path = '../data/clean/all_housing.csv'
ah_df = pd.read_csv(all_housing_path)
ah_df

Unnamed: 0,OWN_OCC,LU_DESC,UNIT_NUM,ST_NUM,ST_NAME,ZIPCODE,CITY,RES_UNITS,OWNER
0,Y,THREE-FAM DWELLING,,104.0,PUTNAM ST,2128,EAST BOSTON,0,PASCUCCI CARLO
1,Y,THREE-FAM DWELLING,,197.0,LEXINGTON ST,2128,EAST BOSTON,0,SEMBRANO RODERICK
2,N,THREE-FAM DWELLING,,199.0,LEXINGTON ST,2128,EAST BOSTON,0,CHEVARRIA ANA S
3,N,THREE-FAM DWELLING,,201.0,LEXINGTON ST,2128,EAST BOSTON,0,"MADDALENI JAMES E, TS"
4,Y,TWO-FAM DWELLING,,203.0,LEXINGTON ST,2128,EAST BOSTON,0,DIGIROLAMO JOHN A
...,...,...,...,...,...,...,...,...,...
275405,N,STRIP CTR STORES,,2198.0,COMMONWEALTH AV,2135,BRIGHTON,0,GREALISH MARTIN J TRST
275406,N,STRIP CTR STORES,,2199.0,COMMONWEALTH AV,2135,BRIGHTON,0,GREALISH MARTIN J TRST
275407,N,STRIP CTR STORES,,2200.0,COMMONWEALTH AV,2135,BRIGHTON,0,GREALISH MARTIN J TRST
275408,N,STRIP CTR STORES,,2201.0,COMMONWEALTH AV,2135,BRIGHTON,0,GREALISH MARTIN J TRST


In [3]:
# Read the income-restricted housing table from the clean data directory.
# NOTE(review): the "Unnamed: 0" column in the displayed output looks like a
# row index that was written to the CSV — confirm, and consider index_col=0.
income_restricted_path = '../data/clean/income_restricted_housing.csv'
ir_df = pd.read_csv(income_restricted_path)
ir_df

Unnamed: 0,PROJ_NAME,ZIPCODE,SECTION8,RES_UNITS,ST_NUM,ST_NAME,CITY
0,Abbot Street/ Shawmut Ave,2119.0,,16,100.0,Shawmut Ave,Boston
1,Academy Homes I,2119.0,,202,1592.0,Columbus Ave,Roxbury
2,Academy Homes II,2119.0,Y,236,2926.0,Washington St,Roxbury
3,Adams Court Phase A,2126.0,,50,59.0,Msgr Patrick J Lydon Way,Boston
4,Adams Court Phase B,2126.0,,45,,,
...,...,...,...,...,...,...,...
1440,xxxxxxxx Blue Hill Ave,2119.0,,4,491.0,Blue Hill Avenue,Boston
1441,YouthBuild Ruskindale Rd,2136.0,,1,1785.0,Columbus Ave #500,Roxbury
1442,YWCA Berkeley,2116.0,,208,40.0,Berkeley St,Boston
1443,YWCA Boston Units,2116.0,,115,140.0,Clarendon St,Boston


# Step 2. Overlay data between DBs

## Create index to join both dataframes

In [4]:
import re

# Street-type words dropped during normalization so that, for example,
# "PUTNAM ST" and "Putnam Street" produce the same key.
_STREET_SUFFIXES = frozenset({"ave", "av", "st", "str", "street", "avenue"})

# Matches every character that is not a lowercase letter, a digit or whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')


def create_st_index(row):
    """Build a normalized "<number> <street>" key for matching addresses.

    The street name is lower-cased, stripped of every character that is not a
    letter, digit or whitespace, and cleared of the street-type words listed in
    ``_STREET_SUFFIXES``. The street number, when present, is truncated to an
    integer and prepended.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"ST_NAME"`` and ``"ST_NUM"`` (e.g. a DataFrame
        row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str or None
        ``"<number> <street>"`` when both parts are available, ``"<street>"``
        when the number is missing or zero, and ``None`` when the street name
        is missing or nothing is left of it after normalization.
    """
    street_name = row["ST_NAME"]
    street_number = row["ST_NUM"]

    # NaN is the only value that is not equal to itself; None must be checked
    # separately because it would otherwise reach .lower() and raise.
    if street_name is None or street_name != street_name:
        return None

    # str() keeps a non-string street name from raising on .lower().
    street_name = _NON_ALNUM_RE.sub('', str(street_name).lower())

    # Remove occurrences of the street-type words from the street name.
    street_name = ' '.join(
        word for word in street_name.split() if word not in _STREET_SUFFIXES
    )

    # A name made only of suffixes/punctuation (e.g. "St.") carries no
    # information; returning None avoids keys such as "5 " or "" that would
    # match unrelated addresses.
    if not street_name:
        return None

    # Missing (None / NaN) or zero street number: fall back to the name alone.
    if street_number is None or street_number != street_number or not street_number:
        return street_name
    return f"{int(street_number)} {street_name}"

# Give both tables a string "join_idx" column: compute the key for every row,
# discard the rows for which create_st_index produced no key, then cast to str.
# The drop is done in place on purpose so that the ah_df / ir_df objects
# themselves are updated (the loop variable is only an alias for them).
for frame in (ah_df, ir_df):
    frame["join_idx"] = frame.apply(create_st_index, axis=1)
    frame.dropna(subset=["join_idx"], inplace=True)
    frame["join_idx"] = frame["join_idx"].astype(str)

## Tag housing whose join_idx appears in the income-restricted dataset

In [5]:
# Join keys of every row in the income-restricted table.
income_restricted_keys = ir_df["join_idx"]

# IS_AFFORDABLE is True for each housing row whose key also occurs among the
# income-restricted keys, False otherwise.
ah_df["IS_AFFORDABLE"] = ah_df["join_idx"].isin(income_restricted_keys)

# Write the tagged table to disk without the row index.
result_path = "../data/clean/result.csv"
ah_df.to_csv(result_path, index=False)

ah_df

Unnamed: 0,OWN_OCC,LU_DESC,UNIT_NUM,ST_NUM,ST_NAME,ZIPCODE,CITY,RES_UNITS,OWNER,join_idx,IS_AFFORDABLE
0,Y,THREE-FAM DWELLING,,104.0,PUTNAM ST,2128,EAST BOSTON,0,PASCUCCI CARLO,104 putnam,False
1,Y,THREE-FAM DWELLING,,197.0,LEXINGTON ST,2128,EAST BOSTON,0,SEMBRANO RODERICK,197 lexington,False
2,N,THREE-FAM DWELLING,,199.0,LEXINGTON ST,2128,EAST BOSTON,0,CHEVARRIA ANA S,199 lexington,False
3,N,THREE-FAM DWELLING,,201.0,LEXINGTON ST,2128,EAST BOSTON,0,"MADDALENI JAMES E, TS",201 lexington,False
4,Y,TWO-FAM DWELLING,,203.0,LEXINGTON ST,2128,EAST BOSTON,0,DIGIROLAMO JOHN A,203 lexington,False
...,...,...,...,...,...,...,...,...,...,...,...
275405,N,STRIP CTR STORES,,2198.0,COMMONWEALTH AV,2135,BRIGHTON,0,GREALISH MARTIN J TRST,2198 commonwealth,False
275406,N,STRIP CTR STORES,,2199.0,COMMONWEALTH AV,2135,BRIGHTON,0,GREALISH MARTIN J TRST,2199 commonwealth,False
275407,N,STRIP CTR STORES,,2200.0,COMMONWEALTH AV,2135,BRIGHTON,0,GREALISH MARTIN J TRST,2200 commonwealth,False
275408,N,STRIP CTR STORES,,2201.0,COMMONWEALTH AV,2135,BRIGHTON,0,GREALISH MARTIN J TRST,2201 commonwealth,False
