In [1]:
import pandas as pd

local_data_path = "../../dataset_ignore/"
parcels = pd.read_csv(f"{local_data_path}Parcels_2020.csv")
property_assessment = pd.read_csv(f"{local_data_path}property-assessment.csv")

In [2]:
# Join datasets on "PID_LONG" = "PID"

df = parcels.set_index("PID_LONG").join(property_assessment.set_index("PID"))
df.head()

Unnamed: 0,OBJECTID,MAP_PAR_ID,LOC_ID,POLY_TYPE,MAP_NO,SOURCE,PLAN_ID,LAST_EDIT,BND_CHK,NO_MATCH,...,U_BTH_STYLE2,U_BTH_STYLE3,U_KITCH_TYPE,U_KITCH_STYLE,U_HEAT_TYP,U_AC,U_FPLACE,U_INT_FIN,U_INT_CND,U_VIEW
100001000.0,71006,100001000.0,F_782457_2963624,FEE,1.0,ASSESS,,20191107,,N,...,,,,,,,,,,
100002000.0,68430,100002000.0,F_782474_2963634,FEE,1.0,ASSESS,,20191107,,N,...,,,,,,,,,,
100003000.0,71584,100003000.0,F_782492_2963642,FEE,1.0,ASSESS,,20191107,,N,...,,,,,,,,,,
100004000.0,61304,100004000.0,F_782510_2963652,FEE,1.0,ASSESS,,20191107,,N,...,,,,,,,,,,
100005000.0,6734,100005000.0,F_782539_2963640,FEE,1.0,ASSESS,,20191107,,N,...,,,,,,,,,,


Since this DataFrame is a join of two general datasets, we might want to drop the unwanted columns.

In [3]:
print(df.columns)

Index(['OBJECTID', 'MAP_PAR_ID', 'LOC_ID', 'POLY_TYPE', 'MAP_NO', 'SOURCE',
       'PLAN_ID', 'LAST_EDIT', 'BND_CHK', 'NO_MATCH', 'TOWN_ID', 'XCOOR',
       'YCOOR', 'OBJECTID_12', 'WARD', 'PARCEL', 'WPD', 'BATCH',
       'Shape__Length_1', 'Shape__Area_1', 'Shape__Length_12',
       'Shape__Area_12', 'Shape__Area', 'Shape__Length', 'CM_ID', 'GIS_ID',
       'ST_NUM', 'ST_NAME', 'ST_NAME_SUF', 'UNIT_NUM', 'ZIPCODE', 'PTYPE',
       'LU', 'OWN_OCC', 'OWNER', 'MAIL_ADDRESSEE', 'MAIL_ADDRESS', 'MAIL CS',
       'MAIL_ZIPCODE', 'AV_LAND', 'AV_BLDG', 'AV_TOTAL', 'GROSS_TAX',
       'LAND_SF', 'YR_BUILT', 'YR_REMOD', 'GROSS_AREA', 'LIVING_AREA',
       'NUM_FLOORS', 'STRUCTURE_CLASS', 'R_BLDG_STYL', 'R_ROOF_TYP',
       'R_EXT_FIN', 'R_TOTAL_RMS', 'R_BDRMS', 'R_FULL_BTH', 'R_HALF_BTH',
       'R_BTH_STYLE', 'R_BTH_STYLE2', 'R_BTH_STYLE3', 'R_KITCH',
       'R_KITCH_STYLE', 'R_KITCH_STYLE2', 'R_KITCH_STYLE3', 'R_HEAT_TYP',
       'R_AC', 'R_FPLACE', 'R_EXT_CND', 'R_OVRALL_CND', 'R_INT_CND',
 

In [4]:
df = df[['OWNER', 'XCOOR', 'YCOOR', 'ST_NUM', 'ST_NAME', 'ST_NAME_SUF', 'ZIPCODE', 'PTYPE',]].rename(columns={'OWNER': 'NAME', 'XCOOR': 'LATITUDE', 'YCOOR': 'LONGITUDE'}).dropna(subset=['PTYPE']).astype({'ZIPCODE': 'int64', 'PTYPE': 'int64'}, errors='ignore')
df.head()

Unnamed: 0,NAME,LATITUDE,LONGITUDE,ST_NUM,ST_NAME,ST_NAME_SUF,ZIPCODE,PTYPE
100001000.0,PASCUCCI CARLO,782457.0,2963624.0,104 A 104,PUTNAM,ST,2128.0,105
100002000.0,ATANASOV DANIEL,782474.0,2963634.0,197,LEXINGTON,ST,2128.0,105
100003000.0,CHEVARRIA ANA S,782492.0,2963642.0,199,LEXINGTON,ST,2128.0,105
100004000.0,"MADDALENI JAMES E, TS",782510.0,2963652.0,201,LEXINGTON,ST,2128.0,105
100005000.0,DIGIROLAMO JOHN A,782539.0,2963640.0,203,LEXINGTON,ST,2128.0,104


According to the property occupancy codes \[0\], I have picked out the property types (PTYPE) that seem applicable to our categories of interest, stated in \[1\]. 
* Education: 904 (PRIV SCHOOL/COLLEGE), \[937, 943\] (education-related e.g. dormitory, classroom, ...), 976 (SCHOOL), 977 (COLLEGE), 978 (LIBRARY)
* Food access: \[323, 328\] (shopping center, supermarket, restaurant, etc.), 944 (DINING FACILITY/CAFETERIA)
* Healthcare: 309+953 (MEDICAL CLINIC), 954 (MEDICAL OFFICE), 979 (HOSPITAL)
* Recreation: \[362, 364\] (theatres), \[365, 377\] (sport centers, clubs, arenas, etc.), \[380, 384\] (golf/tennis, swimming, marina, etc.), 947 (ATHLETIC/SPORTS CTR), 984 (PUBLIC BEACH)
* Social: 353 (SOCIAL CLUB), 945 (ACTIVITY/SOCIAL CENTER), 
* Commercial: \[300, 399\] (except for those mentioned above)


\[0\] https://data.boston.gov/dataset/property-assessment/resource/d6c1268c-cd83-4dc3-a914-bba1ed59da6d

\[1\] https://github.com/Zayta/CS506Spring2021Repository/tree/master/CityOfBoston_team2/deliverables/deliverable0#project-description

In [5]:
import numpy as np

def map_amenity(ptype):
    """Map PTYPE to essential amenity category, if applicable."""
    if ptype in [904, *range(937, 944), 976, 977, 978]:
        return 'Education'
    if ptype in [*range(323, 329), 944]:
        return 'Food access'
    if ptype in [305, 953, 954, 979]:
        return 'Healthcare'
    if ptype in [*range(362, 365), *range(365, 378), *range(380, 385), 947, 984]:
        return 'Recreation'
    if ptype in [353, 945]:
        return 'Social'
    if ptype in range(300, 400):
        return 'Commercial'
    
    return np.nan

In [6]:
df['CATEGORY'] = [map_amenity(parcel) for parcel in df['PTYPE']]
df['CATEGORY'].unique()

array([nan, 'Food access', 'Commercial', 'Education', 'Social',
       'Recreation', 'Healthcare'], dtype=object)

In [7]:
df = df.drop('PTYPE', axis=1).dropna(subset=['CATEGORY'])
df.head()

Unnamed: 0,NAME,LATITUDE,LONGITUDE,ST_NUM,ST_NAME,ST_NAME_SUF,ZIPCODE,CATEGORY
100056000.0,CABANALV INC,783331.0,2963346.0,254 256,BENNINGTON,ST,2128.0,Food access
100079000.0,FAZIO ENTERPRISES,783751.0,2963716.0,392 -398,CHELSEA,ST,2128.0,Commercial
100085000.0,PEREZ MIRZA,783616.0,2963555.0,300,BENNINGTON,ST,2128.0,Commercial
100086000.0,TRUONG NGHIA,783594.0,2963536.0,296 A298,BENNINGTON,ST,2128.0,Commercial
100087000.0,ROBERTO RITA M,783573.0,2963518.0,294,BENNINGTON,ST,2128.0,Food access


In [8]:
df.to_csv('../../datasets_clean/parcels_sanitized.csv')