In [3]:
import pandas as pd
import numpy as np

## Clean up building data

Start with the raw 2019 data from the City of Seattle.

Calculate extra fields:
- OSE Building Type: mapping between the City's Building Type and the building types used by OSE. See `city_building_types.csv` for mapping. These are generic categories like "Nonresidential" and "Multifamily".
- OSE Property Use fields: this is a mapping of EPA property use types to the types used by OSE. See `building_activity_types.csv` for mapping. These are more detailed categories like "Hospital" or "Restaurant".
- Property Use Type GFA for Policy fields: the GFA for Policy is the same as the GFA unless it is one of the use types "Data Center" and "Parking" that aren't subject to the policy (then it's 0).
- Total GFA for Policy: sum of the Property Use Type GFA for Policy fields
- PropertyUseType Percent GFA fields: percent of a building's total GFA that is from the given property use type. Zero if the use type is not covered by the policy

In [4]:
raw_building_data = pd.read_csv('seattle_large_building_data_2019.csv')

In [5]:
raw_building_data.columns

Index(['OSEBuildingID', 'TaxParcelIdentificationNumber', 'DataYear',
       'BuildingType', 'BuildingName', 'CouncilDistrictCode', 'Neighborhood',
       'YearBuilt', 'Address', 'City', 'State', 'ZipCode', 'Latitude',
       'Longitude', 'PrimaryPropertyType', 'ListOfAllPropertyUseTypes',
       'EPAPropertyType', 'TotalGHGEmissions', 'ENERGYSTARScore',
       'YearsENERGYSTARCertified', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)',
       'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SourceEUI(kBtu/sf)',
       'SourceEUIWN(kBtu/sf)', 'GHGEmissions(MetricTonsCO2e)',
       'GHGEmissionsIntensity', 'GHGEmissionsIntensity(kgCO2e/ft2)',
       'NaturalGas(kBtu)', 'NaturalGas(therms)', 'Electricity(kBtu)',
       'Electricity(kWh)', 'SteamUse(kBtu)', 'OtherFuelUse(kBtu)',
       'PropertyGFATotal', 'PropertyGFABuilding(s)', 'PropertyGFAParking',
       'NumberofBuildings', 'NumberofFloors', 'LargestPropertyUseType',
       'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseType',
      

In [6]:
cleaned_building_data = raw_building_data[[
    'OSEBuildingID', 
    'TaxParcelIdentificationNumber', 
    'DataYear',
    'BuildingType', 
    'BuildingName', 
    'NaturalGas(kBtu)', 
    'Electricity(kBtu)', 
    'SteamUse(kBtu)', 
    'PropertyGFATotal', 
    'PropertyGFABuilding(s)', 
    'PropertyGFAParking', 
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA', 
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA', 
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA'
]]

In [8]:
# Map OSE Building Type

city_building_types_mapping = pd.read_csv('city_building_types.csv')
building_types = dict(zip(list(city_building_types_mapping['BuildingType (City classification)']), list(city_building_types_mapping['Type (Legislative classification)'])))

In [9]:
building_types

{'NonResidential': 'NonResidential',
 'Nonresidential COS': 'NonResidential',
 'Nonresidential WA': 'NonResidential',
 'Multifamily LR (1-4)': 'Multifamily',
 'Multifamily MR (5-9)': 'Multifamily',
 'Multifamily HR (10+)': 'Multifamily',
 'SPS-District K-12': 'Campus',
 'Campus': 'Campus'}

In [10]:
cleaned_building_data['OSE Building Type'] = cleaned_building_data.apply(lambda building: building_types[building['BuildingType']], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data['OSE Building Type'] = cleaned_building_data.apply(lambda building: building_types[building['BuildingType']], axis=1)


In [12]:
cleaned_building_data[['BuildingType', 'OSE Building Type']].sample(10)

Unnamed: 0,BuildingType,OSE Building Type
2255,Multifamily LR (1-4),Multifamily
1345,Multifamily LR (1-4),Multifamily
2322,NonResidential,NonResidential
215,NonResidential,NonResidential
1140,NonResidential,NonResidential
1001,NonResidential,NonResidential
3369,NonResidential,NonResidential
2271,Multifamily LR (1-4),Multifamily
1175,Nonresidential COS,NonResidential
2110,Multifamily LR (1-4),Multifamily


In [None]:
# Map use types

In [13]:
ose_use_types = pd.read_csv('building_activity_types.csv')
ose_use_types.columns
ose_use_types_mapping = dict(zip(ose_use_types['EPA Building Type'], ose_use_types['OSE Building Type']))

In [14]:
ose_use_types_mapping

{'Hotel': 'Hotel',
 'Police Station': 'Fire/Police Station',
 'Other - Entertainment/Public Assembly': 'Entertainment/Public Assembly',
 'Multifamily Housing': 'Multifamily Housing',
 'Library': 'Services',
 'Fitness Center/Health Club/Gym': 'Recreation',
 'Social/Meeting Hall': 'Entertainment/Public Assembly',
 'Courthouse': 'Other',
 'Prison/Incarceration': 'Other',
 'K-12 School': 'K-12 School',
 'College/University': 'College/University',
 'Office': 'Office',
 'Self-Storage Facility': 'Self-Storage Facility',
 'Other - Mall': 'Retail Store',
 'Senior Care Community': 'Senior Living Community',
 'Medical Office': 'Office',
 'Other': 'Other',
 'Performing Arts': 'Entertainment/Public Assembly',
 'Supermarket/Grocery Store': 'Supermarket/Grocery Store',
 'Hospital (General Medical & Surgical)': 'Hospital',
 'Fire Station': 'Fire/Police Station',
 'Museum': 'Entertainment/Public Assembly',
 'Repair Services (Vehicle, Shoe, Locksmith, etc)': 'Services',
 'Other - Lodging/Residential': '

In [15]:
cleaned_building_data['LargestPropertyUseType OSE'] = cleaned_building_data.apply(lambda building: ose_use_types_mapping[building['LargestPropertyUseType']], axis=1)
cleaned_building_data['SecondLargestPropertyUseType OSE'] = cleaned_building_data.apply(lambda building: ose_use_types_mapping[building['SecondLargestPropertyUseType']], axis=1)
cleaned_building_data['ThirdLargestPropertyUseType OSE'] = cleaned_building_data.apply(lambda building: ose_use_types_mapping[building['ThirdLargestPropertyUseType']], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data['LargestPropertyUseType OSE'] = cleaned_building_data.apply(lambda building: ose_use_types_mapping[building['LargestPropertyUseType']], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data['SecondLargestPropertyUseType OSE'] = cleaned_building_data.apply(lambda building: ose_use_types_mapping[building['SecondLargestPropertyUseType']], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

In [19]:
cleaned_building_data[['LargestPropertyUseType', 'LargestPropertyUseType OSE']].sample(10)

Unnamed: 0,LargestPropertyUseType,LargestPropertyUseType OSE
2322,K-12 School,K-12 School
2952,Other - Mall,Retail Store
3383,Office,Office
1677,Multifamily Housing,Multifamily Housing
895,Multifamily Housing,Multifamily Housing
2598,Bank Branch,Retail Store
1952,Multifamily Housing,Multifamily Housing
1620,Senior Care Community,Senior Living Community
1979,Multifamily Housing,Multifamily Housing
1777,Retail Store,Retail Store


In [23]:
cleaned_building_data[['SecondLargestPropertyUseType', 'SecondLargestPropertyUseType OSE']].sample(10)

Unnamed: 0,SecondLargestPropertyUseType,SecondLargestPropertyUseType OSE
651,Parking,
1902,,
1276,,
689,Retail Store,Retail Store
27,,
523,Office,Office
149,Office,Office
2947,,
2117,,
1154,Parking,


In [27]:
cleaned_building_data[['ThirdLargestPropertyUseType', 'ThirdLargestPropertyUseType OSE']].sample(10)

Unnamed: 0,ThirdLargestPropertyUseType,ThirdLargestPropertyUseType OSE
2249,,
3437,,
3156,,
3362,Parking,
3430,Retail Store,Retail Store
3017,,
3160,Restaurant,Restaurant
2172,,
3094,Retail Store,Retail Store
1079,,


In [None]:
cleaned_building_data['LargestPropertyUseType OSE'].unique()

In [28]:
cleaned_building_data[cleaned_building_data['LargestPropertyUseType OSE'].isnull()]['LargestPropertyUseType']

158         Parking
233     Data Center
239         Parking
247             NaN
283         Parking
           ...     
3524            NaN
3526            NaN
3527            NaN
3529            NaN
3532            NaN
Name: LargestPropertyUseType, Length: 99, dtype: object

In [29]:
# some buildings don't have a largest EPA use type at all
# we can't do anything about these buildings, so we'll drop them

cleaned_building_data = cleaned_building_data[~cleaned_building_data['LargestPropertyUseType'].isnull()]

In [30]:
cleaned_building_data[cleaned_building_data['LargestPropertyUseType'].isnull()]

Unnamed: 0,OSEBuildingID,TaxParcelIdentificationNumber,DataYear,BuildingType,BuildingName,NaturalGas(kBtu),Electricity(kBtu),SteamUse(kBtu),PropertyGFATotal,PropertyGFABuilding(s),...,LargestPropertyUseType,LargestPropertyUseTypeGFA,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,OSE Building Type,LargestPropertyUseType OSE,SecondLargestPropertyUseType OSE,ThirdLargestPropertyUseType OSE


In [31]:
# Property Use Type GFA for Policy

cleaned_building_data['LargestPropertyUseType OSE GFA for Policy'] = cleaned_building_data.apply(lambda building: 0 if building['LargestPropertyUseType OSE'] in ('Data Center', 'Parking') else building['LargestPropertyUseTypeGFA'], axis=1)
cleaned_building_data['SecondLargestPropertyUseType OSE GFA for Policy'] = cleaned_building_data.apply(lambda building: 0 if building['SecondLargestPropertyUseType OSE'] in ('Data Center', 'Parking') else building['SecondLargestPropertyUseTypeGFA'], axis=1)
cleaned_building_data['ThirdLargestPropertyUseType OSE GFA for Policy'] = cleaned_building_data.apply(lambda building: 0 if building['ThirdLargestPropertyUseType OSE'] in ('Data Center', 'Parking') else building['ThirdLargestPropertyUseTypeGFA'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data['LargestPropertyUseType OSE GFA for Policy'] = cleaned_building_data.apply(lambda building: 0 if building['LargestPropertyUseType OSE'] in ('Data Center', 'Parking') else building['LargestPropertyUseTypeGFA'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data['SecondLargestPropertyUseType OSE GFA for Policy'] = cleaned_building_data.apply(lambda building: 0 if building['SecondLargestPropertyUseType OSE'] in ('Data Center', 'Parking') else building['SecondLargestPr

In [32]:
cleaned_building_data[['LargestPropertyUseType OSE GFA for Policy', 'SecondLargestPropertyUseType OSE GFA for Policy', 'ThirdLargestPropertyUseType OSE GFA for Policy']].head()

Unnamed: 0,LargestPropertyUseType OSE GFA for Policy,SecondLargestPropertyUseType OSE GFA for Policy,ThirdLargestPropertyUseType OSE GFA for Policy
0,88434.0,,
1,83880.0,15064.0,4622.0
2,756493.0,138635.0,0.0
3,61320.0,,
4,123445.0,68009.0,0.0


In [33]:
# replace NaN with 0
cleaned_building_data[['LargestPropertyUseType OSE GFA for Policy', 'SecondLargestPropertyUseType OSE GFA for Policy', 'ThirdLargestPropertyUseType OSE GFA for Policy']] = cleaned_building_data[['LargestPropertyUseType OSE GFA for Policy', 'SecondLargestPropertyUseType OSE GFA for Policy', 'ThirdLargestPropertyUseType OSE GFA for Policy']].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data[['LargestPropertyUseType OSE GFA for Policy', 'SecondLargestPropertyUseType OSE GFA for Policy', 'ThirdLargestPropertyUseType OSE GFA for Policy']] = cleaned_building_data[['LargestPropertyUseType OSE GFA for Policy', 'SecondLargestPropertyUseType OSE GFA for Policy', 'ThirdLargestPropertyUseType OSE GFA for Policy']].fillna(0)


In [34]:
cleaned_building_data['Total GFA for Policy'] = cleaned_building_data.apply(lambda building: building['LargestPropertyUseType OSE GFA for Policy'] + building['SecondLargestPropertyUseType OSE GFA for Policy'] + building['ThirdLargestPropertyUseType OSE GFA for Policy'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data['Total GFA for Policy'] = cleaned_building_data.apply(lambda building: building['LargestPropertyUseType OSE GFA for Policy'] + building['SecondLargestPropertyUseType OSE GFA for Policy'] + building['ThirdLargestPropertyUseType OSE GFA for Policy'], axis=1)


In [35]:
cleaned_building_data[cleaned_building_data['Total GFA for Policy'] == 0]

Unnamed: 0,OSEBuildingID,TaxParcelIdentificationNumber,DataYear,BuildingType,BuildingName,NaturalGas(kBtu),Electricity(kBtu),SteamUse(kBtu),PropertyGFATotal,PropertyGFABuilding(s),...,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,OSE Building Type,LargestPropertyUseType OSE,SecondLargestPropertyUseType OSE,ThirdLargestPropertyUseType OSE,LargestPropertyUseType OSE GFA for Policy,SecondLargestPropertyUseType OSE GFA for Policy,ThirdLargestPropertyUseType OSE GFA for Policy,Total GFA for Policy


In [36]:
# Calculate percent for each use type
cleaned_building_data['LargestPropertyUseType Percent GFA'] = cleaned_building_data.apply(lambda building: building['LargestPropertyUseType OSE GFA for Policy'] / building['Total GFA for Policy'], axis=1)
cleaned_building_data['SecondLargestPropertyUseType Percent GFA'] = cleaned_building_data.apply(lambda building: building['SecondLargestPropertyUseType OSE GFA for Policy'] / building['Total GFA for Policy'], axis=1)
cleaned_building_data['ThirdLargestPropertyUseType Percent GFA'] = cleaned_building_data.apply(lambda building: building['ThirdLargestPropertyUseType OSE GFA for Policy'] / building['Total GFA for Policy'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data['LargestPropertyUseType Percent GFA'] = cleaned_building_data.apply(lambda building: building['LargestPropertyUseType OSE GFA for Policy'] / building['Total GFA for Policy'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data['SecondLargestPropertyUseType Percent GFA'] = cleaned_building_data.apply(lambda building: building['SecondLargestPropertyUseType OSE GFA for Policy'] / building['Total GFA for Policy'], axis=1)
A value is trying to be set on a copy of a slice

In [37]:
# Add size classification

def classify_size(sq_ft):
    """
    Use letter classifications for building size instead of dealing with size ranges (>220k, 90-220k, etc.)
    """
    if sq_ft > 220000:
        return 'A'
    elif sq_ft > 90000:
        return 'B'
    elif sq_ft > 50000:
        return 'C'
    elif sq_ft > 30000:
        return 'D'
    elif sq_ft > 20000:
        return 'E'
    else:
        return 'F'
        
cleaned_building_data['sq_ft_classification'] = cleaned_building_data['Total GFA for Policy'].apply(lambda building: classify_size(building))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_building_data['sq_ft_classification'] = cleaned_building_data['Total GFA for Policy'].apply(lambda building: classify_size(building))


In [38]:
cleaned_building_data.to_csv('cleaned_building_data_with_policy_gfa_8_24.csv')

## Rechecking the data

When we ran this data through the model, our numbers were way off. We need to figure out why that is. Here's some exploring to see where the problem is occuring.

In [None]:
raw_2019_data = pd.read_csv('seattle_large_building_data_2019.csv')
processed_data = pd.read_csv('cleaned_building_data_with_policy_gfa.csv')

In [None]:
rmi_processed_data = pd.read_csv('../rmi_building_analysis_with_new_col_names.csv')

In [None]:
# Let's look to see if there are missing buildings

# buildings not in RMI data but in the raw data
len(set(raw_2019_data['OSEBuildingID']) - set(rmi_processed_data['OSEBuildingID']))

In [None]:
set(raw_2019_data['OSEBuildingID']) - set(rmi_processed_data['OSEBuildingID'])

In [None]:
raw_2019_data.columns

In [None]:
print(raw_2019_data[raw_2019_data['OSEBuildingID'] == 755][['BuildingName', 'TotalGHGEmissions', 'LargestPropertyUseType',
       'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseType',
       'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType',
       'ThirdLargestPropertyUseTypeGFA']])

OK, so the Medical Dental building is huge (~360K sq ft) and produces a lot of carbon (300K tons)

It's possible this is throwing us off. It may have been removed as an outlier from the RMI dataset.

Let's also look at what's in the RMI but not the public raw dataset.

In [None]:
len(set(rmi_processed_data['OSEBuildingID']) - set(raw_2019_data['OSEBuildingID']))

In [None]:
in_rmi_but_not_public_city_data = list(set(rmi_processed_data['OSEBuildingID']) - set(raw_2019_data['OSEBuildingID']))

In [None]:
rmi_processed_data[rmi_processed_data['OSEBuildingID'].isin(in_rmi_but_not_public_city_data)][['LargestPropertyUseType', 'SecondLargestPropertyUseType', 'ThirdLargestPropertyUseType']].head()

## Create matching public/RMI datasets

Create a copy of the RMI data and the public data (crunched above) that have the exact same buildings (only buildings found in both datasets).