In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import DataLoader
from wordcloud import WordCloud

import numpy as np
import seaborn as sns

from cities.utils.data_loader import ZoningDataset

smoke_test = "CI" in os.environ
n = 20 if smoke_test else 2000
num_samples = 10 if smoke_test else 1000
n_steps = 10 if smoke_test else 2000

from cities.utils.data_grabber import find_repo_root

root = find_repo_root()

In [2]:

residential = pd.read_csv(os.path.join(root, "data/minneapolis/sourced/residential_permits.csv"))

residential = residential.rename(columns={"parcel_id": "parcel",
                                           "YEAR": "year",
                                           "UNITS": "housing_units",
                                           "ACREAGE": "acreage",})

residential["parcel"] = pd.to_numeric(residential["parcel"], errors='coerce')
residential = residential.dropna(subset=["parcel"])
residential["parcel"] = residential["parcel"].astype(int)


residential["year"] = pd.to_numeric(residential["year"], errors='coerce')
residential['year'] = residential['year'].astype(int)




#TODO anything else to drop?
columns_to_drop = ['SDE_ID', 'CO_CODE', 'CTU_CODE', 'CTU_ID', 'COCTU_ID', 'CTU_NAME']
residential = residential.drop(columns=columns_to_drop)

print(residential.columns)


display(residential.head())

print(
residential.shape
)


residential['housing_units'] = residential['housing_units'].astype(int)





residential = residential[residential["year"] <= 2020]


Index(['parcel', 'year', 'TENURE', 'HOUSING_TY', 'HOUSING__1', 'RES_PERMIT',
       'ADDRESS', 'ZIP_CODE', 'ZIP_PLUS_4', 'NAME', 'BUILDINGS',
       'housing_units', 'AGE_RESTRI', 'MEMORY_CAR', 'ASSISTED', 'COM_OFF_RE',
       'acreage', 'SQF', 'PUBLIC_FUN', 'PERMIT_VAL', 'COMMUNITY_'],
      dtype='object')


Unnamed: 0,parcel,year,TENURE,HOUSING_TY,HOUSING__1,RES_PERMIT,ADDRESS,ZIP_CODE,ZIP_PLUS_4,NAME,...,housing_units,AGE_RESTRI,MEMORY_CAR,ASSISTED,COM_OFF_RE,acreage,SQF,PUBLIC_FUN,PERMIT_VAL,COMMUNITY_
0,2302924240095,2016,RNT,MF5,Multifamily (5 units or more),NU,10 2nd St SE,,,,...,72.0,0.0,0.0,0.0,,0.0,0.0,,14158749.0,Urban Center
1,2302924320836,2016,OWN,MF5,Multifamily (5 units or more),TF,100 3rd Ave S,,,The Carlyle,...,1.0,0.0,0.0,0.0,,0.0,0.0,,250000.0,Urban Center
2,2202924410102,2016,RNT,MF5,Multifamily (5 units or more),NU,100 Hennepin Ave,,,,...,156.0,0.0,0.0,0.0,,0.0,0.0,,31925920.0,Urban Center
3,3002923230134,2015,OWN,DTQ,"Duplex, triplex and quad",RM,1000 Essex St SE,,,,...,4.0,0.0,0.0,0.0,,0.0,0.0,,351000.0,Urban Center
4,2402924310002,2010,RNT,MF5,Multifamily (5 units or more),NU,1000 University Ave SE,55414.0,,FloCo Fusion,...,84.0,0.0,0.0,0.0,,0.0,0.0,,10811000.0,Urban Center


(4215, 21)


In [52]:


residential['census_tract'] = np.nan
display(residential.head())

years = sorted(residential['year'].unique().astype(int))
print(years)

Unnamed: 0,parcel,year,TENURE,HOUSING_TY,HOUSING__1,RES_PERMIT,ADDRESS,ZIP_CODE,ZIP_PLUS_4,NAME,...,AGE_RESTRI,MEMORY_CAR,ASSISTED,COM_OFF_RE,acreage,SQF,PUBLIC_FUN,PERMIT_VAL,COMMUNITY_,census_tract
0,2302924240095,2016,RNT,MF5,Multifamily (5 units or more),NU,10 2nd St SE,,,,...,0.0,0.0,0.0,,0.0,0.0,,14158749.0,Urban Center,
1,2302924320836,2016,OWN,MF5,Multifamily (5 units or more),TF,100 3rd Ave S,,,The Carlyle,...,0.0,0.0,0.0,,0.0,0.0,,250000.0,Urban Center,
2,2202924410102,2016,RNT,MF5,Multifamily (5 units or more),NU,100 Hennepin Ave,,,,...,0.0,0.0,0.0,,0.0,0.0,,31925920.0,Urban Center,
3,3002923230134,2015,OWN,DTQ,"Duplex, triplex and quad",RM,1000 Essex St SE,,,,...,0.0,0.0,0.0,,0.0,0.0,,351000.0,Urban Center,
4,2402924310002,2010,RNT,MF5,Multifamily (5 units or more),NU,1000 University Ave SE,55414.0,,FloCo Fusion,...,0.0,0.0,0.0,,0.0,0.0,,10811000.0,Urban Center,


[2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]


In [61]:
year = years[4]

mapping_path = os.path.join(root,
    f"data/minneapolis/sourced/parcel_to_census_tract_mappings/parcel_to_census_tract_ids_{year}.csv")


mapping_df = pd.read_csv(mapping_path)

assert mapping_df['parcel_id'].dtype == residential['parcel'].dtype

mapping_set = set(mapping_df['parcel_id'])
residential_set = set(residential['parcel'])

overlap = mapping_set.intersection(residential_set)
overlap_ratio = len(overlap) / len(residential_set)
print(f"overlap ratio for year {year}", overlap_ratio)

residential_year = residential[residential['year'] == year]
merged = residential_year.merge(mapping_df, how='left', left_on='parcel', right_on='parcel_id')

merged.set_index(residential_year.index, inplace=True)

print(f"nas in merged for year {year}", (merged['census_tract_id'].isna().sum()))


merged['census_tract_id'] = pd.to_numeric(merged['census_tract_id'], errors='coerce')
merged['parcel_id'] = pd.to_numeric(merged['parcel_id'], errors='coerce')

merged['census_tract_id'] = merged['census_tract_id'].fillna(0).astype(int)
merged['parcel_id'] = merged['parcel_id'].fillna(0).astype(int)


residential.loc[residential['year'] == year, 'census_tract'] = merged['census_tract_id']

print("res after", residential.loc[residential['year'] == year, 'census_tract'])


#residential['census_tract'] = residential['census_tract'].astype(int)

#count zeros in tract#print("zeros in tract", (residential['census_tract'] == 0).sum()/residential.shape[0])


overlap ratio for year 2013 0.9543307086614173


ValueError: Length mismatch: Expected 377 rows, received array of length 376