In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import DataLoader
from wordcloud import WordCloud

import numpy as np
import seaborn as sns

from cities.utils.data_loader import ZoningDataset

smoke_test = "CI" in os.environ
n = 20 if smoke_test else 2000
num_samples = 10 if smoke_test else 1000
n_steps = 10 if smoke_test else 2000

from cities.utils.data_grabber import find_repo_root

root = find_repo_root()

In [2]:
parking = pd.read_csv(
    os.path.join(root, "data/minneapolis/processed/zoning_expanded.csv")
)

years_for_analysis = parking["year"].unique()


# parking["parcel_clean"] = pd.to_numeric(parking["parcel"], errors='coerce')
# residential["parcel_clean"] = pd.to_numeric(residential["parcel"], errors='coerce')
# print("Rows with NaN in residential['parcel_clean']:\n", residential[residential["parcel_clean"].isna()])

# parking["parcel"] = pd.to_numeric(parking["parcel"], errors='coerce')

# parking = parking.dropna(subset=["parcel"])

In [3]:
values = pd.read_csv(
    os.path.join(root, "data/minneapolis/sourced/property_values_total.csv")
)

column_years = [int(col) for col in values.columns if col.isdigit()]
print(column_years)


values = values.rename(columns={"parcel_id": "parcel"})

filtered_columns = [values.columns[0]] + [
    col for col in values.columns[1:] if int(col) in years_for_analysis
]

values = values[filtered_columns]


print("missing", values.isna().sum() / len(values))

# systematically around 5% of the data is missing,
display(values[values.isna().any(axis=1)])


# perhaps will be useful as a predictor
parcel = values.iloc[:, 0]
avg_values = values.iloc[:, 1:].mean(axis=1)
values_averaged = pd.DataFrame({"parcel": parcel, "avg_value": avg_values})
display(values_averaged)

# not always row-wise, future question: do we always have value for year when applied for
# for parcels that applied for the permit?

values_long = pd.melt(values, id_vars=["parcel"], var_name="year", value_name="value")
values_long["year"] = pd.to_numeric(values_long["year"])

display(values_long)
print("len values long", len(values_long))
# some redundancy in data, it's payment for model format flexibility

values_long = merged_df = pd.merge(
    values_long, values_averaged, on="parcel", how="left"
)

print("len values long", len(values_long))

display(values_long)

[2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
missing parcel    0.000000
2011      0.050908
2012      0.051982
2013      0.053854
2014      0.050359
2015      0.050571
2016      0.052085
2017      0.049475
2018      0.077542
2019      0.041513
2020      0.040796
dtype: float64


Unnamed: 0,parcel,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
4,2202924410163,,,,,,,,,,0.0
17,1602924430075,,,,,,,,,,
19,3502924340034,,,,,,,,,,
20,2602924210128,,,,,90000.0,,,,,
28,2602924111333,,,,,,,,,17500.0,17500.0
...,...,...,...,...,...,...,...,...,...,...,...
136710,2202924430253,,,0.0,134100.0,147500.0,251200.0,326600.0,,270600.0,270600.0
136722,2202924410381,,,,,,,,,,
136726,2202924140074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
136741,2202924140077,,,,,,,,,,


Unnamed: 0,parcel,avg_value
0,402924140049,104000.0
1,1102924340022,68840.0
2,202824230164,179200.0
3,2102824210141,271250.0
4,2202924410163,0.0
...,...,...
136772,1502824220126,223600.0
136773,502924130133,122250.0
136774,602823220153,202850.0
136775,1402824420191,248300.0


Unnamed: 0,parcel,year,value
0,402924140049,2011,99500.0
1,1102924340022,2011,69300.0
2,202824230164,2011,157000.0
3,2102824210141,2011,242000.0
4,2202924410163,2011,
...,...,...,...
1367765,1502824220126,2020,271500.0
1367766,502924130133,2020,165000.0
1367767,602823220153,2020,298000.0
1367768,1402824420191,2020,314000.0


len values long 1367770
len values long 1367770


Unnamed: 0,parcel,year,value,avg_value
0,402924140049,2011,99500.0,104000.0
1,1102924340022,2011,69300.0,68840.0
2,202824230164,2011,157000.0,179200.0
3,2102824210141,2011,242000.0,271250.0
4,2202924410163,2011,,0.0
...,...,...,...,...
1367765,1502824220126,2020,271500.0,223600.0
1367766,502924130133,2020,165000.0,122250.0
1367767,602823220153,2020,298000.0,202850.0
1367768,1402824420191,2020,314000.0,248300.0


In [4]:
residential = pd.read_csv(
    os.path.join(root, "data/minneapolis/sourced/residential_permits.csv")
)

print(residential["ACREAGE"])

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
4485    0.0
4486    0.0
4487    0.0
4488    0.0
4489    0.0
Name: ACREAGE, Length: 4490, dtype: float64


In [5]:
residential = residential.rename(
    columns={
        "parcel_id": "parcel",
        "YEAR": "year",
        "UNITS": "housing_units",
        "ACREAGE": "acreage",
    }
)

residential["parcel"] = pd.to_numeric(residential["parcel"], errors="coerce")
residential = residential.dropna(subset=["parcel"])
residential["parcel"] = residential["parcel"].astype(int)


residential["year"] = pd.to_numeric(residential["year"], errors="coerce")
residential["year"] = residential["year"].astype(int)


# TODO anything else to drop?
columns_to_drop = ["SDE_ID", "CO_CODE", "CTU_CODE", "CTU_ID", "COCTU_ID", "CTU_NAME"]
residential = residential.drop(columns=columns_to_drop)


print(residential.columns)


display(residential.head())

print(residential.shape)


residential["housing_units"] = residential["housing_units"].astype(int)


residential_for_missingness = residential[["parcel", "year"]].copy()

residential_for_predictors = residential[
    ["parcel", "year", "housing_units", "acreage"]
].copy()


# TODO standardize vars


display(residential_for_missingness.head())

Index(['parcel', 'year', 'TENURE', 'HOUSING_TY', 'HOUSING__1', 'RES_PERMIT',
       'ADDRESS', 'ZIP_CODE', 'ZIP_PLUS_4', 'NAME', 'BUILDINGS',
       'housing_units', 'AGE_RESTRI', 'MEMORY_CAR', 'ASSISTED', 'COM_OFF_RE',
       'acreage', 'SQF', 'PUBLIC_FUN', 'PERMIT_VAL', 'COMMUNITY_'],
      dtype='object')


Unnamed: 0,parcel,year,TENURE,HOUSING_TY,HOUSING__1,RES_PERMIT,ADDRESS,ZIP_CODE,ZIP_PLUS_4,NAME,...,housing_units,AGE_RESTRI,MEMORY_CAR,ASSISTED,COM_OFF_RE,acreage,SQF,PUBLIC_FUN,PERMIT_VAL,COMMUNITY_
0,2302924240095,2016,RNT,MF5,Multifamily (5 units or more),NU,10 2nd St SE,,,,...,72.0,0.0,0.0,0.0,,0.0,0.0,,14158749.0,Urban Center
1,2302924320836,2016,OWN,MF5,Multifamily (5 units or more),TF,100 3rd Ave S,,,The Carlyle,...,1.0,0.0,0.0,0.0,,0.0,0.0,,250000.0,Urban Center
2,2202924410102,2016,RNT,MF5,Multifamily (5 units or more),NU,100 Hennepin Ave,,,,...,156.0,0.0,0.0,0.0,,0.0,0.0,,31925920.0,Urban Center
3,3002923230134,2015,OWN,DTQ,"Duplex, triplex and quad",RM,1000 Essex St SE,,,,...,4.0,0.0,0.0,0.0,,0.0,0.0,,351000.0,Urban Center
4,2402924310002,2010,RNT,MF5,Multifamily (5 units or more),NU,1000 University Ave SE,55414.0,,FloCo Fusion,...,84.0,0.0,0.0,0.0,,0.0,0.0,,10811000.0,Urban Center


(4215, 21)


Unnamed: 0,parcel,year
0,2302924240095,2016
1,2302924320836,2016
2,2202924410102,2016
3,3002923230134,2015
4,2402924310002,2010


In [6]:
values_and_residential = values_long.merge(
    residential_for_missingness, on=["parcel", "year"], how="left", indicator=True
)


print(values_long.shape)
print(values_and_residential.shape)

display(values_and_residential.head())

print(values_and_residential["_merge"].value_counts())

print(1 - 3238 / 1365293, 1 - 42150 / 1339010)

values_long["applied"] = (values_and_residential["_merge"] == "both").astype(int)


display(values_long.head())
print(values_long.mean())

(1367770, 4)
(1368531, 5)


Unnamed: 0,parcel,year,value,avg_value,_merge
0,402924140049,2011,99500.0,104000.0,left_only
1,1102924340022,2011,69300.0,68840.0,left_only
2,202824230164,2011,157000.0,179200.0,left_only
3,2102824210141,2011,242000.0,271250.0,left_only
4,2202924410163,2011,,0.0,left_only


_merge
left_only     1365293
both             3238
right_only          0
Name: count, dtype: int64
0.9976283479077385 0.9685215196301745


Unnamed: 0,parcel,year,value,avg_value,applied
0,402924140049,2011,99500.0,104000.0,0
1,1102924340022,2011,69300.0,68840.0,0
2,202824230164,2011,157000.0,179200.0,0
3,2102824210141,2011,242000.0,271250.0,0
4,2202924410163,2011,,0.0,0


parcel       1.650724e+12
year         2.015500e+03
value        3.099398e+05
avg_value    3.164962e+05
applied      2.365895e-03
dtype: float64


In [7]:
columns_to_standardize = [
    "value",
    "avg_value",
]

standardization_dict = {}

for column in columns_to_standardize:
    values_long[column + "_std"] = (
        values_long[column] - values_long[column].mean()
    ) / values_long[column].std()
    standardization_dict[column] = {
        "mean": values_long[column].mean(),
        "std": values_long[column].std(),
    }


columns_to_factorize = [
    "parcel",
    "year",
]

for column in columns_to_factorize:
    values_long[column + "_id"] = pd.factorize(values_long[column])[0]


display(values_long.head())

print(standardization_dict)

Unnamed: 0,parcel,year,value,avg_value,applied,value_std,avg_value_std,parcel_id,year_id
0,402924140049,2011,99500.0,104000.0,0,-0.103874,-0.106238,0,0
1,1102924340022,2011,69300.0,68840.0,0,-0.118781,-0.123816,1,0
2,202824230164,2011,157000.0,179200.0,0,-0.075492,-0.068641,2,0
3,2102824210141,2011,242000.0,271250.0,0,-0.033536,-0.022621,3,0
4,2202924410163,2011,,0.0,0,,-0.158233,4,0


{'value': {'mean': 309939.8036399642, 'std': 2025904.484805896}, 'avg_value': {'mean': 316496.2241267014, 'std': 2000195.10766166}}


In [8]:
before = len(values_long)

values_long = values_long.dropna()

after = len(values_long)

print(f"dropped {before - after} out of {before} rows")

values_long.to_csv(
    os.path.join(root, "data/minneapolis/processed/values_long.csv"), index=False
)

dropped 70999 out of 1367770 rows


In [9]:
# indentify census tracts

values = pd.read_csv(os.path.join(root, "data/minneapolis/processed/values_long.csv"))

years = sorted(values["year"].unique().astype(int))

for year in years:

    print(year)
    mapping_path = os.path.join(
        root,
        f"data/minneapolis/sourced/parcel_to_census_tract_mappings/parcel_to_census_tract_ids_{year}.csv",
    )

    mapping_df = pd.read_csv(mapping_path)

    mapping_df = mapping_df.rename(
        columns={"parcel_id": "parcel", "census_tract_id": "census_tract"}
    )

    assert mapping_df["parcel"].dtype == values["parcel"].dtype

    values_year = values[values["year"] == year]

    if "census_tract" in values_year.columns:
        values_year = values_year.drop(columns=["census_tract"])

    duplicate_parcels = mapping_df[mapping_df.duplicated("parcel", keep=False)]
    if len(duplicate_parcels) > 0:
        print("duplicate_parels! will keep first!", duplicate_parcels)
        mapping_df = mapping_df.drop_duplicates(subset="parcel", keep="first")

    merged = values_year.merge(
        mapping_df, how="left", left_on="parcel", right_on="parcel"
    )
    merged.set_index(values_year.index, inplace=True)

    merged["census_tract"] = pd.to_numeric(merged["census_tract"], errors="coerce")
    merged["parcel"] = pd.to_numeric(merged["parcel"], errors="coerce")

    merged["census_tract"] = merged["census_tract"].fillna(0).astype(int)
    merged["parcel"] = merged["parcel"].fillna(0).astype(int)

    values.loc[values["year"] == year, "census_tract"] = merged["census_tract"]

values["census_tract"] = pd.to_numeric(values["census_tract"], errors="coerce")
values["census_tract"] = values["census_tract"].fillna(0).astype(int)

values.loc[values["census_tract"] == 0, "census_tract"] = np.nan

print(values["census_tract"].dtype)
print(
    "ratio of parcels without identified tract",
    values["census_tract"].isna().sum() / len(values),
)

before = len(values)

values_long = values.dropna()

after = len(values_long)

print(f"dropped {before - after} out of {before} rows")

values_long["census_tract"] = values["census_tract"].astype(int)

values_long.to_csv(
    os.path.join(root, "data/minneapolis/processed/values_long.csv"), index=False
)

display(values_long.head())

2011
2012
2013
duplicate_parels! will keep first!         Unnamed: 0         parcel  census_tract
53              53  1302924220092   27053102600
86              86  1502924140111   27053002400
87              87  1502924140110   27053002400
92              92  1902923440013   27053104000
79548        79548  1302924220092   27053102600
87806        87806  1502924140110   27053002400
87807        87807  1502924140111   27053002400
101913      101913  1902923440013   27053104000
2014
2015
2016
2017
2018
duplicate_parels! will keep first!         Unnamed: 0         parcel  census_tract
11              11  3602924440191   27053107400
12              12  3602924440191   27053107400
15              15  3602924440187   27053107400
16              16  3602924440187   27053107400
17              17  3602924440187   27053107400
...            ...            ...           ...
138713      138713   102824110028   27053108800
138714      138714   102824110028   27053108800
138715      138715   10282

Unnamed: 0,parcel,year,value,avg_value,applied,value_std,avg_value_std,parcel_id,year_id,census_tract
0,402924140049,2011,99500.0,104000.0,0,-0.103874,-0.106238,0,0,27053100400
1,1102924340022,2011,69300.0,68840.0,0,-0.118781,-0.123816,1,0,27053001700
2,202824230164,2011,157000.0,179200.0,0,-0.075492,-0.068641,2,0,27053008400
3,2102824210141,2011,242000.0,271250.0,0,-0.033536,-0.022621,3,0,27053111400
4,2702924241493,2011,144500.0,129600.0,0,-0.081662,-0.093439,5,0,27053105201


In [10]:
# tensor prep

# TODO save factorization dicts

# categorical
parcel_id = torch.tensor(values_long["parcel_id"].values, dtype=torch.long)
# census_tract_id = torch.tensor(values_long["census_tract_id"].values, dtype=torch.long)
# neighborhood_id = torch.tensor(values_long["neighborhood_id"].values, dtype=torch.long)
# ward_id = torch.tensor(values_long["ward_id"].values, dtype=torch.long)
# zone_id = torch.tensor(values_long["zone_id"].values, dtype=torch.long)
# limit_id = torch.tensor(values_long["limit_id"].values, dtype=torch.long)
# past_reform = torch.tensor(values_long["past_reform"].values, dtype=torch.long)

# day_id = torch.tensor(values_long["day"].values, dtype=torch.long)
year_id = torch.tensor(values_long["year_id"].values, dtype=torch.long)
# month_id = torch.tensor(values_long["month_id"].values, dtype=torch.long)
# day = torch.tensor(values_long["day"].values, dtype=torch.long)
applied = torch.tensor(values_long["applied"].values, dtype=torch.float)

# continuous


avg_value_original = torch.tensor(values_long["avg_value"].values, dtype=torch.float)
value_original = torch.tensor(values_long["value"].values, dtype=torch.float)

avg_value = torch.tensor(values_long["avg_value_std"].values, dtype=torch.float)
value = torch.tensor(values_long["value_std"].values, dtype=torch.float)


# keep the orginals for interpretable evaluation
# parcel_area_original = torch.tensor(values_long["parcel_area"].values, dtype=torch.float)
# car_parking_original = torch.tensor(values_long["car_parking"].values, dtype=torch.float)
# housing_units_original = torch.tensor(values_long["housing_units"].values, dtype=torch.float)
# distance_to_transit_original = torch.tensor(
#    values_long["distance_to_transit"], dtype=torch.float
# )


# parcel_area = torch.tensor(values_long["parcel_area_std"].values, dtype=torch.float)
# car_parking = torch.tensor(values_long["car_parking_std"].values, dtype=torch.float)
# housing_units = torch.tensor(values_long["housing_units_std"].values, dtype=torch.float)
# limit_con = torch.tensor(values_long["limit_con"].values, dtype=torch.float)
# distance_to_transit = torch.tensor(values_long["distance_to_transit_std"], dtype=torch.float)

categorical = {
    "parcel_id": parcel_id,
    "applied": applied,
    #    "census_tract_id": census_tract_id,
    #   "neighborhood_id": neighborhood_id,
    #   "ward_id": ward_id,
    #   "zone_id": zone_id,
    #   "limit_id": limit_id,
    #   "past_reform": past_reform,
    "year": year_id,
    #   "month": month_id,
    #   "day": day_id,
}

continuous = {
    #    "parcel_area": parcel_area,
    #    "car_parking": car_parking,
    #    "housing_units": housing_units,
    #   "limit_con": limit_con,
    #    "distance_to_transit": distance_to_transit,
    #    "distance_to_transit_original": distance_to_transit_original,
    #    "parcel_area_original": parcel_area_original,
    #    "car_parking_original": car_parking_original,
    #   "housing_units_original": housing_units_original,
    "avg_value": avg_value,
    "value": value,
    "avg_value_original": avg_value_original,
    "value_original": value_original,
}

In [11]:
values_dataset = ZoningDataset(
    categorical, continuous, standardization_dictionary=standardization_dict
)

values_data_path = os.path.join(root, "data/minneapolis/processed/values_dataset.pt")

torch.save(values_dataset, values_data_path)


values_dataset_read = torch.load(values_data_path)

values_loader = DataLoader(
    values_dataset_read, batch_size=4, shuffle=True
)  # , collate_fn=zoning_dataset_read.collate)

for batch in values_loader:
    print(batch)
    break

print(values_dataset_read.n)

{'categorical': {'parcel_id': tensor([121358,  40671,   1542,  72542]), 'applied': tensor([0., 0., 0., 0.]), 'year': tensor([0, 7, 4, 6])}, 'continuous': {'avg_value': tensor([-0.1582, -0.0528, -0.1582, -0.0675]), 'value': tensor([-0.1530, -0.0306, -0.1530, -0.0696]), 'avg_value_original': tensor([     0., 210800.,      0., 181400.]), 'value_original': tensor([     0., 248000.,      0., 169000.])}}
1296771


In [14]:
# #are parcel in parking all in residential?
# print(
#     parking["parcel"].isin(residential["parcel"]).sum()
# )

# merged_df = pd.merge(parking, residential, on="parcel", how="outer")


# merged_df.columns

# #all_parcels_in_residential = len(merged_df) == len(parking)
# #print(all_parcels_in_residential)