In [None]:
# Requirements: run this cell first to install necessary Python libraries

! pip install boto3
! pip install s3fs
! conda install -y -c anaconda psycopg2
! pip install sodapy
! pip install sqlalchemy-redshift

In [None]:
import os
import sys
import pandas as pd

# Login name of the current user. Falls back to USERNAME (then to an empty string)
# so that an unset USER variable no longer aborts the notebook with a KeyError.
user = os.environ.get('USER') or os.environ.get('USERNAME', '')

# Folder holding the shared utility modules: "Box/DataViz Projects/Utility Code" under
# the user's home directory (this is /Users/<user>/Box/... on a standard macOS setup).
utility_code_dir = os.path.join(os.path.expanduser('~'), 'Box', 'DataViz Projects', 'Utility Code')
if utility_code_dir not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.insert(0, utility_code_dir)

# Must come after the sys.path change above. pull_df_from_socrata is not defined in
# this notebook, so presumably it is one of the names this star import provides.
from utils_io import *

# UrbanSim Data Set Exploration


Pythonic Approach to Managing Socrata Data Assets using SodaPy

**SodaPy source**:
https://github.com/xmunoz/sodapy

**Github Documentation**: [BayAreaMetro/DataServices](https://github.com/BayAreaMetro/DataServices/tree/master/Project-Documentation/mdm)

## Socrata Data Sets


### UrbanSim Buildings

**Documentation**: [UrbanSim Buildings](https://github.com/BayAreaMetro/DataServices/blob/master/Project-Documentation/mdm/land-people-mdm/buildings.md)

**Socrata dataset ID**: ahwz-jtst

**Socrata dataset**: [UrbanSim Buildings](https://data.bayareametro.gov/d/ahwz-jtst)

**Primary key**: `building_id`

**Foreign key**: `joinid`

### UrbanSim Parcels

**Documentation**: [UrbanSim Parcels](https://github.com/BayAreaMetro/DataServices/blob/master/Project-Documentation/mdm/land-people-mdm/urbansim_parcels.md)

**Socrata dataset ID**: 6q7r-gybw

**Socrata dataset**: [UrbanSim Parcels](https://data.bayareametro.gov/Cadastral/UrbanSim-Parcels/6q7r-gybw)

**Primary key**: `joinid`

### General Plan and Zoning 2018

**Socrata dataset ID**: udk3-z2d5

**Socrata dataset**: [General Plan and Zoning](https://data.bayareametro.gov/Land-Use/General-Plan-and-Zoning-2018/udk3-z2d5)

**Primary key**: `zoning_id` (matches `recid` in General Plan and Regional Zoning tables and `RecID` in Codes tables)

**Foreign key**: `joinid`


### General Plan Codes

**Socrata dataset ID**: vzcc-dhby

**Socrata dataset**: [General Plan Codes](https://data.bayareametro.gov/Land-Use/Regional-General-Plan-Codes-2018/vzcc-dhby)

**Primary key**: `RecID` (matches `zoning_id` in General Plan and Zoning 2018 table and `recid` in General Plan and Regional Zoning tables)


### General Plan

**Socrata dataset ID**: cc3g-fj4w

**Socrata dataset**: [General Plan](https://data.bayareametro.gov/Land-Use/View-based-on-Regional-General-Plan-Codes-2018/cc3g-fj4w)

**Primary key**: `recid` (matches `zoning_id` in General Plan and Zoning 2018 table and `RecID` in Codes tables)


### Regional Zoning Codes

**Socrata dataset ID**: qdrp-c5ra

**Socrata dataset**: [Regional Zoning Codes](https://data.bayareametro.gov/Land-Use/Regional-Zoning-Codes-2018/qdrp-c5ra)

**Primary key**: `RecID` (matches `zoning_id` in General Plan and Zoning 2018 table and `recid` in General Plan and Regional Zoning tables)

### Regional Zoning

**Socrata dataset ID**: q2p6-hbrp

**Socrata dataset**: [Regional Zoning](https://data.bayareametro.gov/Land-Use/View-of-Parcels-and-Regional-Zoning-2018/q2p6-hbrp)

**Primary key**: `recid` (matches `zoning_id` in General Plan and Zoning 2018 table and `RecID` in Codes tables)


## Socrata Credentials

Socrata credentials for this notebook are managed by Kaya Tollas (ktollas@bayareametro.gov)

# How to use this notebook:

This notebook shows how to identify datasets of interest on Socrata (based on their keyword tags), pull them into pandas DataFrames, and begin exploratory data analysis (EDA).

## 1. Find data sets of interest from Socrata

First pull the Socrata data assets table to get metadata for all data assets.

In [None]:
# Socrata ID of the data assets table, which holds metadata for all data assets
socrata_asset_table_id = 'rs6b-4exy'
# pull_df_from_socrata is not defined in this notebook; presumably it comes from the
# `utils_io` star import and returns a pandas DataFrame -- TODO confirm
socrata_asset_table = pull_df_from_socrata(socrata_asset_table_id)

In [None]:
# Asset-table columns that matter when choosing which datasets to pull
keep_cols = [
    'api_endpoint',
    'category',
    'dataset_link',
    'endpoint',
    'keywords',
    'name',
    'u_id',
    'visits',
]

# Keep only those columns, then preview the trimmed table
socrata_asset_table = socrata_asset_table[keep_cols]
socrata_asset_table.head()

Narrow the asset table down to the datasets of interest by matching a tag against the `keywords` column.

In [None]:
def filter_datasets_by_tag(filter_tag, asset_table=None):
    """Return the name, ID and keywords of every dataset whose keywords match a tag.

    Parameters
    ----------
    filter_tag : str
        Text to look for in the ``keywords`` column. It is handed to
        ``Series.str.contains`` unchanged, so pandas treats it as a
        case-sensitive regular expression (its default behaviour).
    asset_table : pandas.DataFrame, optional
        Asset metadata to search; needs ``name``, ``u_id`` and ``keywords``
        columns. When omitted, the notebook-level ``socrata_asset_table``
        is used.

    Returns
    -------
    pandas.DataFrame
        The ``name``, ``u_id`` and ``keywords`` columns of the matching rows.
        Rows whose keywords are missing never match.
    """
    if asset_table is None:
        asset_table = socrata_asset_table
    # na=False makes rows with missing keywords count as non-matches instead of
    # leaving NaN in the boolean mask.
    has_tag = asset_table['keywords'].str.contains(filter_tag, na=False)
    return asset_table.loc[has_tag, ['name', 'u_id', 'keywords']]

In [None]:
# Assets whose keywords contain 'urbansim' (name, u_id and keywords only)
urbansim_datasets = filter_datasets_by_tag('urbansim')
urbansim_datasets

In [None]:
# Assets whose keywords contain 'basis' (name, u_id and keywords only)
basis_tables = filter_datasets_by_tag('basis')
basis_tables

## 2. Pull these datasets into pandas DataFrames

#### Pull UrbanSim Buildings

In [None]:
# Socrata dataset ID for UrbanSim Buildings
urbansim_buildings_id = 'ahwz-jtst'
# Pull the dataset via the externally defined pull_df_from_socrata helper
urbansim_buildings = pull_df_from_socrata(urbansim_buildings_id)

In [None]:
# Preview the first rows to check which columns came back
urbansim_buildings.head()

In [None]:
# (rows, columns) of the pulled table; the trailing comment is the recorded result
urbansim_buildings.shape  # (3655207, 15)

In [None]:
# Number of distinct `apn` values; the trailing comment is the recorded result.
# NOTE(review): the section notes list `building_id` as the primary key and `joinid`
# as the foreign key, not `apn` -- confirm this is the intended column.
urbansim_buildings['apn'].nunique()  # 2507764

#### Pull UrbanSim Parcels

In [None]:
# Socrata dataset ID for UrbanSim Parcels
urbansim_parcels_id = '6q7r-gybw'
# Pull the dataset via the externally defined pull_df_from_socrata helper
urbansim_parcels = pull_df_from_socrata(urbansim_parcels_id)

In [None]:
# Preview the first rows to check which columns came back
urbansim_parcels.head()

In [None]:
# (rows, columns) of the pulled table; the trailing comment is the recorded result.
# NOTE(review): the recorded row count is identical to the one noted for
# urbansim_buildings (3655207) -- confirm it was not copied from that cell.
urbansim_parcels.shape  # (3655207, 7)

In [None]:
# Number of distinct `apn` values; the trailing comment is the recorded result.
# NOTE(review): the section notes list `joinid` as the primary key, not `apn` --
# confirm this is the intended column.
urbansim_parcels['apn'].nunique()  # 2643041

#### Pull General Plan and Zoning 2018

In [None]:
# Socrata dataset ID for General Plan and Zoning 2018
gp_zoning_id = 'udk3-z2d5'
# Pull the dataset via the externally defined pull_df_from_socrata helper
gp_zoning = pull_df_from_socrata(gp_zoning_id)

In [None]:
# Preview the first rows to check which columns came back
gp_zoning.head()

In [None]:
# (rows, columns) of the pulled table; the trailing comment is the recorded result
gp_zoning.shape  # (2142677, 11)

In [None]:
# Number of distinct `joinid` values (listed as this table's foreign key in the
# section notes); the trailing comment is the recorded result
gp_zoning['joinid'].nunique()  # 2091536

#### Pull General Plan Codes

In [None]:
# Socrata dataset ID for General Plan Codes
gp_codes_id = 'vzcc-dhby'
# Pull the dataset via the externally defined pull_df_from_socrata helper
gp_codes = pull_df_from_socrata(gp_codes_id)

In [None]:
# Preview the first rows to check which columns came back
gp_codes.head()

In [None]:
# (rows, columns) of the pulled table; the trailing comment is the recorded result
gp_codes.shape  # (2240, 15)

In [None]:
# Number of distinct `recid` values; the recorded result equals the recorded row
# count (2240), i.e. one row per key.
# NOTE(review): the section notes spell this key `RecID`, while the column read
# here is lowercase `recid` -- confirm which spelling the pulled table uses.
gp_codes['recid'].nunique()  # 2240

#### Pull General Plan

In [None]:
# Socrata dataset ID for General Plan
general_plan_id = 'cc3g-fj4w'
# Pull the dataset via the externally defined pull_df_from_socrata helper
general_plan = pull_df_from_socrata(general_plan_id)

In [None]:
# Preview the first rows to check which columns came back
general_plan.head()

In [None]:
# (rows, columns) of the pulled table; the trailing comment is the recorded result
general_plan.shape  # (334271, 16)

In [None]:
# Number of distinct `recid` values; the trailing comment is the recorded result.
# NOTE(review): 147 distinct values against 334271 recorded rows means `recid`
# does not identify rows uniquely here, although the section notes call it the
# primary key -- confirm.
general_plan['recid'].nunique() # 147

#### Pull Regional Zoning Codes

In [None]:
# Socrata dataset ID for Regional Zoning Codes
rz_codes_id = 'qdrp-c5ra'
# Pull the dataset via the externally defined pull_df_from_socrata helper
rz_codes = pull_df_from_socrata(rz_codes_id)

In [None]:
# Preview the first rows to check which columns came back
rz_codes.head()

In [None]:
# (rows, columns) of the pulled table; the trailing comment is the recorded result
rz_codes.shape  # (2979, 16)

In [None]:
# Number of distinct `recid` values; the recorded result equals the recorded row
# count (2979), i.e. one row per key.
# NOTE(review): the section notes spell this key `RecID`, while the column read
# here is lowercase `recid` -- confirm which spelling the pulled table uses.
rz_codes['recid'].nunique() # 2979

#### Pull Regional Zoning

In [None]:
# Socrata dataset ID for Regional Zoning
regional_zoning_id = 'q2p6-hbrp'
# Pull the dataset via the externally defined pull_df_from_socrata helper
regional_zoning = pull_df_from_socrata(regional_zoning_id)

In [None]:
# Preview the first rows to check which columns came back
regional_zoning.head()

In [None]:
# (rows, columns) of the pulled table; the trailing comment is the recorded result
regional_zoning.shape  # (148496, 17)

In [None]:
# Number of distinct `recid` values; the trailing comment is the recorded result.
# NOTE(review): 240 distinct values against 148496 recorded rows means `recid`
# does not identify rows uniquely here, although the section notes call it the
# primary key -- confirm.
regional_zoning['recid'].nunique() # 240

## 3. EDA

Your exploratory data analysis steps go here:

In [None]:
# Presumably a jurisdiction abbreviation to focus the EDA on (going by the name) --
# TODO confirm; no cell shown in this notebook reads it yet.
jurisdict = 'PA'

In [None]:
# Join general plan/zoning rows to regional zoning rows where zoning_id == recid
# (no `how` is given, so this is pandas' default inner join).
# NOTE(review): the recorded counts for regional_zoning (240 distinct `recid` in
# 148496 rows) mean each matching gp_zoning row is repeated once per matching
# regional_zoning row, so the result can be far larger than either input --
# confirm this is the intended key before running on the full tables.
jdf = gp_zoning.merge(regional_zoning, left_on='zoning_id', right_on='recid')
jdf.shape