In [None]:
# Requirements: run this cell first to install necessary Python libraries

! pip install boto3
! pip install s3fs
! conda install -y -c anaconda psycopg2
! pip install sodapy
! pip install sqlalchemy-redshift

In [1]:
import getpass
import os
import sys

import pandas as pd

# Resolve the current username. Prefer $USER (the original behaviour) and fall
# back to getpass.getuser() so this cell does not raise KeyError in environments
# where USER is not exported (e.g. Windows or some container sessions).
user = os.environ.get('USER') or getpass.getuser()
# Put the shared Utility Code folder first on the import path so that modules
# stored there (presumably `utils_io`, imported next) can be found.
sys.path.insert(0, '/Users/{}/Box/DataViz Projects/Utility Code'.format(user))
from utils_io import *

# UrbanSim Data Set Exploration


Pythonic Approach to Managing Socrata Data Assets using SodaPy

**SodaPy source**:
https://github.com/xmunoz/sodapy

**Github Documentation**: [BayAreaMetro/DataServices](https://github.com/BayAreaMetro/DataServices/tree/master/Project-Documentation/mdm)

## Socrata Data Sets

### Parcels 2018

**Documentation**: [Parcels 2018](https://github.com/BayAreaMetro/DataServices/blob/master/Project-Documentation/mdm/land-people-mdm/parcels_2018.md)

**Socrata dataset ID**: fqea-xb6g

**Primary key**: 'joinid'

### UrbanSim Buildings

**Documentation**: [UrbanSim Buildings](https://github.com/BayAreaMetro/DataServices/blob/master/Project-Documentation/mdm/land-people-mdm/buildings.md)

**Socrata dataset ID**: rrrx-2reu

**Primary key**: 'building_id'

**Foreign key**: 'joinid'

### UrbanSim Parcels

**Documentation**: [UrbanSim Parcels](https://github.com/BayAreaMetro/DataServices/blob/master/Project-Documentation/mdm/land-people-mdm/urbansim_parcels.md)

**Socrata dataset ID**: 6q7r-gybw

**Primary key**: 'joinid'


### Socrata Credentials

Socrata credentials for this notebook are managed by Kaya Tollas (ktollas@bayareametro.gov).

# How to use this notebook:

This notebook shows how to identify datasets of interest (based on tags) on Socrata, pull data from Socrata into a pandas DataFrame, and begin exploratory data analysis (EDA).

## 1. Find data sets of interest from Socrata

First pull the Socrata data assets table to get metadata for all data assets.

In [2]:
# Socrata dataset ID of the data assets table (metadata for all data assets)
socrata_asset_table_id = 'rs6b-4exy'
# pull_df_from_socrata presumably comes from the `utils_io` star import and
# returns a pandas DataFrame -- TODO confirm against utils_io
socrata_asset_table = pull_df_from_socrata(socrata_asset_table_id)

pulling data in 1 chunks of 156 rows each
pulling chunk 0


In [None]:
# Metadata fields needed to pick out the datasets of interest
keep_cols = [
    'api_endpoint',
    'category',
    'dataset_link',
    'endpoint',
    'keywords',
    'name',
    'u_id',
    'visits',
]

# Keep only those fields, then preview the trimmed asset table
socrata_asset_table = socrata_asset_table.loc[:, keep_cols]
socrata_asset_table.head()

Filter the asset table down to the datasets of interest by keyword tag.

In [21]:
def filter_datasets_by_tag(filter_tag, asset_table=None):
    """Return the datasets whose ``keywords`` field contains ``filter_tag``.

    Parameters
    ----------
    filter_tag : str
        Text to look for in the ``keywords`` column. It is passed straight to
        ``Series.str.contains``, so it is matched as a case-sensitive regular
        expression anywhere inside the keyword string.
    asset_table : pandas.DataFrame, optional
        Table to filter; must have ``name``, ``u_id`` and ``keywords`` columns.
        Defaults to the notebook-level ``socrata_asset_table``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, restricted to ``name``, ``u_id`` and ``keywords``.
    """
    if asset_table is None:
        # Fall back to the table loaded earlier in the notebook
        asset_table = socrata_asset_table
    # na=False counts rows with missing keywords as non-matches, giving a clean
    # boolean mask instead of comparing a NaN-containing result to True.
    has_tag = asset_table['keywords'].str.contains(filter_tag, na=False)
    return asset_table.loc[has_tag, ['name', 'u_id', 'keywords']]

In [22]:
# Select the assets whose keywords contain 'urbansim' and display them
urbansim_datasets = filter_datasets_by_tag('urbansim')
urbansim_datasets

Unnamed: 0,name,u_id,keywords
19,UrbanSim Parcels,6q7r-gybw,"mdm,basis,urbansim"
28,Deprecated - Regional Land Use in 2019 (Draft),9tcv-7ybg,"plu,land use,urbansim"
113,Buildings,rrrx-2reu,"basis,mdm,urbansim"


## 2. Pull these datasets into pandas DataFrames

In [8]:
# Pull the UrbanSim Buildings dataset (UrbanSim Parcels is pulled in a later cell).
# Large pull: the recorded run fetched 37 chunks of 100,000 rows each.
urbansim_buildings_id = 'rrrx-2reu'
urbansim_buildings = pull_df_from_socrata(urbansim_buildings_id)

pulling data in 37 chunks of 100000 rows each
pulling chunk 0
pulling chunk 1
pulling chunk 2
pulling chunk 3
pulling chunk 4
pulling chunk 5
pulling chunk 6
pulling chunk 7
pulling chunk 8
pulling chunk 9
pulling chunk 10
pulling chunk 11
pulling chunk 12
pulling chunk 13
pulling chunk 14
pulling chunk 15
pulling chunk 16
pulling chunk 17
pulling chunk 18
pulling chunk 19
pulling chunk 20
pulling chunk 21
pulling chunk 22
pulling chunk 23
pulling chunk 24
pulling chunk 25
pulling chunk 26
pulling chunk 27
pulling chunk 28
pulling chunk 29
pulling chunk 30
pulling chunk 31
pulling chunk 32
pulling chunk 33
pulling chunk 34
pulling chunk 35
pulling chunk 36


In [None]:
# Preview the first few rows of the buildings table
urbansim_buildings.head()

In [15]:
# (rows, columns) of the buildings table
urbansim_buildings.shape

(3655207, 15)

In [16]:
# Pull the UrbanSim Parcels dataset.
# Large pull: the recorded run fetched 37 chunks of 100,000 rows each.
urbansim_parcels_id = '6q7r-gybw'
urbansim_parcels = pull_df_from_socrata(urbansim_parcels_id)

pulling data in 37 chunks of 100000 rows each
pulling chunk 0
pulling chunk 1
pulling chunk 2
pulling chunk 3
pulling chunk 4
pulling chunk 5
pulling chunk 6
pulling chunk 7
pulling chunk 8
pulling chunk 9
pulling chunk 10
pulling chunk 11
pulling chunk 12
pulling chunk 13
pulling chunk 14
pulling chunk 15
pulling chunk 16
pulling chunk 17
pulling chunk 18
pulling chunk 19
pulling chunk 20
pulling chunk 21
pulling chunk 22
pulling chunk 23
pulling chunk 24
pulling chunk 25
pulling chunk 26
pulling chunk 27
pulling chunk 28
pulling chunk 29
pulling chunk 30
pulling chunk 31
pulling chunk 32
pulling chunk 33
pulling chunk 34
pulling chunk 35
pulling chunk 36


In [None]:
# Preview the first few rows of the parcels table
urbansim_parcels.head()

In [17]:
# (rows, columns) of the parcels table
urbansim_parcels.shape

(3655207, 7)

## 3. EDA

Your exploratory data analysis steps go here: