In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Widen pandas' display limits so long/wide frames are shown without truncation.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 100)

import warnings
warnings.filterwarnings("ignore") 

from datetime import datetime
from IPython.display import display, Markdown

In [2]:
# Run this cell to make Jupyter display the output of every expression
# statement in a cell, instead of just the last one.
# The value is assigned on the InteractiveShell class itself (not on an instance).

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Path to the Seattle Streets CSV, given relative to the current working directory.
file = "../data/Seattle_Streets.csv"
# Parse the CSV into the DataFrame used by the cells that follow.
df = pd.read_csv(filepath_or_buffer=file)

In [4]:
# Preview the first rows of the loaded frame to confirm the CSV parsed as expected.
df.head()

Unnamed: 0,OBJECTID,ARTCLASS,COMPKEY,UNITID,UNITID2,UNITIDSORT,UNITDESC,STNAME_ORD,XSTRLO,XSTRHI,ARTDESCRIPT,OWNER,STATUS,BLOCKNBR,SPEEDLIMIT,SEGDIR,ONEWAY,ONEWAYDIR,FLOW,SEGLENGTH,SURFACEWIDTH,SURFACETYPE_1,SURFACETYPE_2,INTRLO,DIRLO,INTKEYLO,INTRHI,DIRHI,NATIONHWYSYS,STREETTYPE,PVMTCONDINDX1,PVMTCONDINDX2,TRANCLASS,TRANDESCRIPT,SLOPE_PCT,PVMTCATEGORY,PARKBOULEVARD,SHAPE_Length
0,1,2.0,1006,10.0,120.0,100120.0,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,1ST AVE,SENECA ST,UNIVERSITY ST,Minor Arterial,,INSVC,1200.0,25.0,NW,N,,,306.0,48.0,PCC,AC/PCC,1ST AVE AND SENECA ST,NW,29611.0,1ST AVE AND UNIVERSITY ST,SE,N,Downtown Neighborhood,87.0,62.0,1,PRINCIPAL TRANSIT ROUTE,4.0,ART,N,305.96605
1,2,2.0,1009,10.0,150.0,100150.0,1ST AVE BETWEEN PIKE ST AND PINE ST,1ST AVE,PIKE ST,PINE ST,Minor Arterial,,INSVC,1500.0,25.0,NW,N,,,426.0,104.0,AC/PCC,PCC,1ST AVE AND PIKE ST,NW,29593.0,1ST AVE AND PINE ST,SE,N,Downtown Neighborhood,57.0,58.0,1,PRINCIPAL TRANSIT ROUTE,5.0,ART,N,426.031562
2,3,0.0,1032,15.0,80.0,150080.0,1ST AVE N BETWEEN VALLEY UPPER ST AND ALOHA ST,1ST AVE N,VALLEY UPPER ST,ALOHA ST,Not Designated,,INSVC,800.0,20.0,N,N,,,297.0,0.0,PCC,,1ST AVE N AND VALLEY UPPER ST,N,28897.0,1ST AVE N AND ALOHA ST,S,N,Neighborhood Yield Street,0.0,0.0,0,NOT DESIGNATED,17.0,NON-ART,N,297.147592
3,4,0.0,1051,15.0,230.0,150230.0,1ST AVE N BETWEEN LYNN ST AND MCGRAW S ST,1ST AVE N,LYNN ST,MCGRAW S ST,Not Designated,,INSVC,2200.0,20.0,N,N,,,175.0,25.0,AC,,1ST AVE N AND LYNN ST,N,28113.0,1ST AVE N AND MCGRAW S ST,S,N,Neighborhood Yield Street,9.0,0.0,0,NOT DESIGNATED,3.0,NON-ART,N,174.804983
4,5,0.0,1060,15.0,282.0,150282.0,1ST AVE N BETWEEN FULTON S ST AND FULTON N ST,1ST AVE N,FULTON S ST,FULTON N ST,Not Designated,,INSVC,2800.0,20.0,N,N,,,73.0,0.0,PCC,,1ST AVE N AND FULTON S ST,N,28051.0,1ST AVE N AND FULTON N ST,S,N,Neighborhood Yield Street,0.0,0.0,0,NOT DESIGNATED,5.0,NON-ART,N,73.110708


In [5]:
# Size of the loaded frame as (rows, columns).
df.shape

(23806, 38)

In [13]:
# List the column labels of the loaded frame.
df.columns

Index(['OBJECTID', 'ARTCLASS', 'COMPKEY', 'UNITID', 'UNITID2', 'UNITIDSORT',
       'UNITDESC', 'STNAME_ORD', 'XSTRLO', 'XSTRHI', 'ARTDESCRIPT', 'OWNER',
       'STATUS', 'BLOCKNBR', 'SPEEDLIMIT', 'SEGDIR', 'ONEWAY', 'ONEWAYDIR',
       'FLOW', 'SEGLENGTH', 'SURFACEWIDTH', 'SURFACETYPE_1', 'SURFACETYPE_2',
       'INTRLO', 'DIRLO', 'INTKEYLO', 'INTRHI', 'DIRHI', 'NATIONHWYSYS',
       'STREETTYPE', 'PVMTCONDINDX1', 'PVMTCONDINDX2', 'TRANCLASS',
       'TRANDESCRIPT', 'SLOPE_PCT', 'PVMTCATEGORY', 'PARKBOULEVARD',
       'SHAPE_Length'],
      dtype='object')

# ABOUT THE DATA FEATURES
<b>OBJECTID</b> - ESRI unique identifier

<b>ARTCLASS</b> - Arterial classification code:
    <ol>5 - Interstate Freeway
    <br>4 - State Highway
    <br>3 - Collector Arterial
    <br>2 - Minor Arterial
    <br>1 - Principal Arterial
    <br>0 - Not Designated (not an arterial)</ol>
    
<b>COMPKEY</b> - Primary key of the Street asset table, assigned
by the Hansen asset management system. 
    
<b>UNITID</b> - <i>N/A</i>  

<b>UNITID2</b> - <i>N/A</i>   

<b>UNITIDSORT</b> - Alpha-numeric Hansen unique identifier

<b>UNITDESC</b> - Structured description of the Street location

<b>STNAME_ORD</b> - Street segment name   

<b>XSTRLO</b> - Cross street at low end of segment

<b>XSTRHI</b> - Cross street at high end of segment    

<b>ARTDESCRIPT</b> - Arterial class code description   

<b>OWNER</b> - The organization that owns the street, if not the city  

<b>STATUS</b> - Current street status <i> INSVC = in service </i>  

<b>BLOCKNBR</b> - Identification number of the block that the street runs adjacent to

<b>SPEEDLIMIT</b> - Speed limit in MPH    

<b>SEGDIR</b> - Street segment direction   

<b>ONEWAY</b> - One Way Street (Y/N)    

<b>ONEWAYDIR</b> - One Way Street traffic flow direction

<b>FLOW</b> - One Way Street traffic flow classification    

<b>SEGLENGTH</b> - Street segment length in feet   

<b>SURFACEWIDTH</b> - Street segment width in feet     

<b>SURFACETYPE_1</b> - Primary pavement used on Street surface
    <ol>AC - Asphalt Concrete
    <br>PCC - Rigid Pavement
    <br>AC/PCC - Composite AC and PCC
    <br>ST - Bituminous Surface Treatment</ol>

<b>SURFACETYPE_2</b> - Secondary pavement used on Street surface     

<b>INTRLO</b> - Description of the intersection location with cross street at low address end of segment    

<b>DIRLO</b> - Relative direction of low address end of segment   

<b>INTKEYLO</b> - Intersection key at low address end of segment     

<b>INTRHI</b> - Description of the intersection location with cross street at high address end of segment      

<b>DIRHI</b> - Direction of high address end of segment   

<b>NATIONHWYSYS</b> - Whether the street is part of the National Highway System (Y/N)      

<b>STREETTYPE</b> - Street type classification (from Seattle Right of Way Improvements Manual) 

<b>PVMTCONDINDX1</b> - Primary pavement condition, out of 100     

<b>PVMTCONDINDX2</b> - Secondary pavement condition, out of 100     

<b>TRANCLASS</b> - Street transit classification   

<b>TRANDESCRIPT</b> - Transit class description

<b>SLOPE_PCT</b> - Street grade in slope percentage 

<b>PVMTCATEGORY</b> - <i>N/A</i> 

<b>PARKBOULEVARD</b> - <i>N/A</i> 
    
<b>SHAPE_Length</b> - ESRI field that stores information about the length of a feature in GIS     

In [12]:
# Profile each column: its dtype, the percentage of missing values, and the
# number of distinct values. Columns with the most missing data come first.
col_na = df.isna().sum().div(len(df)).mul(100).round(2)
col_nunique = df.nunique()

df_summary = pd.DataFrame(
    {
        "data_type": df.dtypes,
        "percent_missing_values": col_na,
        "total_unique_values": col_nunique,
    }
)
df_summary = df_summary.sort_values(by=["percent_missing_values"], ascending=False)
df_summary

Unnamed: 0,data_type,percent_missing_values,total_unique_values
SURFACETYPE_2,object,97.13,5
SURFACETYPE_1,object,5.03,6
STREETTYPE,object,3.33,12
TRANDESCRIPT,object,0.79,7
SLOPE_PCT,float64,0.08,34
INTRHI,object,0.05,14234
ONEWAY,object,0.05,2
SURFACEWIDTH,float64,0.04,94
INTKEYLO,float64,0.04,14110
INTRLO,object,0.04,14109
