In [335]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter 
from scipy import stats
import geopandas as gpd
from shapely.geometry import Point
import folium
from datetime import datetime, timedelta
import re

In [336]:
hub = pd.read_excel("data/Trash Hauler Report - Nov 2017 - Nov 2019 Final.xlsx", skiprows=1)

In [337]:
# #rename the columns
hub.columns = [y.lower() for y in [x.strip(' ') if x[-1:] == " " else re.sub(r'\W', '_', x) for x in hub.columns]]

In [338]:
metro = pd.read_csv("data/hubNashville__311__Service_Requests.csv", low_memory=False)

In [339]:
metro.columns = ['request_number' if "#" in x else re.sub(r'\W+', '_', x).lower() for x in metro.columns]

In [340]:
#This fuction works but it is time consuming 
# inProj = Proj(init='epsg:3857')
# outProj = Proj(init='epsg:4326')
# [transform(x, y) for x in hub.state_plan_x for y in hub.state_plan_y]

In [341]:
hub_geo = pd.merge(hub, metro[['request_number', 'latitude', 'longitude']], how='inner', on='request_number')
hub_geo = hub_geo.drop(['state_plan_x', 'state_plan_y'], axis=1)
hub_geo.loc[:, ('geometry')] = hub_geo.apply(lambda x: Point(x.longitude, x.latitude), axis=1)
hub_geo = gpd.GeoDataFrame(hub_geo, crs = {'init': 'epsg:4326'}, geometry = hub_geo['geometry'])

**Filter the misses**

In [342]:
hub_geo['description'] = hub_geo['description'].astype(str)
hub_geo['request'] = hub_geo['request'].astype(str)

In [343]:
hub_geo['request_description'] = hub_geo.request + hub_geo.description

In [344]:
hub_miss_geo = hub_geo[hub_geo['request_description'].str.contains(r'[Mm]iss', na=False)]

**standardize the address column**

In [345]:
hub_miss_geo['incident_address'] = hub_miss_geo['incident_address'].astype(str).str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [346]:
# split the address into two groups by the first',', the first group is the street type and the second is the content after',' 
matches = [re.search(r'(\w+),\s(.+)', x) if "," in x else re.search(r'.+\s(\w+)()', x) for x in hub_miss_geo.incident_address]

#assign the groups
for col, group in {'address': 1, 'post_type':2}.items():
    hub_miss_geo[col] = [x.group(group) if x != None else None for x in matches]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [347]:
suffix = pd.read_excel('data/street_suffix.xlsx')

In [348]:
suffix = suffix.dropna(how='any',axis=0).iloc[:,1:3].reset_index(drop=True)
suffix.abbrv = suffix.abbrv.str.lower()
suffix.street = suffix.street.str.lower()

In [349]:
suffix.head(1)

Unnamed: 0,abbrv,street
0,aly,alleealleyallyaly


In [350]:
for street in hub_miss_geo.address[hub_miss_geo.address.notnull()][0:2]:
    #print(street)
    for key, value in suffix.street.items():
        if street in value:
            street = suffix.abbrv[key]
            print(street)

pike
tpke
dr
drs


In [351]:
[suffix.abbrv[key] for street in hub_miss_geo.address[hub_miss_geo.address.notnull()][0:2]\
 for key, value in suffix.street.items() if street in value] 

['pike', 'tpke', 'dr', 'drs', 'radl', 'rpd', 'rd']

In [352]:
# strip remove the leading and trailing characters only
hub_miss_geo.loc[:,('incident_address')] = \
    hub_miss_geo['incident_address'].replace(to_replace= r",.+", value='', regex=True).str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [353]:
hub_miss_geo.incident_address.value_counts()

110 george l davis blvd     28
5135 hickory hollow pkwy    21
12546 old hickory blvd      21
3710 n natchez ct           20
802 crescent rd             19
                            ..
3741 hamilton church rd      1
925 chickasaw ave            1
130 donald st                1
571 huntington pkwy          1
340 harrison st              1
Name: incident_address, Length: 11098, dtype: int64

**dedup misses**

In [354]:
hub_miss_geo.trash_route = [str(x) if type(x) != float else x for x in hub_miss_geo.trash_route]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [356]:
hub_miss_geo['pickup_day'] = [int(x[1]) if type(x) != float else x for x in hub_miss_geo.trash_route]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)


In [357]:
hub_miss_geo['weekday'] = [x.weekday() for x in hub_miss_geo.date_opened]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [358]:
hub_miss_geo["differ"] = (hub_miss_geo.weekday - hub_miss_geo.pickup_day + 1) % 7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [359]:
def pickup_date():
    p_date = []
    for x in hub_miss_geo.date_opened:
      for y in hub_miss_geo.differ:
        if math.isnan(y):
            a = y
        else:
            a = x - timedelta(days = y)
        break
      p_date.append(a)
    return p_date

In [360]:
hub_miss_geo['pickup_date'] = pickup_date()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [361]:
dedup_miss = hub_miss_geo.groupby('pickup_date').first().reset_index()