# Reoccurance Notebook 
* How often to we return to the same census tract for Sanitation Requetss 

In [142]:
import intake 
import pandas as pd 
import geopandas as gpd 
import altair as alt

In [143]:
catalog = intake.open_catalog("../catalogs/*.yml")

In [144]:
df = catalog.care311.read()

In [145]:
print(f"there have been {len(df)} requests since {df.createddate.min()}")

there have been 105103 requests since 2015-08-05T10:37:55.000


In [146]:
print(f"of those, {len(df[~df['closeddate'].isnull()])} have been closed")

of those, 94526 have been closed


In [147]:
df['closeddate'] = pd.to_datetime(df.closeddate)
df['createddate'] = pd.to_datetime(df.createddate)

## Create a solve time

In [155]:
## time to solve
df = df.dropna(subset=['createddate','closeddate'])
df['solve_time'] = df[['createddate', 'closeddate']].diff(axis=1)['closeddate']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Make a chart

In [99]:
df = df.set_index('closeddate')
source = pd.DataFrame(df.groupby([pd.Grouper(freq='M'), 'createdbyuserorganization']).count()['srnumber'])

In [100]:
chart = alt.Chart(source.reset_index()).mark_bar().encode(
    x = alt.X('closeddate:O'),
    y= alt.Y('sum(srnumber)'),
    color='createdbyuserorganization'
).properties(
    title='Number of Homelessness Encampment SRs closed by Month'
)

chart.encoding.y.title = 'Number of SR Encampments'
chart.encoding.x.title = 'Month'
chart

In [104]:
# df = df.set_index('createddate')
source = pd.DataFrame(df.groupby([pd.Grouper(freq='M'), 'createdbyuserorganization']).count()['srnumber'])
chart = alt.Chart(source.reset_index()).mark_bar().encode(
    x = alt.X('createddate:O'),
    y= alt.Y('sum(srnumber)'),
    color='createdbyuserorganization'
).properties(
    title='Number of Homelessness Encampment SRs created by Month'
)

chart.encoding.y.title = 'Number of SR Encampments'
chart.encoding.x.title = 'Month'
chart

In [12]:
tracts = catalog.census_tracts_with_pop.read()

In [13]:
df = df[~df.geometry.isnull()]
print(f"length of DF with no nulls is {len(df)}")
df = df.to_crs(epsg=2229)

length of DF with no nulls is 105032


In [14]:
merged = gpd.sjoin(df, tracts, how="inner", op='intersects')
print(f'length of merged df is {len(merged)}')

  "(%s != %s)" % (left_df.crs, right_df.crs)


length of merged df is 104778


# Analysis 
* now that the dataset is merged, we want to calculate how long it takes for a ticket, on average, to get opened up again in the same tract
* There are two metrics we want to track here 1) `closed->next closure` and then `closed->next opened` 

In [197]:
## closed to next closure
# drop computed column if exists 
if 'closed2closed' in merged:
    merged = merged.drop('closed2closed', axis =1)

def compute_stat(group):
    """
    return the series and use in the apply 
    """
    return group.index.to_series().diff(periods=-1)
    
grouped = merged.sort_values('closeddate', ascending=True).set_index('closeddate').groupby('GEOID10', as_index="False")

s = pd.Series(grouped.apply(compute_stat))
s = abs(s)

In [None]:
# print per tract mean / median 
print(len(grouped))
print(grouped.srnumber.count().mean()
grouped.srnumber.count().median()

In [198]:
closed2closed = s.reset_index().rename({'level_1': 'closeddate', 0: 'closed2closed'}, axis=1)

ValueError: cannot insert closeddate, already exists

In [17]:
merged = merged.reset_index().rename({'level_0': 'closeddate'}, axis=1)

In [26]:
# compute closed to next open 

raw_data = df.reset_index()
def compute_next_open(row):
    """
    For each row, find the next opened request in the df. 
    """
    date = row.loc['closeddate']
    geoid = row.loc['GEOID10']
    x = merged[(merged.createddate > date) & (merged.GEOID10 == geoid)]
    next_date = x.createddate.min()
    if next_date: 
        return next_date - date
    else: 
        return None


In [27]:
s = merged.dropna(subset=['closeddate']).apply(compute_next_open, axis=1)
print(s.head())

5    07:24:41
7    07:23:20
11   07:22:49
14   07:22:56
15   07:23:54
dtype: timedelta64[ns]


In [28]:
## closed to open
# drop computed column if exists 
if 'closed2open' in merged:
    merged = merged.drop('closed2open', axis =1)

merged['closed2open'] = s

In [29]:
print(f'the average closed2open time is {merged.closed2open.mean()}')
print(f'the median closed2open time is {merged.closed2open.median()}')

the average closed2open time is 17 days 00:15:57.376440
the median closed2open time is 4 days 00:48:42.500000


In [30]:
## closed to next closure
# drop computed column if exists 
if 'closed2closed' in merged:
    merged = merged.drop('closed2closed', axis =1)
    
merged = closed2closed.drop(['GEOID10', 'closeddate'], axis = 1).join(merged)

In [170]:
print(f'the mean between is {merged.closed2closed.mean()}')
print(f'the median between is {merged.closed2closed.median()}')

the mean between is 11 days 11:23:55.427348
the median between is 0 days 00:00:16


## Maps / Stats

In [None]:
merged.groupby('GEOID10').describe()

In [132]:
source = (merged.set_index('createddate')
       .closed2closed
       .dropna()
       .dt.days
       .resample('M')
       .median()
)
source.head()

createddate
2015-08-31    0
2015-09-30    0
2015-10-31    0
2015-11-30    0
2015-12-31    0
Freq: M, Name: closed2closed, dtype: int64

In [133]:
source2 = (merged.set_index('createddate')
       .closed2open
       .dropna()
       .dt.days
       .resample('M')
       .median()
)
source2.head()

createddate
2015-08-31    25
2015-09-30    24
2015-10-31    20
2015-11-30    20
2015-12-31    20
Freq: M, Name: closed2open, dtype: int64

In [134]:
df = pd.concat([source, source2], axis=1)

In [136]:
reshaped = df.reset_index().groupby(['createddate']).apply(pd.melt).reset_index()
reshaped = reshaped[reshaped.variable != 'createddate']
reshaped

Unnamed: 0,createddate,level_1,variable,value
1,2015-08-31,1,closed2closed,0
2,2015-08-31,2,closed2open,25
4,2015-09-30,1,closed2closed,0
5,2015-09-30,2,closed2open,24
7,2015-10-31,1,closed2closed,0
...,...,...,...,...
155,2019-11-30,2,closed2open,1
157,2019-12-31,1,closed2closed,0
158,2019-12-31,2,closed2open,1
160,2020-01-31,1,closed2closed,0


In [137]:
chart = alt.Chart(reshaped).mark_line().encode(
    x = alt.X('createddate:T', axis = alt.Axis(title = 'Monthly Average', format = ("%b %Y"))),
    y='value',
    color='variable'
).properties(
    title='Key Metrics Over Time, Monthly Average'
)

chart.encoding.y.title = 'Days'
chart.encoding.x.title = 'Monthly Average'
chart