## Data preparation

In [1]:
import math
import pandas as pd
import altair as alt

In [2]:
# Daily COVID figures, fetched from a gist URL that includes a revision hash.
covid_url = 'https://gist.githubusercontent.com/DanielKerrigan/f7baab69fa175bfbd5d10e38ad85b1b4/raw/992c7e2dc01dc5abf92ea4e0cc419e0d073a924b/covid.csv'

# Read the CSV and convert the 'day' strings to datetimes in a single chain.
df_covid = (
    pd.read_csv(covid_url, low_memory=False)
    .assign(day=lambda frame: pd.to_datetime(frame['day']))
)

In [3]:
# Preview the last five rows to confirm the load and the 'day' datetime conversion.
df_covid.tail()

Unnamed: 0,day,cases,deaths,new_cases,new_deaths,week_avg_new_cases,week_avg_new_deaths
240,2020-10-27,264428,23978,734.0,9.0,644.0,4.857143
241,2020-10-28,265263,23980,835.0,2.0,643.0,4.428571
242,2020-10-29,266321,23989,1058.0,9.0,718.285714,4.857143
243,2020-10-30,267227,23996,906.0,7.0,766.285714,4.714286
244,2020-10-31,267929,24001,702.0,5.0,759.142857,5.428571


This cell loads the 311 complaints data, parses the day into a datetime, and adds a column for the year.

In [4]:
# 311 complaints joined with census earnings, one row per (zcta, day).
complaints_url = 'https://gist.githubusercontent.com/DanielKerrigan/c0c8bd921a052bf6cdf87343773202ba/raw/31aa3dbcdc4904308480c79f22eb775c27f274e0/complaints_census_by_zip_day.csv'

# Parse 'day' first so that the calendar year can be derived from it.
df_311_zip_day = (
    pd.read_csv(complaints_url, low_memory=False)
    .assign(day=lambda frame: pd.to_datetime(frame['day']))
    .assign(year=lambda frame: frame['day'].dt.year)
)

df_311_zip_day.tail()

Unnamed: 0,zcta,day,num_complaints,num_noise_complaints,geoID,median_earning,full_time_median_earning,full_time_mean_earning,year
88228,11697,2020-10-21,2,0,8600000US11697,64286,83906,104101,2020
88229,11697,2020-10-27,1,0,8600000US11697,64286,83906,104101,2020
88230,11697,2020-10-29,1,0,8600000US11697,64286,83906,104101,2020
88231,11697,2020-10-30,1,0,8600000US11697,64286,83906,104101,2020
88232,11697,2020-10-31,1,1,8600000US11697,64286,83906,104101,2020


Next, we remove rows that don't have a median earning and then convert median earning to an integer.

In [5]:
# A '-' placeholder marks rows with no median earning; keep only the other rows.
has_median_earning = df_311_zip_day['median_earning'].ne('-')

# .assign returns a new frame, so the filtered selection is never modified in place.
df_311_zip_day = (
    df_311_zip_day.loc[has_median_earning]
    .assign(median_earning=lambda frame: frame['median_earning'].astype(int))
)

This cell calculates the percent difference in the number of noise complaints for each ZCTA from 2019 to 2020.

In [6]:
# calculate percent difference

# Per-(zcta, year) totals: noise complaints are summed, and the first
# median_earning value seen in each group is kept.
yearly_totals = df_311_zip_day.groupby(['zcta', 'year']).agg(
    {'num_noise_complaints': 'sum', 'median_earning': 'first'}
)

# Spread the years into columns, leaving one row per zcta.
per_zcta = yearly_totals.reset_index(level='year').pivot(
    columns='year',
    values=['num_noise_complaints', 'median_earning'],
)

# pivot produces (measure, year) column tuples; flatten them to '<year>_<measure>'.
per_zcta.columns = [
    f'{year}_{measure}'
    for measure, year in per_zcta.columns.values
]

noise_2019 = per_zcta['2019_num_noise_complaints']
noise_2020 = per_zcta['2020_num_noise_complaints']
per_zcta['percent_diff'] = (noise_2020 - noise_2019) / noise_2019

# duplicate median earning columns, drop one
df_311_diff = (
    per_zcta
    .drop(columns=['2019_median_earning'])
    .rename(columns={'2020_median_earning': 'median_earning'})
    .reset_index(level=['zcta'])
)

Check for outliers.

In [7]:
# Show the five rows with the largest percent_diff (biggest relative increases).
df_311_diff.nlargest(5, 'percent_diff')

Unnamed: 0,zcta,2019_num_noise_complaints,2020_num_noise_complaints,median_earning,percent_diff
72,10466,2233,66953,32202,28.98343
92,11109,63,573,100308,8.095238
81,10475,239,924,40044,2.866109
63,10457,5303,15659,21808,1.952857
33,10037,1088,3010,41651,1.766544


In [8]:
# Show the five rows with the smallest percent_diff (biggest relative decreases).
df_311_diff.nsmallest(5, 'percent_diff')

Unnamed: 0,zcta,2019_num_noise_complaints,2020_num_noise_complaints,median_earning,percent_diff
181,11697,75,12,64286,-0.84
171,11430,4,1,9412,-0.75
44,10282,113,58,162650,-0.486726
18,10022,1641,879,102353,-0.464351
82,11001,94,56,52029,-0.404255


In [9]:
# Median of the per-ZCTA percent changes. At this point in the notebook the
# outlier ZCTAs have not been filtered out yet.
percent_changes = df_311_diff.loc[:, 'percent_diff']
median_diff = percent_changes.median()
print(f'Median percent difference = {median_diff:.2%}')

Median percent difference = 50.84%


In [10]:
# remove outliers
# These are the two ZCTAs with the largest percent_diff in the nlargest output.
outliers = [10466, 11109]
is_outlier = df_311_diff['zcta'].isin(outliers)
df_311_diff = df_311_diff.loc[~is_outlier]

This cell sums the number of complaints by day and adds a column for the date with the year set to 1900. The purpose of this is to make it easier to visualize both years on the same line chart.

In [11]:
# remove outliers
without_outliers = df_311_zip_day.loc[~df_311_zip_day['zcta'].isin(outliers)]

# Collapse all ZCTAs into one row per calendar day.
daily_aggregations = {
    'num_noise_complaints': 'sum',
    'num_complaints': 'sum',
    'year': 'first',
}
df_311_by_day = without_outliers.groupby('day').agg(daily_aggregations).reset_index(level=['day'])


def _to_year_1900(timestamp):
    """Return the same month/day with the year replaced by 1900."""
    # NOTE(review): 1900 is not a leap year, so a Feb 29 input would raise here.
    return timestamp.replace(year=1900)


# A shared dummy year lets both years be drawn on the same x axis.
df_311_by_day['day_no_year'] = df_311_by_day['day'].apply(_to_year_1900)

df_311_by_day.tail()

Unnamed: 0,day,num_noise_complaints,num_complaints,year,day_no_year
485,2020-10-27,1525,7056,2020,1900-10-27
486,2020-10-28,1456,6659,2020,1900-10-28
487,2020-10-29,1055,6914,2020,1900-10-29
488,2020-10-30,1305,7696,2020,1900-10-30
489,2020-10-31,3014,7932,2020,1900-10-31


This cell calculates the percent difference in the number of noise complaints for all ZCTAs combined between 2019 and 2020.

In [12]:
# Total noise complaints per year, indexed by year.
# Select the column *before* summing: df_311_by_day also holds datetime columns
# ('day', 'day_no_year'), and summing the whole frame raises a TypeError on
# pandas >= 2.0, where groupby reductions no longer drop non-numeric columns.
sum_noise_complaints = df_311_by_day.groupby('year')['num_noise_complaints'].sum()

print(f'''Total number of noise complaints by year:
2019: {sum_noise_complaints[2019]}
2020: {sum_noise_complaints[2020]}''')

# Relative change from the 2019 total to the 2020 total.
percent_diff_complaints = ((sum_noise_complaints[2020] - sum_noise_complaints[2019])
                           / sum_noise_complaints[2019])

print(f'\nPercent difference from 2019 to 2020 = {percent_diff_complaints:.2%}')

Total number of noise complaints by year:
2019: 359640
2020: 569771

Percent difference from 2019 to 2020 = 58.43%


This cell calculates the 7-day rolling averages for the number of total complaints and number of noise complaints.

In [13]:
# 7-row rolling mean computed separately for each year, so that a window never
# mixes 2019 and 2020 rows. The first six rows of each year are NaN.
# NOTE(review): a 7-row window equals 7 days only if there is exactly one row
# per day in date order - confirm no days are missing.
#
# The columns are selected with a list: selecting several columns with a bare
# tuple (obj['a', 'b']) on a groupby object raises on pandas >= 2.0.
rolling_columns = ['num_complaints', 'num_noise_complaints']
rolling_averages = (
    df_311_by_day
    .groupby('year')[rolling_columns]
    .rolling(7)
    .mean()
    # Drop the 'year' index level so the result aligns with df_311_by_day's index.
    .reset_index(level=0, drop=True)
)

df_311_by_day['week_avg_num_complaints'] = rolling_averages['num_complaints']
df_311_by_day['week_avg_num_noise_complaints'] = rolling_averages['num_noise_complaints']

df_311_by_day

Unnamed: 0,day,num_noise_complaints,num_complaints,year,day_no_year,week_avg_num_complaints,week_avg_num_noise_complaints
0,2019-03-01,809,6597,2019,1900-03-01,,
1,2019-03-02,1203,5185,2019,1900-03-02,,
2,2019-03-03,1180,4975,2019,1900-03-03,,
3,2019-03-04,695,7032,2019,1900-03-04,,
4,2019-03-05,764,7703,2019,1900-03-05,,
...,...,...,...,...,...,...,...
485,2020-10-27,1525,7056,2020,1900-10-27,7259.142857,2335.142857
486,2020-10-28,1456,6659,2020,1900-10-28,7205.142857,2271.285714
487,2020-10-29,1055,6914,2020,1900-10-29,7151.571429,2121.428571
488,2020-10-30,1305,7696,2020,1900-10-30,7131.142857,1886.285714


## Complaint Charts

### Noise complaints

In [14]:
# 7-day rolling average of noise complaints, one line per year. The x axis uses
# day_no_year (dates with the year set to 1900) so both years overlap.
# The title previously read "avgerage"; the spelling is corrected here.
alt.Chart(df_311_by_day, title='7-day average of number of noise complaints by year').mark_line().encode(
    x=alt.X('day_no_year', title='Date'),
    y=alt.Y('week_avg_num_noise_complaints', title='Noise complaints 7-day average'),
    color='year:N',
)

In [15]:
# Axis definitions for the scatter plot: percent change on x, dollars on y.
percent_change_axis = alt.Axis(
    format='%',
    title='Percent change in noise complaints, Mar-Oct 2019 to Mar-Oct 2020',
)
median_earning_axis = alt.Axis(format='$~s', title='Median earning')

# One point per ZCTA.
scatter = (
    alt.Chart(title='ZCTA: noise complaints vs. median earning')
    .mark_point()
    .encode(
        x=alt.X('percent_diff', axis=percent_change_axis),
        y=alt.Y('median_earning', axis=median_earning_axis),
    )
)

# Vertical reference rule positioned by the calculated field 'a' (set to 0 below).
# https://github.com/altair-viz/altair/issues/1124
rule = alt.Chart().mark_rule(color="#e15759").encode(x='a:Q')

# Both layers share df_311_diff; transform_calculate adds the constant field 'a'.
alt.layer(rule, scatter, data=df_311_diff).transform_calculate(a="0")

In [16]:
# Largest absolute percent change, rounded up, so the colour domain is
# symmetric around zero.
largest_magnitude = df_311_diff['percent_diff'].abs().max()
abs_max = math.ceil(largest_magnitude)

color_scale = alt.Scale(
    scheme='redblue',
    domain=[-abs_max, abs_max],
    domainMid=0,
)
color_legend = alt.Legend(
    title='Percent change in noise complaints, Mar-Oct 2019 to Mar-Oct 2020',
    format='%',
    tickCount=5,
    orient='top',
    gradientLength=400,
    titleLimit=400,
)

# Colour encoding for percent_diff; it is also used by the map cell that follows.
diverging_color = alt.Color('percent_diff', scale=color_scale, legend=color_legend)

# One thin bar per ZCTA, sorted by descending median earning and coloured by
# percent change.
zcta_axis = alt.Axis(labels=False, tickSize=0)
earning_axis = alt.Axis(format='$~s', title='Median earning')

(
    alt.Chart(df_311_diff)
    .mark_bar(size=4, opacity=1)
    .encode(
        x=alt.X('zcta:N', axis=zcta_axis, sort='-y'),
        y=alt.Y('median_earning', axis=earning_axis),
        color=diverging_color,
    )
    .properties(width=alt.Step(5))
)

In [17]:
# https://stackoverflow.com/questions/55923300/how-can-i-make-a-map-using-geojson-data-in-altair
# Remote GeoJSON; the records are read from its 'features' property.
geojson_format = alt.DataFormat(property='features',type='json')
data_geojson_remote = alt.Data(
    url='https://gist.githubusercontent.com/DanielKerrigan/a726b9dd2db50a90b308f7a9915db531/raw/b876beaf7a7dc7e14ee8f1348092590e16cb0a78/nyc-geojson.json',
    format=geojson_format,
)

# Attach percent_diff to each feature by matching properties.zcta to df_311_diff.zcta.
percent_diff_lookup = alt.LookupData(data=df_311_diff, key='zcta', fields=['percent_diff'])

(
    alt.Chart(data_geojson_remote)
    .mark_geoshape(stroke='white')
    .encode(color=diverging_color)
    .transform_lookup(lookup='properties.zcta', from_=percent_diff_lookup)
    .properties(width=500, height=500)
)

### All complaints

The sharp drop in July 2019 is curious.

In [18]:
# 7-day rolling average of all 311 complaints, one line per year, drawn on the
# shared year-1900 date axis.
date_channel = alt.X('day_no_year', title='Date')
complaints_channel = alt.Y('week_avg_num_complaints', title='311 complaints 7-day average')

alt.Chart(df_311_by_day).mark_line().encode(
    x=date_channel,
    y=complaints_channel,
    color='year:N',
)

## COVID Charts

In [19]:
# 7-day average of new COVID cases over time.
covid_date_channel = alt.X('day', title='Date')
new_cases_channel = alt.Y('week_avg_new_cases', title='New Cases 7-day average')

alt.Chart(df_covid).mark_line().encode(x=covid_date_channel, y=new_cases_channel)