## Analysis of Missing Pet Reports and Fireworks Events in the United States

In [41]:
import pandas as pd
import duckdb

from scipy.stats import ttest_ind, describe, levene
import plotly.express as px

import researchpy

### Load daily report counts for the United States

In [42]:
# Daily report counts for the US, indexed by `event_date` (parsed as datetimes
# so the index can be binned by time further down).
daily_report_counts_us = pd.read_csv(
    './data/daily_report_counts_us.csv',
    index_col='event_date',
    parse_dates=True,
)

### Transform to weekly counts and add a rolling average (the delta from it is computed in the join below)

In [43]:
# Bucket the daily rows into consecutive 7-day bins and total each bin.
weekly_totals = daily_report_counts_us.groupby(pd.Grouper(freq='7D')).sum()

# Expose the bin start as a regular `week` column (used as the join key and
# plot axis later) and add a centered 5-week rolling mean of `reports`.
weekly_report_counts_us = weekly_totals.assign(
    week=lambda frame: frame.index,
    rolling_avg=lambda frame: frame['reports'].rolling(5, center=True).mean(),
)

In [44]:
# Firework event weeks, indexed by `event_week`; the index is also copied into
# a plain `week` column so it can be used as a join key.
firework_event_weeks = pd.read_csv(
    './data/firework_event_weeks.csv',
    index_col='event_week',
    parse_dates=True,
).assign(week=lambda frame: frame.index)

### Join weekly report data to weekly event data

In [45]:
wfig = px.line(weekly_data,
  x='week',
  y=['percent_of_avg'],
  title='Weekly Missing Pet Report Volume 2019 - 2022',
  width=720,
  labels={
    'value': 'Value',
    'week': 'Week Lost or Found'
  }
)
wfig.show()

In [46]:
weekly_data = duckdb.query('''
  select
    w.week,
    w.reports,
    w.rolling_avg,
    w.reports - w.rolling_avg as diff_from_avg,
    w.reports / w.rolling_avg as percent_of_avg,
    case when f.is_event = 1 then 'Event' else 'No Event' end as is_event,
    coalesce(f.event, '') as event
  from weekly_report_counts_us w
  left join firework_event_weeks f
    on f.week = w.week
  where w.reports - w.rolling_avg is not null
  order by w.week asc
''').df()
weekly_report_fig = px.line(weekly_data,
  x='week',
  y=['reports', 'diff_from_avg'],
  title='Weekly Missing Pet Report Volume 2019 - 2022',
  width=720,
  labels={
    'value': 'Value',
    'week': 'Week Lost or Found'
  }
)

weekly_report_fig.add_annotation(
  x='2021-12-31',
  y='1597',
  text='New Year\'s Eve'
)
weekly_report_fig.add_annotation(
  x='2020-07-03',
  y='2107',
  text='4th of July'
)
weekly_report_fig.show()

In [47]:
# Mean ratio of weekly reports to the rolling average across event weeks only.
event_mask = weekly_data['is_event'] == 'Event'
weekly_data.loc[event_mask, 'percent_of_avg'].mean()

1.1974131319149135

### Plot distributions

In [48]:
# Human-readable axis/legend names for the columns shown in the box plot.
box_labels = {
    'diff_from_avg': 'Difference from Average',
    'is_event': 'Event or No Event',
}

# One box per group (Event / No Event) of the weekly difference from the
# rolling average, with every individual week drawn as a point.
fig = px.box(
    weekly_data,
    x='is_event',
    y='diff_from_avg',
    color='is_event',
    points="all",
    title='Comparison of Report Volume Distributions between Events and No Events',
    labels=box_labels,
    width=720,
)
fig.show()

### Test for equal means

In [49]:
# Split the weekly rows into the two groups being compared.
event_weeks = weekly_data.loc[weekly_data['is_event'] == 'Event']
no_event_weeks = weekly_data.loc[weekly_data['is_event'] == 'No Event']

# The statistic under test in both groups: weekly difference from the rolling average.
event_diffs = event_weeks['diff_from_avg']
no_event_diffs = no_event_weeks['diff_from_avg']

# Descriptive statistics (count, mean, variance) for each group.
event_weeks_desc = describe(event_diffs)
print(f'Event weeks data sets has {event_weeks_desc.nobs} obs with a mean {event_weeks_desc.mean:.3f} and variance {event_weeks_desc.variance:.3f}')

no_event_weeks_desc = describe(no_event_diffs)
print(f'No Event weeks data sets has {no_event_weeks_desc.nobs} obs with a mean {no_event_weeks_desc.mean:.3f} and variance {no_event_weeks_desc.variance:.3f}')

# Levene's test for equal variances between the two groups; its result
# determines which form of the t-test is appropriate afterwards.
lev = levene(event_diffs, no_event_diffs)
print(f'Levene test for equal variance has p-value = {lev.pvalue:.3f} and statistic = {lev.statistic:.3f}')

Event weeks data sets has 10 obs with a mean 311.160 and variance 39198.603
No Event weeks data sets has 203 obs with a mean -14.771 and variance 4825.468
Levene test for equal variance has p-value = 0.000 and statistic = 17.822


In [50]:
# Compare mean difference-from-average between event and no-event weeks.
# equal_variances=False because the Levene test above rejected equal variances.
researchpy.ttest(
    event_weeks['diff_from_avg'],
    no_event_weeks['diff_from_avg'],
    equal_variances=False,
)


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



(        Variable      N        Mean          SD         SE   95% Conf.  \
 0  diff_from_avg   10.0  311.160000  197.986370  62.608787  169.529083   
 1  diff_from_avg  203.0  -14.771429   69.465591   4.875529  -24.384886   
 2       combined  213.0    0.530516  105.059981   7.198591  -13.659469   
 
      Interval  
 0  452.790917  
 1   -5.157971  
 2   14.720502  ,
                             Satterthwaite t-test   results
 0  Difference (diff_from_avg - diff_from_avg) =   325.9314
 1                          Degrees of freedom =     9.1095
 2                                           t =     5.1901
 3                       Two side test p value =     0.0005
 4                      Difference < 0 p value =     0.9997
 5                      Difference > 0 p value =     0.0003
 6                                   Cohen's d =     4.1091
 7                                   Hedge's g =     4.0945
 8                              Glass's delta1 =     1.6462
 9                           