In [1]:

# Data analysis
import pandas as pd
import numpy as np

# Plotting and Correlation Maths
import seaborn as sns
import scipy as sci

# Simple model development
import sklearn as sk

import matplotlib.pyplot as plt

import plotly as px


import os

In [42]:
# Load the merged CSV from the interim data folder and display the frame.
df = pd.read_csv(filepath_or_buffer='../data/interim/merged_df.csv')
df

Unnamed: 0,date,name,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,age_range,race,ethnicity_desc,accidental_exposure,victim_od_drug_id,susp_od_drug_desc,naloxone_administered,administration_id,survive,dose_unit
0,2018-01-07,Allegheny County,26.2,-4.2,10.1,18.1,-16.6,0.6,-3.8,57.2,...,30 - 39,BLACK,Unknown,N,3288,HEROIN,N,0,Y,0
1,2018-01-09,Allegheny County,35.8,28.0,32.5,34.7,24.1,26.8,26.6,78.9,...,40 - 49,WHITE,NON-HISPANIC OR NOT LATINO,N,527,HEROIN,Y,244,Y,0
2,2018-01-14,Allegheny County,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,...,40 - 49,WHITE,NON-HISPANIC OR NOT LATINO,N,800,HEROIN,Y,454,Y,0
3,2018-01-14,Allegheny County,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,...,20 - 24,WHITE,NON-HISPANIC OR NOT LATINO,N,804,HEROIN,Y,457,Y,0
4,2018-01-14,Allegheny County,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,...,30 - 39,WHITE,NON-HISPANIC OR NOT LATINO,N,799,HEROIN,Y,452,Y,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5308,2024-04-13,Allegheny County,59.2,44.5,51.1,59.2,37.4,47.9,33.0,51.5,...,20 - 24,WHITE,NON-HISPANIC OR NOT LATINO,N,55826,HEROIN,Y,25971,Y,4
5309,2024-04-13,Allegheny County,59.2,44.5,51.1,59.2,37.4,47.9,33.0,51.5,...,20 - 24,WHITE,NON-HISPANIC OR NOT LATINO,N,55825,FENTANYL,Y,25971,Y,4
5310,2024-04-13,Allegheny County,59.2,44.5,51.1,59.2,37.4,47.9,33.0,51.5,...,20 - 24,WHITE,NON-HISPANIC OR NOT LATINO,N,55825,FENTANYL,Y,25972,Y,4
5311,2024-04-14,Allegheny County,81.1,45.0,64.1,79.9,39.9,62.9,39.5,42.6,...,20 - 24,WHITE,NON-HISPANIC OR NOT LATINO,N,55849,HEROIN,Y,25981,Y,4


In [43]:
# List the column labels of the loaded frame.
df.columns

Index(['date', 'name', 'tempmax', 'tempmin', 'temp', 'feelslikemax',
       'feelslikemin', 'feelslike', 'dew', 'humidity', 'precip', 'precipcover',
       'preciptype', 'snow', 'snowdepth', 'cloudcover', 'uvindex',
       'severerisk', 'moonphase', 'icon', 'dose_count', 'incident_id',
       'incident_time', 'day', 'incident_county_name', 'incident_state',
       'victim_id', 'gender_desc', 'age_range', 'race', 'ethnicity_desc',
       'accidental_exposure', 'victim_od_drug_id', 'susp_od_drug_desc',
       'naloxone_administered', 'administration_id', 'survive', 'dose_unit'],
      dtype='object')

# Dropping Duplicates

The `incident_id` column contains duplicates.  Rows should be treated as duplicates when both the victim and the incident match; in that case only the first row is kept and the rest are dropped.

This will allow for instances where multiple drugs are found to be sifted out while preserving multiple people at a single event.

In [44]:
# Flag every row whose (victim_id, incident_id) pair already appeared in an earlier row.
df.duplicated(subset=['victim_id', 'incident_id'], keep='first')

0       False
1       False
2       False
3       False
4       False
        ...  
5308     True
5309     True
5310     True
5311    False
5312    False
Length: 5313, dtype: bool

In [45]:
# Report how many rows repeat an earlier (victim_id, incident_id) pair.
# The f-string is delimited with double quotes because reusing the enclosing
# quote character inside a replacement field is a SyntaxError before Python 3.12.
# Series.sum() counts the True flags without iterating in Python.
print(f"There are a total of {df.duplicated(subset=['victim_id', 'incident_id']).sum()} duplicated values.")

There are a total of 1629 duplicated values.


Drop the duplicated values but keep the first of them, since the event did happen.

In [46]:
# Keep only the first row for each (victim_id, incident_id) pair and
# discard the later repeats, then display the de-duplicated frame.
df2 = df.drop_duplicates(
    subset=['victim_id', 'incident_id'],
    keep='first',
)
df2

Unnamed: 0,date,name,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,age_range,race,ethnicity_desc,accidental_exposure,victim_od_drug_id,susp_od_drug_desc,naloxone_administered,administration_id,survive,dose_unit
0,2018-01-07,Allegheny County,26.2,-4.2,10.1,18.1,-16.6,0.6,-3.8,57.2,...,30 - 39,BLACK,Unknown,N,3288,HEROIN,N,0,Y,0
1,2018-01-09,Allegheny County,35.8,28.0,32.5,34.7,24.1,26.8,26.6,78.9,...,40 - 49,WHITE,NON-HISPANIC OR NOT LATINO,N,527,HEROIN,Y,244,Y,0
2,2018-01-14,Allegheny County,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,...,40 - 49,WHITE,NON-HISPANIC OR NOT LATINO,N,800,HEROIN,Y,454,Y,0
3,2018-01-14,Allegheny County,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,...,20 - 24,WHITE,NON-HISPANIC OR NOT LATINO,N,804,HEROIN,Y,457,Y,0
4,2018-01-14,Allegheny County,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,...,30 - 39,WHITE,NON-HISPANIC OR NOT LATINO,N,799,HEROIN,Y,452,Y,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5303,2024-04-11,Allegheny County,68.8,58.9,61.6,68.8,58.9,61.6,56.7,84.8,...,30 - 39,BLACK,NON-HISPANIC OR NOT LATINO,N,55808,FENTANYL,Y,25965,U,2
5306,2024-04-12,Allegheny County,59.4,46.8,50.8,59.4,39.8,46.5,46.3,84.5,...,25 - 29,WHITE,NON-HISPANIC OR NOT LATINO,N,55823,HEROIN,Y,25969,Y,4
5307,2024-04-13,Allegheny County,59.2,44.5,51.1,59.2,37.4,47.9,33.0,51.5,...,20 - 24,WHITE,NON-HISPANIC OR NOT LATINO,N,55826,HEROIN,Y,25972,Y,4
5311,2024-04-14,Allegheny County,81.1,45.0,64.1,79.9,39.9,62.9,39.5,42.6,...,20 - 24,WHITE,NON-HISPANIC OR NOT LATINO,N,55849,HEROIN,Y,25981,Y,4


# Dropping Unneeded Columns

Now that we've removed duplicates, we can drop the columns we don't need.

In [47]:
# Columns that have no relevance to our work and will be dropped
drop_cols = [
    'name',
    'moonphase',
    'icon',
    'dose_count',
    'incident_id',
    'incident_time',
    'day',
    'incident_county_name',
    'incident_state',
    'victim_id',
    'gender_desc',
    'age_range',
    'race',
    'ethnicity_desc',
    'accidental_exposure',
    'victim_od_drug_id',
    'susp_od_drug_desc',
    'naloxone_administered',
    'administration_id',
    'survive',
    'dose_unit',
]


In [48]:
# Remove every column named in drop_cols, then display the result
df3 = df2.drop(columns=drop_cols)
df3

Unnamed: 0,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,precipcover,preciptype,snow,snowdepth,cloudcover,uvindex,severerisk
0,2018-01-07,26.2,-4.2,10.1,18.1,-16.6,0.6,-3.8,57.2,0.000,0.00,none,0.0,0.1,43.9,2,0.0
1,2018-01-09,35.8,28.0,32.5,34.7,24.1,26.8,26.6,78.9,0.000,0.00,none,0.0,0.1,66.3,2,0.0
2,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,0.00,none,0.0,2.4,20.4,2,0.0
3,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,0.00,none,0.0,2.4,20.4,2,0.0
4,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,0.00,none,0.0,2.4,20.4,2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5303,2024-04-11,68.8,58.9,61.6,68.8,58.9,61.6,56.7,84.8,1.526,83.33,rain,0.0,0.0,95.1,3,10.0
5306,2024-04-12,59.4,46.8,50.8,59.4,39.8,46.5,46.3,84.5,0.266,83.33,rain,0.0,0.0,100.0,2,10.0
5307,2024-04-13,59.2,44.5,51.1,59.2,37.4,47.9,33.0,51.5,0.019,12.50,rain,0.0,0.0,54.5,9,10.0
5311,2024-04-14,81.1,45.0,64.1,79.9,39.9,62.9,39.5,42.6,0.001,4.17,rain,0.0,0.0,25.2,8,30.0


# Converting to Numbered Days

The date is non-cyclical; it simply increases.  We need the numeric day of the year instead (1 - 365, or 366 in a leap year).

In [49]:
# Check the type of the first date value (row label 0); the dates are stored as strings
type(df3.loc[0, 'date'])

str

In [50]:
# Convert the string dates to pandas datetimes, overwriting the 'date' column of df3
df3['date'] = pd.to_datetime(df3['date'])

In [51]:
# Confirm that the first date value (row label 0) is now a datetime
type(df3.loc[0, 'date'])

pandas._libs.tslibs.timestamps.Timestamp

In [52]:
# Convert the pandas datetime to the day of the year (1 - 365, or 366 in a leap year)
# and store it in a new 'num_date' column of df3
df3['num_date'] = df3['date'].dt.day_of_year

In [53]:
# Check the output: display df3 to confirm the 'num_date' column was added
df3

Unnamed: 0,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,precipcover,preciptype,snow,snowdepth,cloudcover,uvindex,severerisk,num_date
0,2018-01-07,26.2,-4.2,10.1,18.1,-16.6,0.6,-3.8,57.2,0.000,0.00,none,0.0,0.1,43.9,2,0.0,7
1,2018-01-09,35.8,28.0,32.5,34.7,24.1,26.8,26.6,78.9,0.000,0.00,none,0.0,0.1,66.3,2,0.0,9
2,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,0.00,none,0.0,2.4,20.4,2,0.0,14
3,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,0.00,none,0.0,2.4,20.4,2,0.0,14
4,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,0.00,none,0.0,2.4,20.4,2,0.0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5303,2024-04-11,68.8,58.9,61.6,68.8,58.9,61.6,56.7,84.8,1.526,83.33,rain,0.0,0.0,95.1,3,10.0,102
5306,2024-04-12,59.4,46.8,50.8,59.4,39.8,46.5,46.3,84.5,0.266,83.33,rain,0.0,0.0,100.0,2,10.0,103
5307,2024-04-13,59.2,44.5,51.1,59.2,37.4,47.9,33.0,51.5,0.019,12.50,rain,0.0,0.0,54.5,9,10.0,104
5311,2024-04-14,81.1,45.0,64.1,79.9,39.9,62.9,39.5,42.6,0.001,4.17,rain,0.0,0.0,25.2,8,30.0,105


# One-hot Encoding

There is only one categorical variable now, the `preciptype`.  Strangely, a single value can list several types at once, e.g. 'rain,snow,ice'.

How should we encode this?

In [54]:
# List the distinct preciptype values
df3['preciptype'].unique()

array(['none', 'rain', 'rain,snow', 'snow', 'rain,snow,ice',
       'rain,freezingrain', 'freezingrain,snow,ice',
       'rain,freezingrain,snow', 'freezingrain', 'freezingrain,snow'],
      dtype=object)

One possible option is to just take every different unique value and make it a one-hot.

We can see from the value_counts() that very few data points have combined tags, the most common combination being 'rain,snow'.

In [55]:
# Count how many rows carry each preciptype value
df3['preciptype'].value_counts()

preciptype
rain                      1761
none                      1373
rain,snow                  379
snow                       126
rain,freezingrain           12
freezingrain,snow           10
freezingrain                 9
rain,snow,ice                6
freezingrain,snow,ice        4
rain,freezingrain,snow       4
Name: count, dtype: int64

In [56]:
# Work on an independent deep copy so that df3 is left untouched
df4 = df3.copy(deep=True)

In [57]:
# Store preciptype as a pandas categorical, then summarise the column dtypes
df4 = df4.astype({'preciptype': 'category'})
df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3684 entries, 0 to 5312
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          3684 non-null   datetime64[ns]
 1   tempmax       3684 non-null   float64       
 2   tempmin       3684 non-null   float64       
 3   temp          3684 non-null   float64       
 4   feelslikemax  3684 non-null   float64       
 5   feelslikemin  3684 non-null   float64       
 6   feelslike     3684 non-null   float64       
 7   dew           3684 non-null   float64       
 8   humidity      3684 non-null   float64       
 9   precip        3684 non-null   float64       
 10  precipcover   3684 non-null   float64       
 11  preciptype    3684 non-null   category      
 12  snow          3684 non-null   float64       
 13  snowdepth     3684 non-null   float64       
 14  cloudcover    3684 non-null   float64       
 15  uvindex       3684 non-null   int64        

In [58]:
# Display df4 after the categorical conversion of preciptype
df4

Unnamed: 0,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,precipcover,preciptype,snow,snowdepth,cloudcover,uvindex,severerisk,num_date
0,2018-01-07,26.2,-4.2,10.1,18.1,-16.6,0.6,-3.8,57.2,0.000,0.00,none,0.0,0.1,43.9,2,0.0,7
1,2018-01-09,35.8,28.0,32.5,34.7,24.1,26.8,26.6,78.9,0.000,0.00,none,0.0,0.1,66.3,2,0.0,9
2,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,0.00,none,0.0,2.4,20.4,2,0.0,14
3,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,0.00,none,0.0,2.4,20.4,2,0.0,14
4,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,0.00,none,0.0,2.4,20.4,2,0.0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5303,2024-04-11,68.8,58.9,61.6,68.8,58.9,61.6,56.7,84.8,1.526,83.33,rain,0.0,0.0,95.1,3,10.0,102
5306,2024-04-12,59.4,46.8,50.8,59.4,39.8,46.5,46.3,84.5,0.266,83.33,rain,0.0,0.0,100.0,2,10.0,103
5307,2024-04-13,59.2,44.5,51.1,59.2,37.4,47.9,33.0,51.5,0.019,12.50,rain,0.0,0.0,54.5,9,10.0,104
5311,2024-04-14,81.1,45.0,64.1,79.9,39.9,62.9,39.5,42.6,0.001,4.17,rain,0.0,0.0,25.2,8,30.0,105


In [60]:
# One-hot encode preciptype into integer 0/1 indicator columns.
# preciptype is the only categorical column in df4, so naming it explicitly
# selects exactly the column the default selection would pick.
df5 = pd.get_dummies(df4, columns=['preciptype'], dtype=int)
df5

Unnamed: 0,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,preciptype_freezingrain,"preciptype_freezingrain,snow","preciptype_freezingrain,snow,ice",preciptype_none,preciptype_rain,"preciptype_rain,freezingrain","preciptype_rain,freezingrain,snow","preciptype_rain,snow","preciptype_rain,snow,ice",preciptype_snow
0,2018-01-07,26.2,-4.2,10.1,18.1,-16.6,0.6,-3.8,57.2,0.000,...,0,0,0,1,0,0,0,0,0,0
1,2018-01-09,35.8,28.0,32.5,34.7,24.1,26.8,26.6,78.9,0.000,...,0,0,0,1,0,0,0,0,0,0
2,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,...,0,0,0,1,0,0,0,0,0,0
3,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,...,0,0,0,1,0,0,0,0,0,0
4,2018-01-14,15.3,2.5,9.2,14.6,-6.0,3.5,0.5,68.0,0.000,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5303,2024-04-11,68.8,58.9,61.6,68.8,58.9,61.6,56.7,84.8,1.526,...,0,0,0,0,1,0,0,0,0,0
5306,2024-04-12,59.4,46.8,50.8,59.4,39.8,46.5,46.3,84.5,0.266,...,0,0,0,0,1,0,0,0,0,0
5307,2024-04-13,59.2,44.5,51.1,59.2,37.4,47.9,33.0,51.5,0.019,...,0,0,0,0,1,0,0,0,0,0
5311,2024-04-14,81.1,45.0,64.1,79.9,39.9,62.9,39.5,42.6,0.001,...,0,0,0,0,1,0,0,0,0,0


The other method would be to search each string for values and then for doubled-up values, 'rain, snow', we would set both rain and snow as true, rather than having 'rain, snow' set to true.

The downside to this is that 'rain, snow' correlates well to a temperature region of around 30 - 35 F, which correlates to a specific time of year.  You may not get this kind of inference from marking rain and snow individually.