In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import sys

In [2]:
data = pd.read_csv('data_sources/individual_cases_ontario.csv')

In [3]:
data.head()

Unnamed: 0,Row_ID,Accurate_Episode_Date,Case_Reported_Date,Test_Reported_Date,Specimen_Date,Age_Group,Client_Gender,Case_AcquisitionInfo,Outcome1,Outbreak_Related,Reporting_PHU_ID,Reporting_PHU,Reporting_PHU_Address,Reporting_PHU_City,Reporting_PHU_Postal_Code,Reporting_PHU_Website,Reporting_PHU_Latitude,Reporting_PHU_Longitude
0,1,2020-05-28,2020-06-03,2020-06-03,2020-06-02,40s,FEMALE,CC,Resolved,,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.647471,-79.708893
1,2,2020-05-24,2020-05-25,2020-05-25,2020-05-24,70s,MALE,CC,Resolved,,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.647471,-79.708893
2,3,2020-04-07,2020-04-14,2020-04-14,2020-04-12,40s,FEMALE,NO KNOWN EPI LINK,Resolved,,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.647471,-79.708893
3,4,2020-05-23,2020-05-29,2020-05-29,2020-05-28,50s,FEMALE,NO KNOWN EPI LINK,Resolved,,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.647471,-79.708893
4,5,2020-04-30,2020-05-05,2020-05-03,2020-05-02,70s,MALE,OB,Fatal,Yes,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.647471,-79.708893


In [4]:
data["Age_Group"].unique()

array(['40s', '70s', '50s', '60s', '30s', '80s', '20s', '<20', '90+',
       'UNKNOWN'], dtype=object)

In [5]:
print(data["Age_Group"].value_counts(dropna=False))

20s        60974
30s        46267
50s        43153
40s        41580
<20        38747
60s        27547
70s        14739
80s        12501
90+         7518
UNKNOWN       60
Name: Age_Group, dtype: int64


In [6]:
# drop rows that have missing or unkown age as there are <50 of them
data = data[data['Age_Group'].notna() & (data['Age_Group'] != "UNKNOWN")]

print(data["Age_Group"].value_counts(dropna=False))

20s    60974
30s    46267
50s    43153
40s    41580
<20    38747
60s    27547
70s    14739
80s    12501
90+     7518
Name: Age_Group, dtype: int64


In [7]:
data["Reporting_PHU"].unique()

array(['Peel Public Health', 'Halton Region Health Department',
       'Grey Bruce Health Unit',
       'Kingston, Frontenac and Lennox & Addington Public Health',
       'Wellington-Dufferin-Guelph Public Health',
       'Toronto Public Health', 'Hamilton Public Health Services',
       'Niagara Region Public Health Department',
       'Region of Waterloo, Public Health',
       'York Region Public Health Services',
       'Simcoe Muskoka District Health Unit',
       'Durham Region Health Department',
       'Haliburton, Kawartha, Pine Ridge District Health Unit',
       'Porcupine Health Unit', 'Thunder Bay District Health Unit',
       'North Bay Parry Sound District Health Unit',
       'Lambton Public Health', 'Renfrew County and District Health Unit',
       'Middlesex-London Health Unit', 'Ottawa Public Health',
       'Haldimand-Norfolk Health Unit', 'Eastern Ontario Health Unit',
       'Windsor-Essex County Health Unit', 'Southwestern Public Health',
       'Peterborough Pub

In [8]:
data["Case_Reported_Date"] = pd.to_datetime(data["Case_Reported_Date"], format="%Y-%m-%d" )

city_mask = ((data["Reporting_PHU"] == "Ottawa Public Health") \
             | (data["Reporting_PHU"] == "Toronto Public Health") \
             | (data["Reporting_PHU"] == "Peel Public Health") \
             | (data["Reporting_PHU"] == "York Region Public Health Services") \
             | (data["Reporting_PHU"] == "Durham Region Health Department") \
             | (data["Reporting_PHU"] == "Halton Region Health Department"))

start_date = datetime.strptime("2020-10-06", "%Y-%m-%d")
end_date = datetime.strptime("2021-02-06", "%Y-%m-%d")

four_month_mask = (start_date <= data["Case_Reported_Date"]) & (data["Case_Reported_Date"] <= end_date)

# Only tests for last 4 months and only Ottawa and Toronto
data_subset = data.loc[city_mask & four_month_mask]

print(data_subset['Case_Reported_Date'].min(), data_subset['Case_Reported_Date'].max())

# We can just use this subset instead of original csv
data_subset.to_csv("dimensions/data_subset.csv", index=False)

2020-10-06 00:00:00 2021-02-06 00:00:00


In [9]:
data_subset.head()

Unnamed: 0,Row_ID,Accurate_Episode_Date,Case_Reported_Date,Test_Reported_Date,Specimen_Date,Age_Group,Client_Gender,Case_AcquisitionInfo,Outcome1,Outbreak_Related,Reporting_PHU_ID,Reporting_PHU,Reporting_PHU_Address,Reporting_PHU_City,Reporting_PHU_Postal_Code,Reporting_PHU_Website,Reporting_PHU_Latitude,Reporting_PHU_Longitude
6322,6323,2021-01-29,2021-02-01,2021-02-01,2020-12-24,<20,MALE,CC,Resolved,,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.647471,-79.708893
6328,6329,2020-03-11,2020-12-05,2020-12-05,2020-09-29,20s,FEMALE,CC,Resolved,,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.647471,-79.708893
7558,7559,2020-03-23,2020-12-12,2020-12-12,2020-12-11,30s,FEMALE,CC,Resolved,,2270,York Region Public Health Services,17250 Yonge Street,Newmarket,L3Y 6Z1,www.york.ca/wps/portal/yorkhome/health/,44.048023,-79.480239
7887,7888,2020-04-29,2020-10-06,2020-10-06,2020-08-21,50s,MALE,NO KNOWN EPI LINK,Resolved,,2270,York Region Public Health Services,17250 Yonge Street,Newmarket,L3Y 6Z1,www.york.ca/wps/portal/yorkhome/health/,44.048023,-79.480239
26511,26512,2021-01-26,2021-02-06,2021-02-06,2020-09-24,80s,FEMALE,OB,Resolved,Yes,3895,Toronto Public Health,"277 Victoria Street, 5th Floor",Toronto,M5B 1W2,www.toronto.ca/community-people/health-wellnes...,43.656591,-79.379358


In [10]:
patient_dimension = pd.DataFrame(columns=["row_id", "acquisition_group", "outbreak_related", "gender", "age_group"])

In [11]:
y = len(data_subset)
for idx, row in data_subset.iterrows():
    sys.stdout.write("\r{0} out of {1} generated...".format(idx,y))
    patient_row = []
    patient_row.append(row["Row_ID"])
    patient_row.append(row["Case_AcquisitionInfo"])
    patient_row.append(row["Outbreak_Related"])
    patient_row.append(row["Client_Gender"])
    patient_row.append(row["Age_Group"])
    
    patient_dimension.loc[len(patient_dimension)] = patient_row

293084 out of 161219 generated...

In [12]:
patient_dimension.insert(0, "patient_key", np.arange(len(patient_dimension.index)))

In [13]:
# Fill outbreak related nan values with no 
patient_dimension['outbreak_related'].fillna('No', inplace=True)

In [14]:
patient_dimension.head()

Unnamed: 0,patient_key,row_id,acquisition_group,outbreak_related,gender,age_group
0,0,6323,CC,No,MALE,<20
1,1,6329,CC,No,FEMALE,20s
2,2,7559,CC,No,FEMALE,30s
3,3,7888,NO KNOWN EPI LINK,No,MALE,50s
4,4,26512,OB,Yes,FEMALE,80s


In [15]:
patient_dimension.to_csv("dimensions/patient_dimension.csv", index=False)