In [None]:
# Author: Alexander Gebreamlak
# Date: November 13, 2024

In [48]:
import pandas as pd
import os
pwd = os.getcwd()
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import json
import requests
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing


In [49]:
#Q1 PHU-level dataset assembled by hand from the 2021 Census - https://www12.statcan.gc.ca/census-recensement/index-eng.cfm
# COVID-19 case counts per Public Health Unit run through June 4th 2024 - https://data.ontario.ca/dataset/confirmed-positive-cases-of-covid-19-in-ontario/resource/455fd63b-603d-4608-8216-7d8647f43350
# COVID-19 death counts per Public Health Unit run through October 27th 2024 - https://www.publichealthontario.ca/en/Data-and-Analysis/Infectious-Disease/Respiratory-Virus-Tool

# The workbook is located relative to the working directory stored in `pwd`.
phu_workbook_path = pwd + "/data/PHU_Data.xlsx"
df = pd.read_excel(phu_workbook_path)

# Display the loaded frame as the cell output.
df


Unnamed: 0,PHU_unit,PHU_Num,Population,Female_Proportion,Total_Covid-19_Cases,Total_Covid-19_Deaths,Population_Over_65,Avg_Household_Size,Per_Capita_Income_Among_Recipients,Avg_Gross_Rent_For_Renter_Occupied_Dwellings,Visible_Minority_Proportion,In_Labour_Force
0,Algoma,2226,112764,0.508,14202,128,29900,2.2,48040,876,0.034,49840
1,Brant,2227,144937,0.509,16083,171,27955,2.5,50040,1198,0.127,73555
2,Chatham-Kent,2224,104316,0.508,12246,159,24600,2.3,46640,894,0.064,49015
3,Durham,2230,696992,0.512,78487,625,111080,2.8,56750,1506,0.361,367505
4,Eastern Ontario,2258,210276,0.506,22714,340,46960,2.4,50480,978,0.052,104310
5,Grey-Bruce,2233,174301,0.506,13525,163,45735,2.3,55050,1047,0.04,82790
6,Haldimand-Norfolk,2234,116706,0.504,12357,150,27565,2.5,49480,1066,0.041,56295
7,Hamilton,2237,569353,0.509,79516,750,104290,2.5,53750,1233,0.248,290990
8,Halton,2236,596637,0.513,61686,520,95490,2.8,71600,1904,0.351,320175
9,Haliburton Kawartha Pr,2235,189183,0.505,14946,213,55620,2.3,51160,1157,0.037,86520


In [50]:
#Q2 Mapping housing, socio-economic, and demographic factors from dataset above to previous dataset created
# Load the previously created dataset from the A4 workbook.
df_A4 = pd.read_excel(pwd + "/data/A4_df.xlsx")

# Right join on PHU_Num: every row of df_A4 is kept and the PHU-level columns of
# `df` are attached to it. Rows of df_A4 whose PHU_Num has no match in `df` are
# kept with NaN in the PHU-level columns.
# validate="one_to_many" makes pandas raise a MergeError if PHU_Num is repeated
# in `df`; without it a repeated key would silently duplicate df_A4 rows.
Q2_df = pd.merge(df, df_A4, on="PHU_Num", how="right", validate="one_to_many")

#Dropping duplicate column
# NOTE(review): presumably PHU_unit repeats the information in Health_unit - confirm.
Q2_df = Q2_df.drop(columns=["PHU_unit"])

# Display the merged frame as the cell output.
Q2_df

Unnamed: 0,PHU_Num,Population,Female_Proportion,Total_Covid-19_Cases,Total_Covid-19_Deaths,Population_Over_65,Avg_Household_Size,Per_Capita_Income_Among_Recipients,Avg_Gross_Rent_For_Renter_Occupied_Dwellings,Visible_Minority_Proportion,In_Labour_Force,LTC_Home,First_Case_Num_Days,First_Death_Num_Days,PRIV_BEDS,SEMI_BEDS,THREE_BEDS,BASIC_BEDS,CONCARE_BED,RES_BED,INT_BED,ACCREDITATION,YEAR_RENO,CON_Y,Total_Beds,PER_FEM_LTCR,PER_LTCR<65,PER_LTCR>85,PER_LTCR_DEMENTIA,PER_LTCR_CHF,PR_ANTIP_MED,PR_PREUL,PR_FELL,PR_PHYS_RES,PR_WRS_DEPR,PR_PAIN,PR_IMPRV_FX,PR_WRS_FX,PR_WRS_PAIN,Funded_CMI_2020_2021,Home_Level_CMI_2020_2021,Ownership_type,SSI_2019,SSI_NC_2019,IPAC_2019,IPAC_NC_2019,OAT_2019,OAT_NC_2019,SSI_2020,SSI_NC_2020,IPAC_2020,IPAC_NC_2020_COVID,IPAC_NC_2020,OAT_2020,OAT_NC_2020_COVID,OAT_NC_2020,CCFI_2019,CCFI_2020,RQI_2019,New_D_Retrofit_Beds,A_Beds,B_Beds,C_Beds,D_Upgrade_Beds,ELDCAP_Beds,APRV_SS_BEDS,LTCH_SIZE,LTC_LOC,Chain_ownership,Design,LHIN,City,Health_unit,Licensee,Management_firm,Had_outbreak,Local_incidence,Resident_death,Resident_deaths_per100beds,Had-outbreak_WAVE1,Local_incidence_WAVE1,Resident_deaths_WAVE1,Resident_deaths_per100beds_WAVE1,Had-outbreak_WAVE2,Local_incidence_WAVE2,Resident_deaths_WAVE2,Resident_deaths_per100beds_WAVE2
0,2246,477941.0,0.513,56138.0,754.0,111345.0,2.4,49360.0,1186.0,0.131,233555.0,"albright gardens homes, incorporated",90,377,167.0000,64.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,19.000000,231.0,65.6,9.7,55.9,75.0,11.0,26.6,2.8,15.4,6.7,38.9,1.9,26.419307,35.014827,10.052303,0.911800,1.101700,Non-Profit,3.0,2,1.0,0,0.0,NI,1.0,0,0.0,NI,NI,0.0,NI,NI,7.0,7.0,0.0,231.0,0.0,0.0,0.0,0.0,0.0,N,Large,Urban,Single home,Newer (>50% post-1972),Hamilton Niagara Haldimand Brant,Beamsville,Niagara Region Public Health Department,"Albright Gardens Homes, Incorporated",,YES,"Medium (1,000 - 2,000 cases per 100,000)",1.0,0.4,YES,"Medium (150-299 cases per 100,000 before Sept....",0.0,0.0,YES,Medium local incidence,1.0,0.4
1,3895,2794356.0,0.517,402323.0,5249.0,476985.0,2.4,62050.0,1562.0,0.550,1518420.0,seven oaks,90,90,137.0000,112.000000,0.000000,0.000000,17.0,2.0,0.0,1.0,0.0,32.000000,249.0,67.0,12.6,46.0,49.1,10.1,19.7,1.9,14.2,4.1,24.4,2.0,16.600000,27.400000,9.000000,1.041400,1.141500,Municipal,1.0,0,3.0,3,0.0,NI,0.0,NI,0.0,NI,NI,0.0,NI,NI,14.0,2.0,0.0,0.0,0.0,0.0,249.0,0.0,0.0,Y,Large,Urban,Large chain (10+ facilities),Older (>50% 1972 or earlier),Central East,Scarborough,Toronto Public Health,City of Toronto,,YES,"High (2,000+ cases per 100,000)",41.0,16.5,YES,"High (300+ cases per 100,000 before Sept. 1, 2...",41.0,16.5,YES,High local incidence,0.0,0.0
2,2266,307283.0,0.508,29736.0,251.0,53515.0,2.6,57400.0,1354.0,0.170,169205.0,royal terrace,90,935,61.8663,50.106115,0.000000,14.008993,0.0,0.0,0.0,0.0,0.0,32.000000,67.0,66.0,8.5,52.8,60.0,10.0,23.4,2.0,7.4,9.7,33.2,1.4,23.900000,32.800000,18.200000,1.119700,1.226800,For-Profit,0.0,NI,0.0,NI,0.0,NI,0.0,NI,0.0,NI,NI,0.0,NI,NI,4.0,4.0,0.0,0.0,0.0,67.0,0.0,0.0,0.0,N,Medium,Rural,Single home,Newer (>50% post-1972),Waterloo Wellington,Palmerston,Wellington-Dufferin-Guelph Public Health,Shanti Enterprises Limited,,YES,"Medium (1,000 - 2,000 cases per 100,000)",0.0,0.0,YES,"Medium (150-299 cases per 100,000 before Sept....",0.0,0.0,YES,Medium local incidence,0.0,0.0
3,2246,477941.0,0.513,56138.0,754.0,111345.0,2.4,49360.0,1186.0,0.131,233555.0,royal rose place,90,90,58.0000,38.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,5.000000,96.0,63.1,11.9,29.8,46.6,15.1,20.2,2.1,21.2,0.0,21.3,2.8,20.400000,37.400000,8.300000,0.980100,1.080900,For-Profit,0.0,NI,1.0,0,0.0,NI,0.0,NI,1.0,0,0,0.0,NI,NI,5.0,2.0,0.0,96.0,0.0,0.0,0.0,0.0,0.0,N,Medium,Urban,Single home,Newer (>50% post-1972),Hamilton Niagara Haldimand Brant,Welland,Niagara Region Public Health Department,Jarlette,,YES,"Medium (1,000 - 2,000 cases per 100,000)",20.0,20.8,YES,"Medium (150-299 cases per 100,000 before Sept....",20.0,20.8,YES,Medium local incidence,0.0,0.0
4,2230,696992.0,0.512,78487.0,625.0,111080.0,2.8,56750.0,1506.0,0.361,367505.0,reachview village,90,95,3.0000,24.000000,3.000000,72.000000,0.0,0.0,0.0,1.0,1.0,48.000000,100.0,55.2,10.4,39.9,69.4,12.5,14.5,1.8,23.8,0.0,8.9,0.0,40.900000,32.200000,5.900000,1.009600,1.066200,For-Profit,0.0,NI,1.0,0,0.0,NI,0.0,NI,1.0,0,0,0.0,NI,NI,2.0,3.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,N,Large,Urban,Large chain (10+ facilities),Older (>50% 1972 or earlier),Central East,Uxbridge,Durham Region Health Department,Revera Long Term Care Inc,,YES,"Medium (1,000 - 2,000 cases per 100,000)",17.0,17.0,YES,"Medium (150-299 cases per 100,000 before Sept....",17.0,17.0,YES,Medium local incidence,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,2233,174301.0,0.506,13525.0,163.0,45735.0,2.3,55050.0,1047.0,0.040,82790.0,hanover care centre,993,1006,6.0000,12.000000,0.000000,24.000000,0.0,0.0,0.0,0.0,0.0,34.871579,41.0,64.9,9.5,51.4,67.2,37.5,20.7,3.4,14.3,10.0,33.1,1.4,7.400000,35.600000,15.200000,1.027600,1.124700,For-Profit,0.0,NI,0.0,NI,0.0,NI,0.0,NI,0.0,NI,NI,0.0,NI,NI,2.0,3.0,0.0,0.0,0.0,41.0,0.0,0.0,0.0,N,Medium,Rural,Single home,Newer (>50% post-1972),South West,Hanover,Grey Bruce Health Unit,Hanover Nursing Home Limited,,NO,"Low (<1,000 cases per 100,000)",0.0,0.0,NO,"Low (<150 cases per 100,000 before Sept. 1, 2020)",0.0,0.0,NO,Low local incidence,0.0,0.0
522,4913,216533.0,0.506,20041.0,259.0,43740.0,2.4,55000.0,975.0,0.063,91500.0,chartwell aylmer long term care residence,997,1013,61.8663,0.000000,0.000000,14.008993,0.0,0.0,0.0,1.0,0.0,34.871579,64.0,70.1,1.9,47.7,62.7,20.6,1.3,0.7,17.8,0.0,24.4,2.6,36.700000,37.700000,8.100000,1.078500,1.138100,For-Profit,0.0,NI,0.0,NI,0.0,NI,0.0,NI,0.0,NI,NI,0.0,NI,NI,2.0,2.0,0.0,64.0,0.0,0.0,0.0,0.0,0.0,N,Medium,Rural,Large chain (10+ facilities),Newer (>50% post-1972),South West,Aylmer,Southwestern Public Health,Chartwell,,YES,"Medium (1,000 - 2,000 cases per 100,000)",0.0,0.0,NO,"Medium (150-299 cases per 100,000 before Sept....",0.0,0.0,YES,Medium local incidence,0.0,0.0
523,2261,202431.0,0.506,24575.0,268.0,43255.0,2.3,53800.0,1023.0,0.056,99750.0,the bignucolo residence,1001,1001,61.8663,50.106115,1.657468,14.008993,0.0,2.0,0.0,1.0,0.0,34.871579,18.0,66.7,0.0,50.0,22.7,8.2,55.7,0.6,17.3,2.9,20.0,3.5,31.500000,57.300000,4.700000,1.016712,1.106821,Non-Profit,0.0,NI,0.0,NI,0.0,NI,0.0,NI,0.0,NI,NI,0.0,NI,NI,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,Y,Small,Urban,Single home,Newer (>50% post-1972),North East,Chapleau,Sudbury & District Health Unit,Chapleau Health Services,,NO,"Low (<1,000 cases per 100,000)",0.0,0.0,NO,"Low (<150 cases per 100,000 before Sept. 1, 2020)",0.0,0.0,NO,Low local incidence,0.0,0.0
524,2249,77338.0,0.497,16338.0,85.0,14590.0,2.4,51160.0,939.0,0.025,36000.0,emo health centre,1046,1059,61.8663,50.106115,1.657468,14.008993,0.0,0.0,0.0,1.0,0.0,34.871579,12.0,62.5,0.0,62.5,46.2,15.4,51.1,2.2,15.6,0.0,41.3,32.0,42.900000,39.800000,8.800000,1.016712,1.106821,Non-Profit,0.0,NI,0.0,NI,0.0,NI,0.0,NI,0.0,NI,NI,0.0,NI,NI,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,N,Small,Rural,Chain (2-9 facilities),Newer (>50% post-1972),North West,Emo,Northwestern Health Unit,Riverside Health Care Facilities Inc.,,NO,"Low (<1,000 cases per 100,000)",0.0,0.0,NO,"Low (<150 cases per 100,000 before Sept. 1, 2020)",0.0,0.0,NO,Low local incidence,0.0,0.0


In [57]:
#Q3

# Pooled COVID-19 death rate per 100 beds across all facilities:
# total resident deaths divided by total beds, times 100.
Mean_C19_Death_rate = (Q2_df['Resident_death'].sum() / Q2_df['Total_Beds'].sum()) * 100

# Categorize each facility: 'A' when its own death rate is above the pooled
# rate, 'B' otherwise.
# Resident_deaths_per100beds is already expressed per 100 beds, so it is compared
# with the pooled rate directly. Dividing it by Total_Beds and multiplying by 100
# a second time (as this cell used to do) made the compared value depend on
# facility size and put it in a different unit than Mean_C19_Death_rate.
# A missing rate compares as False and therefore lands in 'B'.
Q2_df['Death_Rate_Category'] = np.where(
    Q2_df['Resident_deaths_per100beds'] > Mean_C19_Death_rate, 'A', 'B'
)

# NOTE: Q3_df is a second name for the same DataFrame object as Q2_df (not a
# copy), so any later in-place change to Q2_df is visible through Q3_df too.
Q3_df = Q2_df

Q3_df.head()

Unnamed: 0,PHU_Num,Population,Female_Proportion,Total_Covid-19_Cases,Total_Covid-19_Deaths,Population_Over_65,Avg_Household_Size,Per_Capita_Income_Among_Recipients,Avg_Gross_Rent_For_Renter_Occupied_Dwellings,Visible_Minority_Proportion,In_Labour_Force,LTC_Home,First_Case_Num_Days,First_Death_Num_Days,PRIV_BEDS,SEMI_BEDS,THREE_BEDS,BASIC_BEDS,CONCARE_BED,RES_BED,INT_BED,ACCREDITATION,YEAR_RENO,CON_Y,Total_Beds,PER_FEM_LTCR,PER_LTCR<65,PER_LTCR>85,PER_LTCR_DEMENTIA,PER_LTCR_CHF,PR_ANTIP_MED,PR_PREUL,PR_FELL,PR_PHYS_RES,PR_WRS_DEPR,PR_PAIN,PR_IMPRV_FX,PR_WRS_FX,PR_WRS_PAIN,Funded_CMI_2020_2021,Home_Level_CMI_2020_2021,Ownership_type,SSI_2019,SSI_NC_2019,IPAC_2019,IPAC_NC_2019,OAT_2019,OAT_NC_2019,SSI_2020,SSI_NC_2020,IPAC_2020,IPAC_NC_2020_COVID,IPAC_NC_2020,OAT_2020,OAT_NC_2020_COVID,OAT_NC_2020,CCFI_2019,CCFI_2020,RQI_2019,New_D_Retrofit_Beds,A_Beds,B_Beds,C_Beds,D_Upgrade_Beds,ELDCAP_Beds,APRV_SS_BEDS,LTCH_SIZE,LTC_LOC,Chain_ownership,Design,LHIN,City,Health_unit,Licensee,Management_firm,Had_outbreak,Local_incidence,Resident_death,Resident_deaths_per100beds,Had-outbreak_WAVE1,Local_incidence_WAVE1,Resident_deaths_WAVE1,Resident_deaths_per100beds_WAVE1,Had-outbreak_WAVE2,Local_incidence_WAVE2,Resident_deaths_WAVE2,Resident_deaths_per100beds_WAVE2,Death_Rate_Category
0,2246,477941.0,0.513,56138.0,754.0,111345.0,2.4,49360.0,1186.0,0.131,233555.0,"albright gardens homes, incorporated",90,377,167.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,231.0,65.6,9.7,55.9,75.0,11.0,26.6,2.8,15.4,6.7,38.9,1.9,26.419307,35.014827,10.052303,0.9118,1.1017,Non-Profit,3.0,2,1.0,0,0.0,NI,1.0,0,0.0,NI,NI,0.0,NI,NI,7.0,7.0,0.0,231.0,0.0,0.0,0.0,0.0,0.0,N,Large,Urban,Single home,Newer (>50% post-1972),Hamilton Niagara Haldimand Brant,Beamsville,Niagara Region Public Health Department,"Albright Gardens Homes, Incorporated",,YES,"Medium (1,000 - 2,000 cases per 100,000)",1.0,0.4,YES,"Medium (150-299 cases per 100,000 before Sept....",0.0,0.0,YES,Medium local incidence,1.0,0.4,B
1,3895,2794356.0,0.517,402323.0,5249.0,476985.0,2.4,62050.0,1562.0,0.55,1518420.0,seven oaks,90,90,137.0,112.0,0.0,0.0,17.0,2.0,0.0,1.0,0.0,32.0,249.0,67.0,12.6,46.0,49.1,10.1,19.7,1.9,14.2,4.1,24.4,2.0,16.6,27.4,9.0,1.0414,1.1415,Municipal,1.0,0,3.0,3,0.0,NI,0.0,NI,0.0,NI,NI,0.0,NI,NI,14.0,2.0,0.0,0.0,0.0,0.0,249.0,0.0,0.0,Y,Large,Urban,Large chain (10+ facilities),Older (>50% 1972 or earlier),Central East,Scarborough,Toronto Public Health,City of Toronto,,YES,"High (2,000+ cases per 100,000)",41.0,16.5,YES,"High (300+ cases per 100,000 before Sept. 1, 2...",41.0,16.5,YES,High local incidence,0.0,0.0,A
2,2266,307283.0,0.508,29736.0,251.0,53515.0,2.6,57400.0,1354.0,0.17,169205.0,royal terrace,90,935,61.8663,50.106115,0.0,14.008993,0.0,0.0,0.0,0.0,0.0,32.0,67.0,66.0,8.5,52.8,60.0,10.0,23.4,2.0,7.4,9.7,33.2,1.4,23.9,32.8,18.2,1.1197,1.2268,For-Profit,0.0,NI,0.0,NI,0.0,NI,0.0,NI,0.0,NI,NI,0.0,NI,NI,4.0,4.0,0.0,0.0,0.0,67.0,0.0,0.0,0.0,N,Medium,Rural,Single home,Newer (>50% post-1972),Waterloo Wellington,Palmerston,Wellington-Dufferin-Guelph Public Health,Shanti Enterprises Limited,,YES,"Medium (1,000 - 2,000 cases per 100,000)",0.0,0.0,YES,"Medium (150-299 cases per 100,000 before Sept....",0.0,0.0,YES,Medium local incidence,0.0,0.0,B
3,2246,477941.0,0.513,56138.0,754.0,111345.0,2.4,49360.0,1186.0,0.131,233555.0,royal rose place,90,90,58.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,96.0,63.1,11.9,29.8,46.6,15.1,20.2,2.1,21.2,0.0,21.3,2.8,20.4,37.4,8.3,0.9801,1.0809,For-Profit,0.0,NI,1.0,0,0.0,NI,0.0,NI,1.0,0,0,0.0,NI,NI,5.0,2.0,0.0,96.0,0.0,0.0,0.0,0.0,0.0,N,Medium,Urban,Single home,Newer (>50% post-1972),Hamilton Niagara Haldimand Brant,Welland,Niagara Region Public Health Department,Jarlette,,YES,"Medium (1,000 - 2,000 cases per 100,000)",20.0,20.8,YES,"Medium (150-299 cases per 100,000 before Sept....",20.0,20.8,YES,Medium local incidence,0.0,0.0,A
4,2230,696992.0,0.512,78487.0,625.0,111080.0,2.8,56750.0,1506.0,0.361,367505.0,reachview village,90,95,3.0,24.0,3.0,72.0,0.0,0.0,0.0,1.0,1.0,48.0,100.0,55.2,10.4,39.9,69.4,12.5,14.5,1.8,23.8,0.0,8.9,0.0,40.9,32.2,5.9,1.0096,1.0662,For-Profit,0.0,NI,1.0,0,0.0,NI,0.0,NI,1.0,0,0,0.0,NI,NI,2.0,3.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,N,Large,Urban,Large chain (10+ facilities),Older (>50% 1972 or earlier),Central East,Uxbridge,Durham Region Health Department,Revera Long Term Care Inc,,YES,"Medium (1,000 - 2,000 cases per 100,000)",17.0,17.0,YES,"Medium (150-299 cases per 100,000 before Sept....",17.0,17.0,YES,Medium local incidence,0.0,0.0,A


In [None]:
# TODO: remove rows before centering and standardizing (placeholder - this cell contains no code yet, so no rows are removed)


In [None]:
# Select only the continuous columns for centering and standardizing.
# PHU_Num has a numeric dtype but is the identifier used as the merge key, not a
# measurement, so it is left out (errors='ignore' keeps the cell working if the
# column is absent).
continuous_cols = (
    Q2_df
    .select_dtypes(include=['float64', 'int64'])
    .drop(columns=['PHU_Num'], errors='ignore')
)

# Centering and standardizing: subtract the mean and divide by the standard deviation.
# A column with zero spread would be divided by 0 and turn into NaN; dividing by
# 1 instead leaves such a column at its centered value of 0.
col_std = continuous_cols.std().replace(0, 1)
Q2_df_centered_standardized = (continuous_cols - continuous_cols.mean()) / col_std

# Replace the original numerical columns with the centered and standardized versions.
# NOTE(review): this overwrites Q2_df in place, so the raw values (including
# Resident_death and Total_Beds) are no longer available from Q2_df afterwards;
# cells that need raw units must run before this one.
Q2_df[continuous_cols.columns] = Q2_df_centered_standardized

Q2_df