In [None]:
# Author: Alexander Gebreamlak
# Date: November 13, 2024

In [34]:
import pandas as pd
import os
# Working directory at the time this cell runs; the data-loading cells build
# their Excel paths from it as pwd + "/data/<file>.xlsx".
pwd = os.getcwd()
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import json
import requests
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing


In [None]:
# Q1 - Hand-assembled Public Health Unit (PHU) dataset. Sources:
#   * Census data from 2021:
#     https://www12.statcan.gc.ca/census-recensement/index-eng.cfm
#   * COVID-19 case data by PHU, collected until June 4th 2024:
#     https://data.ontario.ca/dataset/confirmed-positive-cases-of-covid-19-in-ontario/resource/455fd63b-603d-4608-8216-7d8647f43350
#   * COVID-19 death data by PHU, collected until October 27th 2024:
#     https://www.publichealthontario.ca/en/Data-and-Analysis/Infectious-Disease/Respiratory-Virus-Tool
df = pd.read_excel(io=pwd + "/data/PHU_Data.xlsx")

# Preview the first five rows as the cell output.
df.head(n=5)


Unnamed: 0,PHU_unit,PHU_Num,Population,Female_Proportion,Total_Covid-19_Cases,Total_Covid-19_Deaths,Population_Over_65,Avg_Household_Size,Per_Capita_Income_Among_Recipients,Avg_Gross_Rent_For_Renter_Occupied_Dwellings,Visible_Minority_Proportion,In_Labour_Force
0,Algoma,2226,112764,0.508,14202,128,29900,2.2,48040,876,0.034,49840
1,Brant,2227,144937,0.509,16083,171,27955,2.5,50040,1198,0.127,73555
2,Chatham-Kent,2224,104316,0.508,12246,159,24600,2.3,46640,894,0.064,49015
3,Durham,2230,696992,0.512,78487,625,111080,2.8,56750,1506,0.361,367505
4,Eastern Ontario,2258,210276,0.506,22714,340,46960,2.4,50480,978,0.052,104310


In [None]:
# Q2 - Map the PHU-level housing, socio-economic, and demographic factors
# (the `df` frame loaded from PHU_Data.xlsx) onto the previously created A4 dataset.
df_A4 = pd.read_excel(pwd + "/data/A4_data.xlsx")

# Outer join on the shared PHU_Num key: rows whose PHU_Num appears in only one
# of the two frames are kept, with NaN filling the columns of the missing side.
# validate="one_to_many" makes pandas raise a MergeError if PHU_Num is repeated
# in `df`, instead of silently multiplying rows in the merged frame.
# NOTE(review): this assumes `df` holds exactly one row per PHU - confirm.
Q2_df = pd.merge(df, df_A4, on="PHU_Num", how="outer", validate="one_to_many")

# Dropping duplicate column (presumably the A4 data already identifies the
# unit under a different column name - TODO confirm).
Q2_df = Q2_df.drop(columns=["PHU_unit"])

# Only the last expression of a cell is rendered, so preview the merged frame here.
Q2_df.head()

Unnamed: 0,PHU_Num,Population,Female_Proportion,Total_Covid-19_Cases,Total_Covid-19_Deaths,Population_Over_65,Avg_Household_Size,Per_Capita_Income_Among_Recipients,Avg_Gross_Rent_For_Renter_Occupied_Dwellings,Visible_Minority_Proportion,...,Resident_death,Resident_deaths_per100beds,Had-outbreak_WAVE1,Local_incidence_WAVE1,Resident_deaths_WAVE1,Resident_deaths_per100beds_WAVE1,Had-outbreak_WAVE2,Local_incidence_WAVE2,Resident_deaths_WAVE2,Resident_deaths_per100beds_WAVE2
0,2226,112764.0,0.508,14202.0,128.0,29900.0,2.2,48040.0,876.0,0.034,...,-0.541689,-0.59148,YES,"Low (<150 cases per 100,000 before Sept. 1, 2020)",-0.332422,-0.35482,YES,Low local incidence,-0.39682,-0.429826
1,2226,112764.0,0.508,14202.0,128.0,29900.0,2.2,48040.0,876.0,0.034,...,-0.541689,-0.59148,NO,"Low (<150 cases per 100,000 before Sept. 1, 2020)",-0.332422,-0.35482,NO,Low local incidence,-0.39682,-0.429826
2,2226,112764.0,0.508,14202.0,128.0,29900.0,2.2,48040.0,876.0,0.034,...,-0.541689,-0.59148,NO,"Low (<150 cases per 100,000 before Sept. 1, 2020)",-0.332422,-0.35482,NO,Low local incidence,-0.39682,-0.429826
3,2226,112764.0,0.508,14202.0,128.0,29900.0,2.2,48040.0,876.0,0.034,...,-0.541689,-0.59148,NO,"Low (<150 cases per 100,000 before Sept. 1, 2020)",-0.332422,-0.35482,NO,Low local incidence,-0.39682,-0.429826
4,2226,112764.0,0.508,14202.0,128.0,29900.0,2.2,48040.0,876.0,0.034,...,,,,,,,,,,
