In [1]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt

In [2]:
#all needed data for doing the classification, sources in the README.txt file
def _read_csv_skip_bad_lines(path):
    """Load the CSV file at ``path`` into a DataFrame, dropping malformed rows.

    ``error_bad_lines=False`` was deprecated in pandas 1.3 and removed in
    pandas 2.0 in favour of ``on_bad_lines="skip"``. The new keyword is tried
    first and the old one is used only when pandas does not know the new one.
    """
    #the file is opened explicitly, as before, so the text encoding is still Python's default for open()
    with open(path) as f:
        try:
            return pd.read_csv(f, on_bad_lines="skip")
        except TypeError:
            #older pandas: 'on_bad_lines' is an unexpected keyword argument
            f.seek(0)
            return pd.read_csv(f, error_bad_lines=False)


daily_data = _read_csv_skip_bad_lines("CSV_Creation/daily_data_by_country.csv")

overweight_data = _read_csv_skip_bad_lines('Sources/data_by_country/HEALTH_LVNG_26042020095720197.csv')

hospital_beds_data = _read_csv_skip_bad_lines('Sources/data_by_country/HEALTH_REAC_26042020094617550.csv')

elder_people_data = _read_csv_skip_bad_lines('Sources/data_by_country/HEALTH_DEMR_26042020093618793.csv')

health_employment_data = _read_csv_skip_bad_lines('Sources/data_by_country/HEALTH_REAC_26042020101616131.csv')

nb_tests_per_confirmed_cases_data = _read_csv_skip_bad_lines('Sources/data_by_country/number-of-covid-19-tests-per-confirmed-case.csv')

In [3]:
#keep only the country name and the feature value of every source table
feature_columns = ['Country', 'Value']
overweight_data = overweight_data[feature_columns]
hospital_beds_data = hospital_beds_data[feature_columns]
elder_people_data = elder_people_data[feature_columns]
health_employment_data = health_employment_data[feature_columns]
nb_tests_per_confirmed_cases_data = nb_tests_per_confirmed_cases_data[
    ['Country', 'Number of tests per confirmed case']
]

#a country missing from one of the four tables would produce a missing value,
#so only the countries present in all of them are kept
#(nb_tests_per_confirmed_cases contain data for all OECD countries, so it is not part of the intersection)
elder_people_countries_set = set(elder_people_data['Country'])
hospital_beds_countries_set = set(hospital_beds_data['Country'])
overweight_countries_set = set(overweight_data['Country'])
health_employment_countries = set(health_employment_data['Country'])

#tested countries = intersection of the four country sets, in alphabetical order
tested_countries = sorted(
    hospital_beds_countries_set
    & elder_people_countries_set
    & overweight_countries_set
    & health_employment_countries
)


In [4]:
#some countries did not have the same writing in all sets
#spellings that get replaced in daily_data and nb_tests_per_confirmed_cases_data
writing_daily_data_pb = ['Czechia', 'Slovakia', 'South Korea', 'US']

#tested countries whose spelling does not appear in daily_data, in alphabetical order
writing_countries_pb = sorted(set(tested_countries) - set(daily_data['Country']))

#NOTE(review): both lists are paired by position, so the alphabetical order of
#writing_countries_pb must line up with writing_daily_data_pb -- please confirm
#(for instance a 'Korea' spelling would sort before a 'Slovak...' one)
if writing_countries_pb:
    #a different number of names means the positional pairing below would be meaningless
    if len(writing_countries_pb) != len(writing_daily_data_pb):
        raise ValueError(
            "cannot pair country spellings: tested countries missing from daily_data are {} "
            "but the spellings to replace are {}".format(writing_countries_pb, writing_daily_data_pb)
        )
    for daily_name, tested_name in zip(writing_daily_data_pb, writing_countries_pb):
        daily_data = daily_data.replace(daily_name, tested_name)
        nb_tests_per_confirmed_cases_data = nb_tests_per_confirmed_cases_data.replace(daily_name, tested_name)
#an empty writing_countries_pb means every tested country is already found in
#daily_data (e.g. this cell was already run), so there is nothing to rename


In [5]:
#every country now has a single spelling, so the death rate of each country can be selected
#the reference day number is read from the first column of the last row of daily_data
last_row = daily_data.values[-1]
last_day_nb = last_row[0]
is_last_day = daily_data['Day_number'] == last_day_nb
death_rate_data = daily_data.loc[is_last_day, ['Country', 'Death_rate']]

In [6]:
#this function creates a list with the last reported feature for each tested country
def creation_list(data, tested_countries):
    """Return, for each tested country, the value of its last row in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'Country' column; the value is read from the second
        column (position 1), whatever its name.
    tested_countries : iterable
        Country names to look up, in the order wanted for the result.

    Returns
    -------
    list
        One value per tested country, taken from the last row (in the row
        order of ``data``) whose 'Country' equals that country.

    Raises
    ------
    IndexError
        If a tested country has no row in ``data``; the message names it.
    """
    last_values = []
    for country in tested_countries:
        #second column of the rows belonging to this country
        values = data.loc[data['Country'] == country, :].values[:, 1]
        if len(values) == 0:
            #same exception type as the bare values[-1] lookup, with a usable message
            raise IndexError("no row found for country {!r} in the given data".format(country))
        last_values.append(values[-1])
    return last_values

In [7]:
#one list per feature, holding the last reported value of each tested country
overweight_list = creation_list(data=overweight_data, tested_countries=tested_countries)
elder_people_list = creation_list(data=elder_people_data, tested_countries=tested_countries)
health_employment_list = creation_list(data=health_employment_data, tested_countries=tested_countries)
hospital_beds_list = creation_list(data=hospital_beds_data, tested_countries=tested_countries)

#the two remaining measures are rounded to one decimal place
nb_tests_per_confirmed_cases_list = creation_list(
    data=nb_tests_per_confirmed_cases_data, tested_countries=tested_countries
)
nb_tests_per_confirmed_cases_list = [
    round(tests_per_case, 1) for tests_per_case in nb_tests_per_confirmed_cases_list
]

#the death rate is multiplied by 100 before being rounded
death_rate_list = creation_list(data=death_rate_data, tested_countries=tested_countries)
death_rate_list = [round(death_rate * 100, 1) for death_rate in death_rate_list]

In [8]:
#creation of the Dataframe
#string array of all features (one row per country); every value in it is text
#because numpy gives the whole array a single dtype -- kept in case other cells read it
OECD_countries_data_array = np.array([tested_countries, death_rate_list, overweight_list, health_employment_list, hospital_beds_list, elder_people_list, nb_tests_per_confirmed_cases_list]).T

#the DataFrame is built column by column instead of from the string array,
#so that the numeric features keep a numeric dtype
#NOTE: the 'Proportion of edler people' spelling is left untouched because it ends up
#as a header of the exported CSV, which other code may read by name
OECD_countries_data = pd.DataFrame({
    'Country': tested_countries,
    'Death_rate': death_rate_list,
    'Proportion of overweight people': overweight_list,
    'Health employment per 1000 hab': health_employment_list,
    'Hospital beds per 1000 hab': hospital_beds_list,
    'Proportion of edler people': elder_people_list,
    'Number of tests per confirmed cases': nb_tests_per_confirmed_cases_list,
})

In [9]:
#export of the DataFrame to a CSV file: one header row, then one row per country
with open("CSV_Creation/OECD_countries_data.csv", 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    #iterating over a DataFrame yields its column labels
    csv_writer.writerow(list(OECD_countries_data.columns))
    csv_writer.writerows(OECD_countries_data.values)