# Exam project - Amigos - Covid Business Intelligence

### Imports

In [82]:
import pandas as pd

In [83]:
import numpy as np

In [84]:
import seaborn as sns
import matplotlib.pyplot as plt

In [85]:
from sklearn import metrics
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn import preprocessing as prep
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [86]:
# Uncomment the line below to install the folium package
# %conda install folium

In [87]:
import folium

### Reader

In [88]:
# Reading the file can take a while - it's 86000 rows :-)
data = pd.read_excel(".\Data\CovidDeaths.xlsx")

In [89]:
#show the first and last 5 rows of data

### Cleaner

In [90]:
#List of all tables to be removed
kolonner_til_fjernelse = [
    'new_cases_smoothed', 'total_cases_per_million', 
    'new_cases_per_million', 'new_vaccinations_smoothed', 'new_cases_smoothed_per_million', 'new_deaths_smoothed', 
    'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 
    'reproduction_rate', 'icu_patients', 'icu_patients_per_million', 'hosp_patients', 
    'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 
    'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests', 
    'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 
    'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 
    'people_vaccinated', 'people_fully_vaccinated', 'total_vaccinations_per_hundred', 
    'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred', 
    'new_vaccinations_smoothed_per_million'
]

# Remove the specified tables
clean = data.drop(kolonner_til_fjernelse, axis=1)

In [91]:
# Set Total vaccinations to 0 if NaN
clean['total_vaccinations'].fillna(0, inplace=True)

In [93]:
# Set Total Deaths to 0 if NaN
clean['total_deaths'].fillna(0, inplace=True)

In [94]:
# Set new Vaccinations to 0 if NaN
clean['new_vaccinations'].fillna(0, inplace=True) 

In [112]:
# Set new Vaccinations to 0 if NaN
clean['new_cases'].fillna(0, inplace=True) 

In [114]:
# Set new Vaccinations to 0 if NaN
clean['new_deaths'].fillna(0, inplace=True) 

In [96]:
# Set Total Cases to 0 if NaN
clean['total_cases'].fillna(0, inplace=True)

In [95]:
# Fix total_vaccinations weirdly high values (numE+09)
# clean[clean['iso_code'].str.contains('OWID_')]['total_vaccinations']

Next we fix OWID(Our World in Data) data, to have the same format as the other data sources.

In [97]:
OWID_Rows = clean[clean['iso_code'].str.contains('OWID_')]
OWID_Rows['continent'].fillna(clean[clean['iso_code'].str.contains('OWID_')]['location'], inplace=True)
OWID_Rows['location'] = 'Whole_Continent'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  OWID_Rows['continent'].fillna(clean[clean['iso_code'].str.contains('OWID_')]['location'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  OWID_Rows['location'] = 'Whole_Continent'


In [107]:
OWID_Rows.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'total_deaths', 'new_deaths', 'total_vaccinations', 'new_vaccinations',
       'stringency_index', 'population', 'population_density', 'median_age',
       'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'extreme_poverty',
       'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
       'male_smokers', 'handwashing_facilities', 'hospital_beds_per_thousand',
       'life_expectancy', 'human_development_index'],
      dtype='object')

In [119]:
columns_to_fillna_with_avg = [
    'stringency_index', 'population', 'population_density', 'median_age',
    'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'extreme_poverty',
    'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
    'male_smokers', 'handwashing_facilities', 'hospital_beds_per_thousand',
    'life_expectancy', 'human_development_index'
    ]
# there is 9 OWID "continents"
continents = OWID_Rows['continent'].unique()

for continent in continents:
    for column in columns_to_fillna_with_avg:
        OWID_Rows[column].fillna(clean[clean['continent'] == continent][column].mean(), inplace=True)

In [109]:
OWID_Rows

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_vaccinations,new_vaccinations,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
432,OWID_AFR,Africa,Whole_Continent,2020-02-13,0.0,0.0,0.0,0.0,0.000000e+00,0.0,...,5508.122067,33.720818,287.374602,5.647184,2.57387,27.424106,25.493713,1.506984,64.242507,0.562094
433,OWID_AFR,Africa,Whole_Continent,2020-02-14,1.0,1.0,0.0,0.0,0.000000e+00,0.0,...,5508.122067,33.720818,287.374602,5.647184,2.57387,27.424106,25.493713,1.506984,64.242507,0.562094
434,OWID_AFR,Africa,Whole_Continent,2020-02-15,1.0,0.0,0.0,0.0,0.000000e+00,0.0,...,5508.122067,33.720818,287.374602,5.647184,2.57387,27.424106,25.493713,1.506984,64.242507,0.562094
435,OWID_AFR,Africa,Whole_Continent,2020-02-16,1.0,0.0,0.0,0.0,0.000000e+00,0.0,...,5508.122067,33.720818,287.374602,5.647184,2.57387,27.424106,25.493713,1.506984,64.242507,0.562094
436,OWID_AFR,Africa,Whole_Continent,2020-02-17,1.0,0.0,0.0,0.0,0.000000e+00,0.0,...,5508.122067,33.720818,287.374602,5.647184,2.57387,27.424106,25.493713,1.506984,64.242507,0.562094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83964,OWID_WRL,World,Whole_Continent,2021-04-26,147872402.0,682784.0,3120469.0,11156.0,1.045371e+09,16393996.0,...,15469.207000,10.000000,233.070000,8.510000,6.43400,34.635000,60.130000,2.705000,72.580000,0.737000
83965,OWID_WRL,World,Whole_Continent,2021-04-27,148716872.0,844470.0,3134956.0,14487.0,1.065139e+09,19768227.0,...,15469.207000,10.000000,233.070000,8.510000,6.43400,34.635000,60.130000,2.705000,72.580000,0.737000
83966,OWID_WRL,World,Whole_Continent,2021-04-28,149622864.0,905992.0,3150675.0,15719.0,1.087697e+09,22557773.0,...,15469.207000,10.000000,233.070000,8.510000,6.43400,34.635000,60.130000,2.705000,72.580000,0.737000
83967,OWID_WRL,World,Whole_Continent,2021-04-29,150520466.0,897602.0,3165665.0,14990.0,1.112321e+09,24623727.0,...,15469.207000,10.000000,233.070000,8.510000,6.43400,34.635000,60.130000,2.705000,72.580000,0.737000


In [110]:
# Add OWID data back into DataFrame
clean = clean.drop(OWID_Rows.index)
clean = pd.concat([clean, OWID_Rows], ignore_index=False)

In [136]:
# Remove countries with almost no data
countries_to_remove = [
    'Anguilla', 'Vatican', 'Antigua and Barbuda',
    'Cayman Islands', 'Curacao', 'Dominica', 
    'Falkland Islands', 'Guernsey', 'Isle of Man', 
    'Jersey', 'Liechtenstein', 'Marshall Islands',
    'Montserrat', 'Northern Cyprus', 'Saint Kitts and Nevis', 
    'Saint Helena', 'Saint Lucia', 'Saint Vincent and the Grenadines',
    'Turks and Caicos Islands', 
    ]
clean = clean[~clean['location'].isin(countries_to_remove)]

In [137]:
# fillna stringency_index foreach country with the mean of the stringency_index for the country
countries_names = clean['location'].unique()
for country in countries_names:
    contry_rows = clean[clean['location'] == country]
    clean = clean.drop(contry_rows.index)
    mean = contry_rows['stringency_index'].mean()
    cleaned = contry_rows.fillna(mean, inplace=False)
    clean = pd.concat([clean, cleaned], ignore_index=False)


In [138]:
# Set the rest of stringency_index nan to 0
clean['stringency_index'].fillna(0, inplace=True)

In [142]:
#Vi mangler noget her :))
clean.isnull().sum()

iso_code                         0
continent                        0
location                         0
date                             0
total_cases                      0
new_cases                        0
total_deaths                     0
new_deaths                       0
total_vaccinations               0
new_vaccinations                 0
stringency_index                 0
population                       0
population_density               0
median_age                     119
aged_65_older                  119
aged_70_older                  119
gdp_per_capita                 111
extreme_poverty               1519
cardiovasc_death_rate          119
diabetes_prevalence            111
female_smokers                2250
male_smokers                  2250
handwashing_facilities        1628
hospital_beds_per_thousand    1196
life_expectancy                  0
human_development_index        119
dtype: int64

In [102]:
#Removing the rows with missing values

In [103]:
#Replace all the countries with only a name, but no value

In [92]:
# Save the updated sheet
# clean.to_excel(".\Data\CovidDeaths_cleaned.xlsx", index=False)

### Methods

In [104]:
#Removing all duplicates (if any)
def removeDuplicates(df):
    df.drop_duplicates(inplace=True)
    return df

### Exploring the data

In [105]:
# hvor mange kolonner/rækker har vi?
data.shape

(85171, 59)

In [106]:
#samlet shape når vi har aggregated to data sæt (data + et given lands data)
#showShape(combined_data)