# Exam project - Amigos - Covid Business Intelligence

### Imports

In [16]:
import pandas as pd

In [17]:
import numpy as np

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
from sklearn import metrics
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn import preprocessing as prep
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [20]:
# Uncomment the line below to install the folium package
# %conda install folium

In [21]:
import folium

### Reader

In [22]:
# Reading the file can take a while - it's 86000 rows :-)
data = pd.read_excel(".\Data\CovidDeaths.xlsx")

In [23]:
#show the first and last 5 rows of data

### Cleaner

In [61]:
#List of all tables to be removed
kolonner_til_fjernelse = [
    'new_cases', 'new_cases_smoothed', 'total_cases_per_million', 
    'new_cases_per_million', 'new_cases_smoothed_per_million', 'new_deaths_smoothed', 
    'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 
    'reproduction_rate', 'icu_patients', 'icu_patients_per_million', 'hosp_patients', 
    'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 
    'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests', 
    'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 
    'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 
    'people_vaccinated', 'people_fully_vaccinated', 'total_vaccinations_per_hundred', 
    'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred', 
    'new_vaccinations_smoothed_per_million'
]

# Remove the specified tables
clean = data.drop(kolonner_til_fjernelse, axis=1)

In [None]:
# Save the updated sheet
clean.to_excel(".\Data\CovidDeaths_cleaned.xlsx", index=False)

Next we fix OWID(Our World in Data) data, to have the same format as the other data sources.

In [62]:
OWID_Rows = clean[clean['iso_code'].str.contains('OWID_')]
OWID_Rows['continent'].fillna(clean[clean['iso_code'].str.contains('OWID_')]['location'], inplace=True)
OWID_Rows['location'] = 'Whole_Continent'
clean = clean.drop(OWID_Rows.index)
clean = pd.concat([clean, OWID_Rows], ignore_index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  OWID_Rows['continent'].fillna(clean[clean['iso_code'].str.contains('OWID_')]['location'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  OWID_Rows['location'] = 'Whole_Continent'


Unnamed: 0,iso_code,continent,location,date,total_cases,total_deaths,new_deaths,total_vaccinations,new_vaccinations,new_vaccinations_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
432,OWID_AFR,Africa,Whole_Continent,2020-02-13,,,0.0,,,,...,,,,,,,,,,
433,OWID_AFR,Africa,Whole_Continent,2020-02-14,1.0,,0.0,,,,...,,,,,,,,,,
434,OWID_AFR,Africa,Whole_Continent,2020-02-15,1.0,,0.0,,,,...,,,,,,,,,,
435,OWID_AFR,Africa,Whole_Continent,2020-02-16,1.0,,0.0,,,,...,,,,,,,,,,
436,OWID_AFR,Africa,Whole_Continent,2020-02-17,1.0,,0.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83964,OWID_WRL,World,Whole_Continent,2021-04-26,147872402.0,3120469.0,11156.0,1.045371e+09,16393996.0,16976749.0,...,15469.207,10.0,233.07,8.51,6.434,34.635,60.13,2.705,72.58,0.737
83965,OWID_WRL,World,Whole_Continent,2021-04-27,148716872.0,3134956.0,14487.0,1.065139e+09,19768227.0,18089518.0,...,15469.207,10.0,233.07,8.51,6.434,34.635,60.13,2.705,72.58,0.737
83966,OWID_WRL,World,Whole_Continent,2021-04-28,149622864.0,3150675.0,15719.0,1.087697e+09,22557773.0,18691784.0,...,15469.207,10.0,233.07,8.51,6.434,34.635,60.13,2.705,72.58,0.737
83967,OWID_WRL,World,Whole_Continent,2021-04-29,150520466.0,3165665.0,14990.0,1.112321e+09,24623727.0,19064020.0,...,15469.207,10.0,233.07,8.51,6.434,34.635,60.13,2.705,72.58,0.737


In [None]:
#Vi mangler noget her :))
clean.isnull().sum()

continent                      4111
location                          0
date                              0
total_cases                    2099
total_deaths                  11763
new_deaths                    11605
total_vaccinations            75797
new_vaccinations              77217
new_vaccinations_smoothed     70079
stringency_index              12964
population                      549
population_density             5897
median_age                     8465
aged_65_older                  9341
aged_70_older                  8895
gdp_per_capita                 8125
extreme_poverty               32722
cardiovasc_death_rate          7537
diabetes_prevalence            6392
female_smokers                24343
male_smokers                  25240
handwashing_facilities        46164
hospital_beds_per_thousand    14324
life_expectancy                4338
human_development_index        7654
dtype: int64

In [None]:
#Removing the rows with missing values

In [None]:
#Replace all the countries with only a name, but no value

### Methods

In [None]:
#Removing all duplicates (if any)
def removeDuplicates(df):
    df.drop_duplicates(inplace=True)
    return df

### Exploring the data

In [None]:
# hvor mange kolonner/rækker har vi?
data.shape

In [None]:
#samlet shape når vi har aggregated to data sæt (data + et given lands data)
#showShape(combined_data)