# Uppgift 2 - uppvärmning vaccindata

In [7]:
import pandas as pd
import plotly_express as px

# Source workbook and the sheet to read from it
file_path = "Data/Folkhalsomyndigheten_Covid19_Vaccine.xlsx"
sheet = "Vaccinerade kommun och ålder"

# Load only spreadsheet columns B and D through J into a dataframe
df = pd.read_excel(
    file_path,
    sheet_name=sheet,
    usecols="B,D:J",
)

df.head()

Unnamed: 0,Län_namn,Kommun_namn,Ålder,Befolkning,Antal minst 1 dos,Antal minst 2 doser,Antal 3 doser,Antal 4 doser
0,Stockholms län,Upplands Väsby,12-15,2422,1206,1046,,
1,Stockholms län,Upplands Väsby,16-17,1203,839,755,,
2,Stockholms län,Upplands Väsby,18-29,6692,4887,4469,1959.0,
3,Stockholms län,Upplands Väsby,30-39,7332,5542,5240,2878.0,
4,Stockholms län,Upplands Väsby,40-49,6946,5592,5429,3719.0,


In [8]:
# Overview of dtypes and null counts.
# Things to note: the 3rd/4th-dose columns have missing values (presumably doses not yet
# distributed to certain age groups), and Ålder holds age ranges as strings, not numbers.
# Nothing needs to be changed right now, but take note of it in case a later step needs it cleaned.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Län_namn             2900 non-null   object 
 1   Kommun_namn          2900 non-null   object 
 2   Ålder                2900 non-null   object 
 3   Befolkning           2900 non-null   int64  
 4   Antal minst 1 dos    2900 non-null   int64  
 5   Antal minst 2 doser  2900 non-null   int64  
 6   Antal 3 doser        2320 non-null   float64
 7   Antal 4 doser        870 non-null    float64
dtypes: float64(2), int64(3), object(3)
memory usage: 181.4+ KB


---
## Formatting DataFrame to be used in future

In [9]:
# Renaming from county to region by stripping the "s län" / " län" ending
# (e.g. "Stockholms län" -> "Stockholm"); both patterns are anchored to the end of the string.
# The result is assigned back to the column rather than using replace(..., inplace=True) on
# df["Län_namn"]: an in-place call on a selected column acts on an intermediate object and
# does not update df when pandas Copy-on-Write is active.
df["Län_namn"] = df["Län_namn"].replace({'s län$': '', ' län$': ''}, regex = True)
df.head(1)

Unnamed: 0,Län_namn,Kommun_namn,Ålder,Befolkning,Antal minst 1 dos,Antal minst 2 doser,Antal 3 doser,Antal 4 doser
0,Stockholm,Upplands Väsby,12-15,2422,1206,1046,,


In [10]:
# Shorter names for region/municipality, and "minst" added to the 3rd/4th-dose columns
# so all dose columns follow the same naming pattern
df.rename(
    columns={
        "Län_namn": "Region",
        "Kommun_namn": "Kommun",
        "Antal 3 doser": "Antal minst 3 doser",
        "Antal 4 doser": "Antal minst 4 doser",
    },
    inplace=True,
)
df.head(1)

Unnamed: 0,Region,Kommun,Ålder,Befolkning,Antal minst 1 dos,Antal minst 2 doser,Antal minst 3 doser,Antal minst 4 doser
0,Stockholm,Upplands Väsby,12-15,2422,1206,1046,,


In [11]:
# Persist the formatted dataframe (without the row index) for future use.
# Note that file_path now points at the output workbook, not the source workbook.
file_path = "Data/Vaccinerade_kommun_och_ålder.xlsx"

df.to_excel(excel_writer=file_path, index=False)

---
## a) How many counties are represented in the dataset?

In [12]:
# Number of distinct values in the Region column (missing values, if any, count as one value)
counties = df["Region"].nunique(dropna=False)
print(f"{counties} counties are represented in the dataset")

21 counties are represented in the dataset


In [13]:
# Manual overview of the distinct region names: no duplicates with different spelling (typos) spotted
df["Region"].unique()

array(['Stockholm', 'Uppsala', 'Södermanland', 'Östergötland',
       'Jönköping', 'Kronoberg', 'Kalmar', 'Gotland', 'Blekinge', 'Skåne',
       'Halland', 'Västra Götaland', 'Värmland', 'Örebro', 'Västmanland',
       'Dalarna', 'Gävleborg', 'Västernorrland', 'Jämtland',
       'Västerbotten', 'Norrbotten'], dtype=object)

---
## b) How many cities are represented in the dataset?

In [14]:
# Number of distinct values in the Kommun column (missing values, if any, count as one value)
cities = df["Kommun"].nunique(dropna=False)
print(f"{cities} cities are represented in the dataset")

290 cities are represented in the dataset


In [15]:
# Quick manual overview of the distinct municipality names:
# can't spot any obvious duplicates with different spelling (typos)
df["Kommun"].unique()

array(['Upplands Väsby', 'Vallentuna', 'Österåker', 'Värmdö', 'Järfälla',
       'Ekerö', 'Huddinge', 'Botkyrka', 'Salem', 'Haninge', 'Tyresö',
       'Upplands-Bro', 'Nykvarn', 'Täby', 'Danderyd', 'Sollentuna',
       'Stockholm', 'Södertälje', 'Nacka', 'Sundbyberg', 'Solna',
       'Lidingö', 'Vaxholm', 'Norrtälje', 'Sigtuna', 'Nynäshamn', 'Håbo',
       'Älvkarleby', 'Knivsta', 'Heby', 'Tierp', 'Uppsala', 'Enköping',
       'Östhammar', 'Vingåker', 'Gnesta', 'Nyköping', 'Oxelösund', 'Flen',
       'Katrineholm', 'Eskilstuna', 'Strängnäs', 'Trosa', 'Ödeshög',
       'Ydre', 'Kinda', 'Boxholm', 'Åtvidaberg', 'Finspång',
       'Valdemarsvik', 'Linköping', 'Norrköping', 'Söderköping', 'Motala',
       'Vadstena', 'Mjölby', 'Aneby', 'Gnosjö', 'Mullsjö', 'Habo',
       'Gislaved', 'Vaggeryd', 'Jönköping', 'Nässjö', 'Värnamo', 'Sävsjö',
       'Vetlanda', 'Eksjö', 'Tranås', 'Uppvidinge', 'Lessebo', 'Tingsryd',
       'Alvesta', 'Älmhult', 'Markaryd', 'Växjö', 'Ljungby', 'Högsby',
       '

---
## c) What size is the population represented in the dataset?

In [16]:
# Total population = sum over every row (municipality x age group) of Befolkning
total_population = df["Befolkning"].sum()
print(f"{total_population:,} is the size of the population represented in the dataset")

9,092,790 is the size of the population represented in the dataset


---
## d) Calculate the number of children under 18 in Sweden based on this dataset

In [17]:
# Age groups in this dataset that lie entirely below 18.
# They are selected by exact label instead of the string comparison df["Ålder"] < "18"
# (an idea credited to https://github.com/VineelaNedunuri): that comparison is lexicographic,
# so it only works as long as no label such as "5-11" or "100+" exists, which would sort
# on the wrong side of "18".
under_18_groups = ["12-15", "16-17"]

# .loc with a boolean mask avoids chained indexing on the column
amount_children_12_17 = df.loc[df["Ålder"].isin(under_18_groups), "Befolkning"].sum()
print(f"{amount_children_12_17:,} children under 18 according to this dataset (with limited data for only ages 12-17)")

745,370 children under 18 according to this dataset (with limited data for only ages 12-17)


### d.2) Getting estimate values to account for gap in dataset

In [18]:
# Extrapolate the 12-17 total to the whole 0-17 range,
# assuming roughly equal distribution of children across the ages 0-17.
ages_in_sample = 6   # the group 12-17 spans 6 ages
ages_under_18 = 18   # the range 0-17 spans 18 ages

# average population per single age, estimated from the 12-17 group
pop_per_age = amount_children_12_17 / ages_in_sample
# scale that per-age estimate up to all 18 ages below 18
est_pop_under_18 = pop_per_age * ages_under_18
print(f"Amount of children aged 0-17 should be roughly {round(est_pop_under_18):,} based on this dataset")

Amount of children aged 0-17 should be roughly 2,236,110 based on this dataset


### Getting online data with actual values for comparison:

In [19]:
# Population of Sweden in 2021 according to SCB, downloaded beforehand
# (source: https://www.statistikdatabasen.scb.se/pxweb/sv/ssd/START__BE__BE0101__BE0101A/FolkmangdNov/)
file_path = "Data/SCB_sveriges_befolkning_2021.xlsx"

# header=2 takes the third spreadsheet row as the header (replaced by our own column names);
# nrows=202 then reads 101 age groups (0 through 100+) times 2 sexes
df_befolkning = pd.read_excel(
    file_path,
    header=2,
    usecols="B:D",
    names=["Ålder", "Kön", "Befolkning"],
    nrows=202,
)

# men and women are on separate rows, and the age is only filled in on every second row
df_befolkning

Unnamed: 0,Ålder,Kön,Befolkning
0,0 år,män,50261
1,,kvinnor,47919
2,1 år,män,59092
3,,kvinnor,55760
4,2 år,män,60536
...,...,...,...
197,,kvinnor,2145
198,99 år,män,395
199,,kvinnor,1462
200,100+ år,män,481


In [20]:
# Ålder has 101 null values, which is exactly every second row; the other columns have no nulls.
# The ages are also stored as strings instead of ints.
df_befolkning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202 entries, 0 to 201
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Ålder       101 non-null    object
 1   Kön         202 non-null    object
 2   Befolkning  202 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 4.9+ KB


In [21]:
# To solve the NaNs, every second Ålder value is filled with the value from the row above it.
# Series.ffill() replaces the deprecated fillna(method="ffill"), and the result is assigned back
# to the column because an in-place call on df_befolkning["Ålder"] acts on an intermediate
# object and does not update the dataframe when pandas Copy-on-Write is active.
df_befolkning["Ålder"] = df_befolkning["Ålder"].ffill()
df_befolkning

Unnamed: 0,Ålder,Kön,Befolkning
0,0 år,män,50261
1,0 år,kvinnor,47919
2,1 år,män,59092
3,1 år,kvinnor,55760
4,2 år,män,60536
...,...,...,...
197,98 år,kvinnor,2145
198,99 år,män,395
199,99 år,kvinnor,1462
200,100+ år,män,481


In [22]:
# Strip " år" and "+" from the age labels and overwrite Ålder with the result cast to int
# (so "100+ år" becomes 100). inplace cannot be used here, since astype needs the returned Series.
# The patterns are raw strings: '\+' in a normal string literal is an invalid escape sequence,
# which newer Python versions warn about.
df_befolkning["Ålder"] = df_befolkning["Ålder"].replace({r' år': '', r'\+': ''}, regex = True).astype(int)
df_befolkning

Unnamed: 0,Ålder,Kön,Befolkning
0,0,män,50261
1,0,kvinnor,47919
2,1,män,59092
3,1,kvinnor,55760
4,2,män,60536
...,...,...,...
197,98,kvinnor,2145
198,99,män,395
199,99,kvinnor,1462
200,100,män,481


In [23]:
# Check the result: Ålder now has its NaNs filled and is stored as ints
df_befolkning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202 entries, 0 to 201
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Ålder       202 non-null    int32 
 1   Kön         202 non-null    object
 2   Befolkning  202 non-null    int64 
dtypes: int32(1), int64(1), object(1)
memory usage: 4.1+ KB


In [24]:
# Each age occupies two rows (men, women), so age N sits at row positions 2N and 2N+1.
# Ages 0-17 therefore cover positions 0-35, i.e. the slice 0:36 (end exclusive).
# Preview the tail of that range to confirm it ends with both rows for age 17:
df_befolkning.iloc[32:36]

Unnamed: 0,Ålder,Kön,Befolkning
32,16,män,60826
33,16,kvinnor,57618
34,17,män,61083
35,17,kvinnor,57082


In [25]:
# Sum the population of every row whose (now numeric) age is below 18.
# Filtering on Ålder replaces the hard-coded positional slice [0:36], which silently gives
# the wrong total if the rows are ever re-sorted or no longer come as two rows per age.
act_pop_under_18 = df_befolkning.loc[df_befolkning["Ålder"] < 18, "Befolkning"].sum()
print(f"Actual amount of children aged 0-17 is {act_pop_under_18:,}")

Actual amount of children aged 0-17 is 2,179,896


### Calculating difference between actual data and estimation based off dataset

In [26]:
# Gap between the dataset-based estimate and the actual figure, as a fraction of the estimate.
# NOTE(review): the gap is divided by the estimate rather than by the actual value — confirm
# that this is the intended baseline for the percentage.
gap = est_pop_under_18 - act_pop_under_18
diff = gap / est_pop_under_18
print(f"Difference between dataset estimation and actual numbers is ~{diff * 100:.1f}%")  # a difference of ~2.5%

Difference between dataset estimation and actual numbers is ~2.5%


---
## e) Draw a diagram showing the age distribution of Sweden's population

### Plotting based on original dataset 
Note that this results in misleading visuals:  
Because the age groups are unevenly sized (16-17, for example, has only 2 ages in it, while 18-29 has 12), the age curve ends up looking more uneven than it is  
The dataset also does not account for the population size of ages 0-11

In [27]:
# Bar chart of the population per age group in the vaccine dataset.
# Title spelling corrected: "Åldersditribution" -> "Åldersdistribution".
px.bar(
    df.groupby(["Ålder"]).sum(numeric_only = True), # df grouped by age group, summing all numeric columns per group
    y = "Befolkning",
    title="Åldersdistribution av Sveriges Population",
    labels={"Befolkning": "Antal Individer", "Ålder": "Åldersgrupp"},
)

### Plotting based on external dataset:  
Evenly sized age groups (groups of 1)  
Accounts for all ages  

This leads to a more readable and representative graph of the actual population size  
As a bonus we can also read the population size based on gender  

In [28]:
# Bar chart of the population per single age from the SCB data, coloured by sex.
# Title spelling corrected ("Åldersditribution" -> "Åldersdistribution"); the title also ends up
# in the exported HTML file.
fig = px.bar(
    df_befolkning,
    x="Ålder",
    y="Befolkning",
    title="Åldersdistribution av Sveriges Population",
    labels={"Befolkning": "Antal Individer"},
    color = "Kön"
)

# export a standalone copy of the figure, then display it in the notebook
fig.write_html("Visualiseringar/E2E_aldersdistribution.html")
fig.show()


---
## f) Rita stapeldiagram för andel med minst 1/2/3 doser per län

In [29]:
# Grouping by region. The key is passed as a plain column name rather than a one-element list:
# with a length-1 list as the grouper, get_group() with a scalar key is deprecated in pandas
# (a 1-tuple is expected instead). The per-region sums are the same either way.
län_grp = df.groupby("Region")
län_grp.get_group("Stockholm") # looks like so:

Unnamed: 0,Region,Kommun,Ålder,Befolkning,Antal minst 1 dos,Antal minst 2 doser,Antal minst 3 doser,Antal minst 4 doser
0,Stockholm,Upplands Väsby,12-15,2422,1206,1046,,
1,Stockholm,Upplands Väsby,16-17,1203,839,755,,
2,Stockholm,Upplands Väsby,18-29,6692,4887,4469,1959.0,
3,Stockholm,Upplands Väsby,30-39,7332,5542,5240,2878.0,
4,Stockholm,Upplands Väsby,40-49,6946,5592,5429,3719.0,
...,...,...,...,...,...,...,...,...
255,Stockholm,Nynäshamn,50-59,4135,3706,3631,3019.0,
256,Stockholm,Nynäshamn,60-69,3341,3088,3043,2765.0,
257,Stockholm,Nynäshamn,70-79,3311,3163,3125,2973.0,2390.0
258,Stockholm,Nynäshamn,80-89,1530,1488,1475,1425.0,1224.0


In [30]:
# Share of each region's population with at least 1/2/3 doses.
# Kept in a separate dataframe instead of adding columns to the original,
# for more readable and easy-to-follow code when plotting.
befolkning_per_region = län_grp["Befolkning"].sum()  # denominator, summed once per region

df_andel = pd.DataFrame(
    {
        f"Minst {doses}": län_grp[f"Antal minst {doses}"].sum() / befolkning_per_region
        for doses in ("1 dos", "2 doser", "3 doser")
    }
)
df_andel.head()

Unnamed: 0_level_0,Minst 1 dos,Minst 2 doser,Minst 3 doser
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Blekinge,0.879227,0.866501,0.662176
Dalarna,0.878389,0.864858,0.651774
Gotland,0.904699,0.888844,0.693995
Gävleborg,0.873811,0.853503,0.632934
Halland,0.876481,0.863581,0.649378


### Plotting without grouping bars
Hard to read with doses stacked on top of each other  
Can be misread as 250% of population being vaccinated  

It does however give an indication of average amount of doses in the different regions

In [31]:
# Bar chart of df_andel with no barmode given, so the dose columns end up
# stacked on top of each other for each region.
# "value" and "variable" are the labels being renamed for the plotted shares and the dose columns.
px.bar(
    df_andel,
    title="Andel Invånare per Län Vaccinerade med minst 1/2/3 Doser",
    labels={"value": "Andel Invånare", "variable": "Vaccinationer"},
)


### Plotting with grouped bars
Gives a much clearer view of specific doses compared to other regions

In [32]:
# Same data as the stacked chart, but with the dose bars placed side by side per region
fig = px.bar(
    df_andel,
    title="Andel Invånare per Län Vaccinerade med minst 1/2/3 Doser",
    labels={"value": "Andel Invånare", "variable": "Vaccinationer"},
    barmode="group",
)

# export a standalone copy of the figure, then display it in the notebook
fig.write_html("Visualiseringar/E2F_andel_vaccinationer_region.html")
fig.show()

---
## f) Rita diagram över andel med minst 1/2/3/4 doser i Västra Götaland och Stockholm

In [33]:
# Extend the share table with the 4th dose: regional sum of 4th doses over regional population
fourth_doses_per_region = län_grp["Antal minst 4 doser"].sum()
df_andel["Minst 4 doser"] = fourth_doses_per_region.div(län_grp["Befolkning"].sum())
df_andel.head()

Unnamed: 0_level_0,Minst 1 dos,Minst 2 doser,Minst 3 doser,Minst 4 doser
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Blekinge,0.879227,0.866501,0.662176,0.179857
Dalarna,0.878389,0.864858,0.651774,0.194291
Gotland,0.904699,0.888844,0.693995,0.201858
Gävleborg,0.873811,0.853503,0.632934,0.180964
Halland,0.876481,0.863581,0.649378,0.171083


In [34]:
# Pick out the 2 requested regions and transpose, so the doses end up on the x axis
# and each region becomes its own bar series.
# Props to https://github.com/hakanAkerblom/ for the idea to use transpose here
andel_vg_sthlm = df_andel.loc[["Västra Götaland", "Stockholm"]].T

fig = px.bar(
    andel_vg_sthlm,
    title="Andel Invånare i Västra Götaland och Stockholm Vaccinerade med minst 1/2/3/4 Doser",
    labels={
        "value": "Andel Invånare",
        "index": "Amount of Doses (by Region)",
    },
    barmode="group",
)

# export a standalone copy of the figure, then display it in the notebook
fig.write_html("Visualiseringar/E2F2_andel_vaccinerade_vg_sthlm.html")
fig.show()
