### Exploring some covid data uploaded to a db file using SQLite within Python. 

[Data source](https://ourworldindata.org/covid-deaths)

In [1]:
import sqlite3, csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random 

In [17]:
pd.set_option('display.max_columns', None)

In [4]:
conn = sqlite3.connect("covid_data.db")
cur = conn.cursor()
covid_df = pd.read_csv("//Users/alex/Downloads/owid-covid-data.csv")

In [6]:
covid_df.info(verbose = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254400 entries, 0 to 254399
Columns: 67 entries, iso_code to excess_mortality_cumulative_per_million
dtypes: float64(62), object(5)
memory usage: 130.0+ MB


In [7]:
covid_deaths = pd.concat([covid_df.iloc[:,:25], covid_df.iloc[:,62].to_frame()], axis = 1)

In [8]:
covid_vaccinations = covid_df.drop(covid_df.columns[4:25], axis = 1)

In [18]:
covid_deaths.sample(3)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,population
21513,BEL,Europe,Belgium,2020-03-30,11899.0,1063.0,1165.143,513.0,82.0,60.714,1020.854,91.198,99.961,44.012,7.035,5.209,1.76,1021.0,87.595,4897.0,420.13,,,3717.0,318.894,11655923.0
123554,LAO,Asia,Laos,2021-09-21,19730.0,331.0,292.571,16.0,0.0,0.0,2620.368,43.961,38.857,2.125,0.0,0.0,1.31,,,,,,,,,7529477.0
227689,TGO,Africa,Togo,2021-12-17,26550.0,84.0,25.429,244.0,0.0,0.143,3000.441,9.493,2.874,27.575,0.0,0.016,1.94,,,,,,,,,8848700.0


In [19]:
covid_vaccinations.sample(3)

Unnamed: 0,iso_code,continent,location,date,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
246453,VEN,South America,Venezuela,2021-02-01,,,,,,,,,,,,,,,,,,,,,,,87.96,36.253,29.0,6.614,3.915,16745.022,,204.85,6.47,,,,0.8,72.06,0.711,28301700.0,,,,
65371,EGY,Africa,Egypt,2022-06-11,,,,,,,,,,,,,,,203498.0,,,,,1833.0,16830.0,0.015,34.03,97.999,25.3,5.159,2.891,10550.206,1.3,525.432,17.31,0.2,50.1,89.827,1.6,71.99,0.707,110990096.0,,,,
216661,SDN,Africa,Sudan,2021-01-05,,,,,,,,,,,,,,,,,,,,,,,23.15,23.258,19.7,3.548,2.034,4466.507,,431.388,15.67,,,23.437,0.8,65.31,0.51,46874200.0,,,,


In [20]:
# Get table column names
cur.execute("PRAGMA table_info(covid_deaths)").fetchall()

[(0, 'iso_code', 'TEXT', 0, None, 0),
 (1, 'continent', 'TEXT', 0, None, 0),
 (2, 'location', 'TEXT', 0, None, 0),
 (3, 'date', 'TEXT', 0, None, 0),
 (4, 'total_cases', 'REAL', 0, None, 0),
 (5, 'new_cases', 'REAL', 0, None, 0),
 (6, 'new_cases_smoothed', 'REAL', 0, None, 0),
 (7, 'total_deaths', 'REAL', 0, None, 0),
 (8, 'new_deaths', 'REAL', 0, None, 0),
 (9, 'new_deaths_smoothed', 'REAL', 0, None, 0),
 (10, 'total_cases_per_million', 'REAL', 0, None, 0),
 (11, 'new_cases_per_million', 'REAL', 0, None, 0),
 (12, 'new_cases_smoothed_per_million', 'REAL', 0, None, 0),
 (13, 'total_deaths_per_million', 'REAL', 0, None, 0),
 (14, 'new_deaths_per_million', 'REAL', 0, None, 0),
 (15, 'new_deaths_smoothed_per_million', 'REAL', 0, None, 0),
 (16, 'reproduction_rate', 'REAL', 0, None, 0),
 (17, 'icu_patients', 'REAL', 0, None, 0),
 (18, 'icu_patients_per_million', 'REAL', 0, None, 0),
 (19, 'hosp_patients', 'REAL', 0, None, 0),
 (20, 'hosp_patients_per_million', 'REAL', 0, None, 0),
 (21, 'we

In [21]:
# Fill covid_data.db with covid_deaths Data Frame contents:
covid_deaths.to_sql('covid_deaths', conn, if_exists='replace', index = False)

254400

In [195]:
# Fill covid_data.db with covid_vaccinations Data Frame contents:
covid_vaccinations.to_sql('covid_vaccinations', conn, if_exists='replace', index = False)

254400

In [35]:
# Check to see if the new tables have been added to the the database file
cur.execute("SELECT name FROM sqlite_master ").fetchall()

[('covid_vaccinations',), ('covid_deaths',), ('pop_vaccinated',)]

<font size="3"> Looking at highest total cases/population (grouping by location, in descening order, as a percentage.) </font>

In [25]:
statement = """SELECT location, population, MAX(total_cases), ROUND(MAX(total_cases/population)*100,2) as tot_pop
FROM covid_deaths GROUP BY location ORDER BY tot_pop desc"""

# Fetch only the top 10 results

[print(i,'\n') for i in cur.execute(statement).fetchmany(10)]

#location Population  Max tot cases  Cases/Pop_Percentage 

('Cyprus', 896007.0, 642663.0, 71.73) 

('San Marino', 33690.0, 23427.0, 69.54) 

('Faeroe Islands', 53117.0, 34658.0, 65.25) 

('Austria', 8939617.0, 5785590.0, 64.72) 

('Gibraltar', 32677.0, 20399.0, 62.43) 

('Slovenia', 2119843.0, 1321852.0, 62.36) 

('Brunei', 449002.0, 276067.0, 61.48) 

('Andorra', 79843.0, 47839.0, 59.92) 

('Saint Pierre and Miquelon', 5885.0, 3452.0, 58.66) 

('France', 67813000.0, 39568875.0, 58.35) 



[None, None, None, None, None, None, None, None, None, None]

<font size="3"> Looking at highest total deaths/population (grouping by location, in descening order, as a percentage.) </font>

In [26]:
statement = """SELECT location, population, MAX(cast(total_deaths as int)) as mtd, ROUND(MAX(total_deaths/population)*100,2) as d_p
FROM covid_deaths WHERE Continent is not null GROUP BY location ORDER BY d_p desc"""

# Fetch only the top 10 results
[print(i,'\n') for i in cur.execute(statement).fetchmany(10)]

#location  Population  Max_tot_deaths Max_tot_deaths/pop      

('Peru', 34049588.0, 218931, 0.64) 

('Bulgaria', 6781955.0, 38182, 0.56) 

('Bosnia and Herzegovina', 3233530.0, 16257, 0.5) 

('Hungary', 9967304.0, 48677, 0.49) 

('North Macedonia', 2093606.0, 9641, 0.46) 

('Montenegro', 627082.0, 2797, 0.45) 

('Georgia', 3744385.0, 16926, 0.45) 

('Croatia', 4030361.0, 17883, 0.44) 

('Czechia', 10493990.0, 42312, 0.4) 

('Slovakia', 5643455.0, 20946, 0.37) 



[None, None, None, None, None, None, None, None, None, None]

<font size="3"> Looking at highest total deaths/population, by contintent (grouping by continent, in descening order, as a percentage.)
 </font>

In [27]:
statement = """SELECT continent, population, MAX(cast(total_deaths as int)) as mtd, ROUND(MAX(total_deaths/population)*100,2) as d_p
FROM covid_deaths WHERE Continent is not null GROUP BY continent ORDER BY d_p desc"""

[print(i,'\n') for i in cur.execute(statement).fetchall()]

#location     Population  Max_tot_deaths Max_tot_deaths/pop    

('South America', 34049588.0, 697200, 0.64) 

('Europe', 6781955.0, 387153, 0.56) 

('Asia', 3744385.0, 530741, 0.45) 

('North America', 338289856.0, 1109591, 0.33) 

('Africa', 12356116.0, 102595, 0.24) 

('Oceania', 306292.0, 18615, 0.21) 



[None, None, None, None, None, None]

<font size="3"> 
    
Join two tables (covid deaths and covid vaccinations)

Add a column that keeps a rolling count of the sum of new vaccinations ordered by location and date

Add a column that keeps a rolling count of the percentage of the population vaccinated (using a CTE) </font> 

In [28]:
statement = """WITH pop_vaccinated(continent, location, date, population, new_vaccinations, rolling_sum_of_new_vaccinations)
            AS
            (
            SELECT cd.continent, cd.location, cd.date, cd.population, cv.new_vaccinations,
            SUM(cast(cv.new_vaccinations as int)) OVER (PARTITION BY cd.location ORDER BY cd.location, cd.date)
            AS rolling_sum_of_new_vaccinations
            FROM covid_deaths cd 
            INNER JOIN covid_vaccinations cv ON cd.location = cv.location AND cd.date = cv.date
            WHERE cd.continent IS NOT NULL AND cv.new_vaccinations IS NOT NULL
            ORDER BY cd.location, cd.date 
            )
            
            SELECT *, ROUND((100*(rolling_sum_of_new_vaccinations/population)),3)
            FROM pop_vaccinated"""

# Fetch only the top 15 results
[print(i,'\n') for i in cur.execute(statement).fetchmany(15)]

#continent location      date        population  New_vaccinations rolling_sum_of_new_vaccinations

('Asia', 'Afghanistan', '2021-05-27', 41128772.0, 2859.0, 2859, 0.007) 

('Asia', 'Afghanistan', '2021-06-03', 41128772.0, 4015.0, 6874, 0.017) 

('Asia', 'Afghanistan', '2022-01-27', 41128772.0, 6868.0, 13742, 0.033) 

('Asia', 'Afghanistan', '2022-04-27', 41128772.0, 383.0, 14125, 0.034) 

('Asia', 'Afghanistan', '2022-09-12', 41128772.0, 9447.0, 23572, 0.057) 

('Asia', 'Afghanistan', '2022-11-02', 41128772.0, 36587.0, 60159, 0.146) 

('Asia', 'Afghanistan', '2022-11-16', 41128772.0, 14800.0, 74959, 0.182) 

('Europe', 'Albania', '2021-01-13', 2842318.0, 60.0, 60, 0.002) 

('Europe', 'Albania', '2021-01-14', 2842318.0, 78.0, 138, 0.005) 

('Europe', 'Albania', '2021-01-15', 2842318.0, 42.0, 180, 0.006) 

('Europe', 'Albania', '2021-01-16', 2842318.0, 61.0, 241, 0.008) 

('Europe', 'Albania', '2021-01-17', 2842318.0, 36.0, 277, 0.01) 

('Europe', 'Albania', '2021-01-18', 2842318.0, 42.0, 319, 0.011) 

('Europe', 'Albania', '2021-01-19', 2842318.0, 36.0, 355, 0.012) 

('Europe', 'Alba

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

<font size="3"> Create a view to be able to access this joined table at any time for further querying. I'll call it "pop_vaccinated". </font> 

In [31]:
statement = """CREATE VIEW pop_vaccinated AS
            SELECT cd.continent, cd.location, cd.date, cd.population, cv.new_vaccinations,
            SUM(cast(cv.new_vaccinations as int)) OVER (PARTITION BY cd.location ORDER BY cd.location, cd.date)
            AS rolling_vaccinations
            FROM covid_deaths cd 
            INNER JOIN covid_vaccinations cv ON cd.location = cv.location AND cd.date = cv.date
            WHERE cd.continent IS NOT NULL AND cv.new_vaccinations IS NOT NULL
            """

cur.execute(statement)

<sqlite3.Cursor at 0x124e4f540>

In [32]:
# Test view:
[print(i,'\n') for i in cur.execute("select * from pop_vaccinated").fetchmany(12)]
#cur.execute("DROP VIEW IF EXISTS pop_vaccinated")

('Asia', 'Afghanistan', '2021-05-27', 41128772.0, 2859.0, 2859) 

('Asia', 'Afghanistan', '2021-06-03', 41128772.0, 4015.0, 6874) 

('Asia', 'Afghanistan', '2022-01-27', 41128772.0, 6868.0, 13742) 

('Asia', 'Afghanistan', '2022-04-27', 41128772.0, 383.0, 14125) 

('Asia', 'Afghanistan', '2022-09-12', 41128772.0, 9447.0, 23572) 

('Asia', 'Afghanistan', '2022-11-02', 41128772.0, 36587.0, 60159) 

('Asia', 'Afghanistan', '2022-11-16', 41128772.0, 14800.0, 74959) 

('Europe', 'Albania', '2021-01-13', 2842318.0, 60.0, 60) 

('Europe', 'Albania', '2021-01-14', 2842318.0, 78.0, 138) 

('Europe', 'Albania', '2021-01-15', 2842318.0, 42.0, 180) 

('Europe', 'Albania', '2021-01-16', 2842318.0, 61.0, 241) 

('Europe', 'Albania', '2021-01-17', 2842318.0, 36.0, 277) 



[None, None, None, None, None, None, None, None, None, None, None, None]

<font size="3"> Now I will load the view's contents into a pandas dataframe </font>

In [33]:
vaccines_df = pd.DataFrame(cur.execute("select * from pop_vaccinated").fetchall(), columns = ["Continent", "Location", "Date",
"Population", "New Vaccinations", "Rolling Vaccinations"])

In [34]:
vaccines_df.to_csv('vaccines.csv')