# Problem Statement

### 1. Global COVID-19 Overview:
For this question, you would likely need columns related to global COVID-19 statistics, such as "date," "total_cases," "total_deaths," and "total_recovered."

### 2. Regional Impact:
To analyze regional impact, you would need columns that identify the region or country, such as "location." You may also need columns like "new_cases" and "new_deaths" to calculate daily figures.

### 3. Vaccination Progress:
To track vaccination progress, you would need columns related to vaccine administration, such as "total_vaccinations" and "people_vaccinated." The "location" column is also important for differentiating regions.

### 4. Time Trends:
Time trends analysis would require columns like "date," "new_cases," and "new_deaths" to show changes over time.

### 5. Demographic Analysis:
Demographic analysis might involve columns like "age_group," "gender," and "total_cases" or "total_deaths" broken down by these categories.

In [14]:
# Importing libraries which will help us getting data from the web and store the same for further processing
import pandas as pd
import numpy as np
import requests
from io import StringIO

In [15]:
# Define the URL of the CSV file
url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"

# Send an HTTP GET request to the URL; the timeout keeps the cell from
# hanging indefinitely if the server stops responding
response = requests.get(url, timeout=60)

# Stop right here when the request was not successful (status code 200).
# Only printing a message would leave `new_df` undefined and make every
# later cell fail with a confusing NameError.
if response.status_code != 200:
    raise RuntimeError(f"Failed to download data (HTTP status {response.status_code})")

# Read the content of the response into a DataFrame; StringIO wraps the
# response text in the file-like object that pd.read_csv expects
new_df = pd.read_csv(StringIO(response.text))
print("Data downloaded and saved as 'new_df'")

Data downloaded and saved as 'new_df'


In [16]:
# Display the downloaded DataFrame (pandas renders a truncated preview of the first and last rows)
new_df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341308,ZWE,Africa,Zimbabwe,2023-09-09,265737.0,0.0,0.000,5717.0,0.0,0.000,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
341309,ZWE,Africa,Zimbabwe,2023-09-10,265742.0,5.0,0.714,5718.0,1.0,0.143,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
341310,ZWE,Africa,Zimbabwe,2023-09-11,265742.0,0.0,0.714,5718.0,0.0,0.143,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
341311,ZWE,Africa,Zimbabwe,2023-09-12,265742.0,0.0,0.714,5718.0,0.0,0.143,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,


In [17]:
# Summarise every column: name, non-null count and dtype
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341313 entries, 0 to 341312
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   iso_code                                    341313 non-null  object 
 1   continent                                   325089 non-null  object 
 2   location                                    341313 non-null  object 
 3   date                                        341313 non-null  object 
 4   total_cases                                 303548 non-null  float64
 5   new_cases                                   331893 non-null  float64
 6   new_cases_smoothed                          330634 non-null  float64
 7   total_deaths                                282171 non-null  float64
 8   new_deaths                                  331938 non-null  float64
 9   new_deaths_smoothed                         330708 non-null  float64
 

In [18]:
# Convert the 'date' column to pandas datetimes so it can later be sorted and stored as a real date
new_df['date'] = pd.to_datetime(new_df['date'])


In [19]:
# Display the converted column to confirm its dtype
new_df['date']

0        2020-01-03
1        2020-01-04
2        2020-01-05
3        2020-01-06
4        2020-01-07
            ...    
341308   2023-09-09
341309   2023-09-10
341310   2023-09-11
341311   2023-09-12
341312   2023-09-13
Name: date, Length: 341313, dtype: datetime64[ns]

In [20]:
# Columns that are not needed for the final dashboard, grouped by topic.
# NOTE(review): the problem statement lists "new_deaths" for the regional and
# time-trend questions, yet it is dropped here — confirm this is intended.
columns_to_drop = [
    # identifiers and smoothed / daily variants
    "iso_code",
    "new_cases_smoothed",
    "new_deaths",
    "new_deaths_smoothed",
    # per-million case and death figures
    "total_cases_per_million",
    "new_cases_per_million",
    "new_cases_smoothed_per_million",
    "total_deaths_per_million",
    "new_deaths_per_million",
    "new_deaths_smoothed_per_million",
    "reproduction_rate",
    # hospital and ICU figures
    "icu_patients_per_million",
    "hosp_patients_per_million",
    "weekly_icu_admissions",
    "weekly_icu_admissions_per_million",
    "weekly_hosp_admissions",
    "weekly_hosp_admissions_per_million",
    # testing figures
    "total_tests_per_thousand",
    "new_tests_per_thousand",
    "new_tests_smoothed",
    "new_tests_smoothed_per_thousand",
    "tests_per_case",
    "tests_units",
    # vaccination figures
    "new_vaccinations_smoothed",
    "total_vaccinations_per_hundred",
    "people_vaccinated_per_hundred",
    "people_fully_vaccinated_per_hundred",
    "total_boosters_per_hundred",
    "new_vaccinations_smoothed_per_million",
    "new_people_vaccinated_smoothed",
    "new_people_vaccinated_smoothed_per_hundred",
    # policy, demographic and health indicators
    "stringency_index",
    "median_age",
    "aged_65_older",
    "aged_70_older",
    "extreme_poverty",
    "cardiovasc_death_rate",
    "handwashing_facilities",
    "hospital_beds_per_thousand",
    "life_expectancy",
    "human_development_index",
    # excess mortality figures
    "excess_mortality_cumulative_absolute",
    "excess_mortality_cumulative",
    "excess_mortality",
    "excess_mortality_cumulative_per_million",
]

In [21]:
# Remove the unwanted columns from our dataset (new_df is modified in place)
new_df.drop(columns=columns_to_drop, inplace=True)

In [22]:
# Let's check which columns remain after the drop
new_df.columns

Index(['continent', 'location', 'date', 'total_cases', 'new_cases',
       'total_deaths', 'icu_patients', 'hosp_patients', 'total_tests',
       'new_tests', 'positive_rate', 'total_vaccinations', 'people_vaccinated',
       'people_fully_vaccinated', 'total_boosters', 'new_vaccinations',
       'population_density', 'gdp_per_capita', 'diabetes_prevalence',
       'female_smokers', 'male_smokers', 'population'],
      dtype='object')

* We will deal with the null values in Microsoft Power BI. So, let's save the data in a CSV file, which can then be connected to Power BI for report and dashboard generation.

In [23]:
# Count the missing values in each remaining column and display the result
null_counts = new_df.isna().sum()
null_counts

continent                   16224
location                        0
date                            0
total_cases                 37765
new_cases                    9420
total_deaths                59142
icu_patients               303932
hosp_patients              302814
total_tests                261926
new_tests                  265910
positive_rate              245386
total_vaccinations         262858
people_vaccinated          266218
people_fully_vaccinated    269585
total_boosters             294550
new_vaccinations           276723
population_density          51628
gdp_per_capita              77278
diabetes_prevalence         63223
female_smokers             142833
male_smokers               145533
population                      0
dtype: int64

* Step 1 of dealing with nulls: In all columns whose rows do not hold the sum/aggregate of all previous rows (e.g. new_cases, new_tests), we will replace null values with zero.

In [24]:
# Columns in which NaN is to be replaced with zero
columns_to_replace_with_zero = [
    'new_cases',
    'icu_patients',
    'hosp_patients',
    'new_tests',
    'positive_rate',
    'new_vaccinations',
    'population_density',
    'gdp_per_capita',
    'diabetes_prevalence',
    'female_smokers',
    'male_smokers',
]

# Map every one of those columns to its replacement value (0)
replacement_values = dict.fromkeys(columns_to_replace_with_zero, 0)

# Passing a dict to fillna restricts the replacement to the listed columns
new_df.fillna(value=replacement_values, inplace=True)

# Null counts per column after the transformation
new_null_counts = new_df.isnull().sum()

# Print the remaining null counts
print(new_null_counts)

continent                   16224
location                        0
date                            0
total_cases                 37765
new_cases                       0
total_deaths                59142
icu_patients                    0
hosp_patients                   0
total_tests                261926
new_tests                       0
positive_rate                   0
total_vaccinations         262858
people_vaccinated          266218
people_fully_vaccinated    269585
total_boosters             294550
new_vaccinations                0
population_density              0
gdp_per_capita                  0
diabetes_prevalence             0
female_smokers                  0
male_smokers                    0
population                      0
dtype: int64


* Step 2 of dealing with nulls: In all columns whose rows hold the sum/aggregate of all previous rows (e.g. total_cases, total_tests), we will replace null values with the last available value for the same location.

(For example, if on 15-09-2023 the value of total_cases for a particular country is 100000, and on 16-09-2023 the value of total_cases is NULL, then we want this NULL value to be replaced with the value 100000.)

In [25]:
# Cumulative columns in which NaN is replaced with the most recent earlier
# value of the same location
columns_to_replace_with_ffill = ['total_cases','total_deaths','total_tests','total_vaccinations','people_vaccinated',
                                        'people_fully_vaccinated','total_boosters','population']

# Sort by location and date first, so that "previous row" really means
# "previous date of the same location"
new_df.sort_values(by=['location', 'date'], ascending=[True, True], inplace=True)

# Forward-fill inside each location only. A plain frame-wide ffill would carry
# the last value of one location into the leading NaN rows of the next one.
new_df[columns_to_replace_with_ffill] = (
    new_df.groupby('location')[columns_to_replace_with_ffill].ffill()
)

# Whatever is still NaN has no earlier value within its location: use 0
new_df[columns_to_replace_with_ffill] = new_df[columns_to_replace_with_ffill].fillna(0)

# Null counts per column after the transformation
new_null_counts3 = new_df.isnull().sum()

# Print the remaining null counts
print(new_null_counts3)

continent                  16224
location                       0
date                           0
total_cases                    0
new_cases                      0
total_deaths                   0
icu_patients                   0
hosp_patients                  0
total_tests                    0
new_tests                      0
positive_rate                  0
total_vaccinations             0
people_vaccinated              0
people_fully_vaccinated        0
total_boosters                 0
new_vaccinations               0
population_density             0
gdp_per_capita                 0
diabetes_prevalence            0
female_smokers                 0
male_smokers                   0
population                     0
dtype: int64


In [26]:
# Re-inspect column dtypes and non-null counts after the cleaning steps
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 341313 entries, 0 to 341312
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   continent                325089 non-null  object        
 1   location                 341313 non-null  object        
 2   date                     341313 non-null  datetime64[ns]
 3   total_cases              341313 non-null  float64       
 4   new_cases                341313 non-null  float64       
 5   total_deaths             341313 non-null  float64       
 6   icu_patients             341313 non-null  float64       
 7   hosp_patients            341313 non-null  float64       
 8   total_tests              341313 non-null  float64       
 9   new_tests                341313 non-null  float64       
 10  positive_rate            341313 non-null  float64       
 11  total_vaccinations       341313 non-null  float64       
 12  people_vaccinate

In [27]:
pip install pandas sqlalchemy pyodbc

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [28]:
import pyodbc

# SQL Server connection details
server = 'ANKUR_HP'       # SQL Server instance name or IP address
database = 'covid_19_dashboard_database'   # target database name

# ODBC connection string using Windows Authentication (Trusted_Connection)
conn_str = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    f'SERVER={server};'
    f'DATABASE={database};'
    'Trusted_Connection=yes;'
)

# Open the database connection
conn = pyodbc.connect(conn_str)

# Name of the table that receives the dashboard data
table_name = 'table_for_covid_19_dashboard'

In [29]:
# List the tables of the current database, releasing the cursor and the
# connection afterwards so that no database handle stays open in the kernel
conn = None
cursor = None
try:
    # Establish the database connection
    conn = pyodbc.connect(conn_str)

    # Create a cursor
    cursor = conn.cursor()

    # Fetch the list of table names in the current database
    table_names = cursor.tables(tableType='TABLE')

    # Print the list of table names
    for table in table_names:
        print(table.table_name)

except Exception as e:
    print(f'Error: {str(e)}')

finally:
    # Close the cursor and the connection, whether or not the listing succeeded
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()

table_for_covid_19_dashboard
trace_xe_action_map
trace_xe_event_map


In [30]:
from sqlalchemy import create_engine

# Define the SQL Server connection details
server = 'ANKUR_HP'       # SQL Server instance name or IP address
database = 'covid_19_dashboard_database'   # target database name

# SQLAlchemy URL for the pyodbc driver; no user/password is given in the URL
conn_str = f'mssql+pyodbc://@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server'

# Create a database engine
engine = create_engine(conn_str)

# Name of the table that receives the dashboard data
table_name = 'table_for_covid_19_dashboard'

try:
    # Save the DataFrame to the SQL table using the SQLAlchemy engine;
    # if_exists='replace' overwrites an existing table of the same name
    new_df.to_sql(table_name, con=engine, if_exists='replace', index=False)
    print(f'DataFrame successfully saved to the table: {table_name}')
except Exception as e:
    print(f'Error: {str(e)}')
finally:
    # Release the engine's pooled connections; the engine can still reconnect
    # on demand if it is used again later
    engine.dispose()

DataFrame successfully saved to the table: table_for_covid_19_dashboard
