In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
def GetDataset(start_date, end_date):
    """
    Fills and filters the OWID COVID-19 dataset by a range of dates.

    Parameters:
        start_date (str): The start date of the range in the format 'YYYY-MM-DD'.
        end_date (str): The end date of the range in the format 'YYYY-MM-DD'.

    Returns:
        pd.DataFrame: Filtered and filled DataFrame containing only the rows within the specified date range.
    """
    # Read OWID COVID-19 dataset
    rawdata = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv')

    # Convert date columns to datetime type
    rawdata['date'] = pd.to_datetime(rawdata['date'])

    # Filter the DataFrame by date range
    rawdata = rawdata[(rawdata['date'] >= start_date) & (rawdata['date'] <= end_date)]

    # Extract list of continents in dataset
    continents = rawdata['continent'].unique()
    continents = [x for x in continents if str(x) != 'nan']
   
    # Initialize an empty list to store DataFrames
    dfs = []        

    # Set an empty DataFrame to further append all countries with processed data
    ColumnsDF=['continent','location','date','new_cases','total_cases','total_deaths',
                'new_vaccinations','people_vaccinated','people_fully_vaccinated',
               'total_boosters','population','population_density','total_vaccinations']
    
    FullData=pd.DataFrame(columns=ColumnsDF)
     #-----------Creation of directories to save data-------------------------
    path0 = os.getcwd()
    parent_directory = os.path.dirname(path0)
    data_directory = os.path.join(parent_directory, 'data')
    try:
        os.mkdir(data_directory)
    except OSError:
        print ("Directory %s already exists" % data_directory)
    else:
        print ("Successfully created the directory %s" % data_directory)
    
    # ----------------Iterating over continents i-------------------------
    for i in continents:
        #print('Continent: ' + i)
        data = rawdata[rawdata['continent'] == i]
        countries = data['location'].unique()
        # ---------------Going inside continents: iterating over countries j -----------
        for j in countries:
            #print('Country: ' + j)
            data_i = data[data['location'] == j]
            if len(data_i)>1:
                time = pd.to_datetime(data_i['date'])
                Total_cases = data_i['total_cases']
                Total_deaths = data_i['total_deaths']
                New_cases = data_i['new_cases_smoothed']
                Population = data_i['population']
                Density = data_i['population_density']
                
                # Recovered = pd.DataFrame(GetRecovered(data_i['total_cases'].values, data_i['total_deaths'].values)).squeeze()
                
                # -----------Filling the gaps of the vaccinated population----------
                New_vaccinations = data_i['new_vaccinations_smoothed'].replace([0,np.nan], method='ffill')
                New_vaccinations = New_vaccinations.replace([np.nan], 0)
                
                People_fully_vaccinated = data_i['people_fully_vaccinated'].replace([0,np.nan], method='ffill')
                People_fully_vaccinated = People_fully_vaccinated.replace([np.nan], 0)
                
                People_vaccinated = data_i['people_vaccinated'].replace([0,np.nan], method='ffill')
                People_vaccinated = People_vaccinated.replace([np.nan], 0)
                
                Total_boosters = data_i['total_boosters'].replace([0,np.nan], method='ffill')
                Total_boosters = Total_boosters.replace([np.nan], 0)

                Total_vaccinations = data_i['total_vaccinations'].replace([0,np.nan], method='ffill')
                Total_vaccinations = Total_vaccinations.replace([np.nan], 0)
                
                Density = Density.replace([np.nan], 0)
                
                #Recovered = Recovered.replace([np.nan], 0)
                Total_deaths = Total_deaths.replace([np.nan], 0)
                New_cases = New_cases.replace([np.nan], 0)
                Total_cases = Total_cases.replace([np.nan], 0)
                #Active = Total_cases.values - Recovered.values - Total_deaths.values
                #Active[Active <0] = 0
                

                # -------------------Saving processed data-----------------------------------
                DataCountry = {'date': list(time),
                    'new_cases': list(New_cases),
                    'total_cases': list(Total_cases), 
                    'total_deaths': list(Total_deaths), 
                    'new_vaccinations': list(New_vaccinations),
                    'people_vaccinated': list(People_vaccinated),
                    'people_fully_vaccinated': list(People_fully_vaccinated),
                    'total_boosters': list(Total_boosters) ,
                    'population': list(Population) ,
                    'population_density': list(Density),
                    'total_vaccinations': list(Total_vaccinations)
                    }
                # -----------------Setting processed data into DataFrame & append into global DataFrame----------
                DataCountry = pd.DataFrame(DataCountry)
                DataCountry.insert(0, 'continent', i)
                DataCountry.insert(1, 'location', j)
                dfs.append(DataCountry)
    # -----------------Return global DataFrame---------------------
    # Concatenate all DataFrames in the list
    FullData = pd.concat(dfs, ignore_index=True)    
    FullData['date']=pd.to_datetime(FullData['date'])
    # Save the processed DataFrame as CSV
    filename = os.path.join(data_directory, 'covid19_world.csv')
    FullData.to_csv(filename, index=False)  # Save DataFrame to CSV without index

    return FullData
     


In [6]:
# Define start and end dates
start_date = '2022-01-01'
end_date = '2023-12-31'

# Preprocess dataset
df = GetDataset(start_date, end_date)

# Display the filtered DataFrame
print(df)

Directory c:\Users\brand\Desktop\Final Project\data already exists


  New_vaccinations = data_i['new_vaccinations_smoothed'].replace([0,np.nan], method='ffill')
  People_fully_vaccinated = data_i['people_fully_vaccinated'].replace([0,np.nan], method='ffill')
  People_vaccinated = data_i['people_vaccinated'].replace([0,np.nan], method='ffill')
  Total_boosters = data_i['total_boosters'].replace([0,np.nan], method='ffill')
  Total_vaccinations = data_i['total_vaccinations'].replace([0,np.nan], method='ffill')
  New_vaccinations = data_i['new_vaccinations_smoothed'].replace([0,np.nan], method='ffill')
  People_fully_vaccinated = data_i['people_fully_vaccinated'].replace([0,np.nan], method='ffill')
  People_vaccinated = data_i['people_vaccinated'].replace([0,np.nan], method='ffill')
  Total_boosters = data_i['total_boosters'].replace([0,np.nan], method='ffill')
  Total_vaccinations = data_i['total_vaccinations'].replace([0,np.nan], method='ffill')
  New_vaccinations = data_i['new_vaccinations_smoothed'].replace([0,np.nan], method='ffill')
  People_fully_va

            continent     location       date  new_cases  total_cases  \
0                Asia  Afghanistan 2022-01-01     22.429     157902.0   
1                Asia  Afghanistan 2022-01-02     30.000     158112.0   
2                Asia  Afghanistan 2022-01-03     30.000     158112.0   
3                Asia  Afghanistan 2022-01-04     30.000     158112.0   
4                Asia  Afghanistan 2022-01-05     30.000     158112.0   
...               ...          ...        ...        ...          ...   
175599  South America    Venezuela 2023-12-27      0.000     552695.0   
175600  South America    Venezuela 2023-12-28      0.000     552695.0   
175601  South America    Venezuela 2023-12-29      0.000     552695.0   
175602  South America    Venezuela 2023-12-30      0.000     552695.0   
175603  South America    Venezuela 2023-12-31      0.000     552695.0   

        total_deaths  new_vaccinations  people_vaccinated  \
0             7352.0           16935.0                0.0   
1