# COVID Climate Analysis

In [1]:
import os
import sys
import cdsapi
import cdsapi
import requests

In [2]:
# Since we share the source code module in a same level folder:
sys.path.append('..')

In [3]:
import numpy as np
import pandas as pd
import xarray as xr
import plotnine as p9
import geopandas as gpd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from tqdm import tqdm
#from atmos import calculate
from shapely.geometry import Point, MultiPolygon
from collections import defaultdict
from metpy.calc import relative_humidity_from_dewpoint
from mizani.formatters import date_format, percent_format



### Collecting Data

#### COVID 19

We fetch the COVID19 daily cases from the OWD's open data repository, given a start and end date:

In [6]:
def COVID_Dataset(start_date, end_date):
    
    # Read OWID COVID-19 dataset
    rawdata = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv')

    # Convert date columns to datetime type
    rawdata['date'] = pd.to_datetime(rawdata['date'])

    # Filter the DataFrame by date range
    rawdata = rawdata[(rawdata['date'] >= start_date) & (rawdata['date'] <= end_date)]

    # Extract list of continents in dataset
    continents = rawdata['continent'].unique()
    continents = [x for x in continents if str(x) != 'nan']
   
    # Initialize an empty list to store DataFrames
    dfs = []        

    # Set an empty DataFrame to further append all countries with processed data
    ColumnsDF=['date','continent','location','iso_code','new_cases','total_cases','total_deaths',
                'new_vaccinations','people_vaccinated','people_fully_vaccinated',
               'total_boosters','population','population_density','total_vaccinations']
    
    FullData=pd.DataFrame(columns=ColumnsDF)
    
     #-----------Creation of directories to save data-------------------------
    path0 = os.getcwd()
    parent_directory = os.path.dirname(path0)
    data_directory = os.path.join(parent_directory, 'data')
    try:
        os.mkdir(data_directory)
    except OSError:
        print ("Directory %s already exists" % data_directory)
    else:
        print ("Successfully created the directory %s" % data_directory)
    
    # ----------------Iterating over continents i-------------------------
    for i in continents:
        #print('Continent: ' + i)
        data = rawdata[rawdata['continent'] == i]
        countries = data['location'].unique()
        # ---------------Going inside continents: iterating over countries j -----------
        for j in countries:
            #print('Country: ' + j)
            data_i = data[data['location'] == j]
            if len(data_i)>1:
                time = pd.to_datetime(data_i['date'])
                iso_code = data_i['iso_code']
                Total_cases = data_i['total_cases']
                Total_deaths = data_i['total_deaths']
                New_cases = data_i['new_cases_smoothed']
                Population = data_i['population']
                Density = data_i['population_density']
                
                # Recovered = pd.DataFrame(GetRecovered(data_i['total_cases'].values, data_i['total_deaths'].values)).squeeze()
                
                # -----------Filling the gaps of the vaccinated population----------
                New_vaccinations = data_i['new_vaccinations_smoothed'].replace([0,np.nan], method='ffill')
                New_vaccinations = New_vaccinations.replace([np.nan], 0)
                
                People_fully_vaccinated = data_i['people_fully_vaccinated'].replace([0,np.nan], method='ffill')
                People_fully_vaccinated = People_fully_vaccinated.replace([np.nan], 0)
                
                People_vaccinated = data_i['people_vaccinated'].replace([0,np.nan], method='ffill')
                People_vaccinated = People_vaccinated.replace([np.nan], 0)
                
                Total_boosters = data_i['total_boosters'].replace([0,np.nan], method='ffill')
                Total_boosters = Total_boosters.replace([np.nan], 0)

                Total_vaccinations = data_i['total_vaccinations'].replace([0,np.nan], method='ffill')
                Total_vaccinations = Total_vaccinations.replace([np.nan], 0)
                
                Density = Density.replace([np.nan], 0)
                
                #Recovered = Recovered.replace([np.nan], 0)
                Total_deaths = Total_deaths.replace([np.nan], 0)
                New_cases = New_cases.replace([np.nan], 0)
                Total_cases = Total_cases.replace([np.nan], 0)
                #Active = Total_cases.values - Recovered.values - Total_deaths.values
                #Active[Active <0] = 0
                

                # -------------------Saving processed data-----------------------------------
                DataCountry = {'date': list(time),
                    'iso_code': list(iso_code),
                    'new_cases': list(New_cases),
                    'total_cases': list(Total_cases), 
                    'total_deaths': list(Total_deaths), 
                    'new_vaccinations': list(New_vaccinations),
                    'people_vaccinated': list(People_vaccinated),
                    'people_fully_vaccinated': list(People_fully_vaccinated),
                    'total_boosters': list(Total_boosters) ,
                    'population': list(Population) ,
                    'population_density': list(Density),
                    'total_vaccinations': list(Total_vaccinations)
                    }
                # -----------------Setting processed data into DataFrame & append into global DataFrame----------
                DataCountry = pd.DataFrame(DataCountry)
                DataCountry.insert(1, 'continent', i)
                DataCountry.insert(2, 'location', j)
                dfs.append(DataCountry)
    # -----------------Return global DataFrame---------------------
    # Concatenate all DataFrames in the list
    FullData = pd.concat(dfs, ignore_index=True)    
    FullData['date']=pd.to_datetime(FullData['date'])
    # Save the processed DataFrame as CSV
    filename = os.path.join(data_directory, 'covid19_world.csv')
    FullData.to_csv(filename, index=False)  # Save DataFrame to CSV without index

    return FullData

In [7]:
# Define start and end dates
start_date = '2023-01-01'
end_date = '2023-12-31'

# Preprocessed dataset
covid_df = COVID_Dataset(start_date, end_date)

Directory C:\Users\brand\Desktop\Final Project\data already exists




Let's take a glance at  how the first 20 days look for Spain after surpassing the 20 confirmed COVID-19 cases:

In [8]:
covid_df.query('location == "Spain" & total_cases >= 20').head(20)

Unnamed: 0,date,continent,location,iso_code,new_cases,total_cases,total_deaths,new_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,population,population_density,total_vaccinations
35277,2023-01-01,Europe,Spain,ESP,2117.571,13769785.0,119153.0,19634.0,0.0,0.0,0.0,47558632.0,93.105,0.0
35278,2023-01-02,Europe,Spain,ESP,2117.571,13769785.0,119153.0,19488.0,0.0,0.0,0.0,47558632.0,93.105,0.0
35279,2023-01-03,Europe,Spain,ESP,2117.571,13769785.0,119153.0,19342.0,41334113.0,40720794.0,26400571.0,47558632.0,93.105,104301730.0
35280,2023-01-04,Europe,Spain,ESP,2117.571,13769785.0,119153.0,20652.0,41334113.0,40720794.0,26400571.0,47558632.0,93.105,104301730.0
35281,2023-01-05,Europe,Spain,ESP,2117.571,13769785.0,119153.0,22108.0,41334113.0,40720794.0,26400571.0,47558632.0,93.105,104301730.0
35282,2023-01-06,Europe,Spain,ESP,2117.571,13769785.0,119153.0,23565.0,41334113.0,40720794.0,26400571.0,47558632.0,93.105,104301730.0
35283,2023-01-07,Europe,Spain,ESP,2117.571,13769785.0,119153.0,25021.0,41334113.0,40720794.0,26400571.0,47558632.0,93.105,104301730.0
35284,2023-01-08,Europe,Spain,ESP,1809.0,13782448.0,119355.0,26478.0,41334113.0,40720794.0,26400571.0,47558632.0,93.105,104301730.0
35285,2023-01-09,Europe,Spain,ESP,1809.0,13782448.0,119355.0,27934.0,41334113.0,40720794.0,26400571.0,47558632.0,93.105,104301730.0
35286,2023-01-10,Europe,Spain,ESP,1809.0,13782448.0,119355.0,29391.0,41334113.0,40720794.0,26400571.0,47558632.0,93.105,104301730.0


# Climate

We are fetching the weather data from the Copernicus Climate Data Store (CDS), which gives us access to (among others) the ERA5 hourly data on single levels from 1979 to present. We download the grid at a 0.5x0.5 degree resolution directly through the API.

For the API calls to work, a client needs to be instantiated which will read the private keys in /home/$USER/.cdsapirc:

#### Retrieval from CDS API

In [15]:
c = cdsapi.Client()

In [19]:
def GetClimate(years):
    
    output_directory = "../data/climate"
    os.makedirs(output_directory, exist_ok=True)

    # Retrieve all months for a given year.
    months = [str(month).zfill(2) for month in range(1, 13)]  # All months from 01 to 12

    # Select the required statistic and variable
    stat = "daily_mean"

    # For valid keywords, see Table 2 of:
    # https://datastore.copernicus-climate.eu/documents/app-c3s-daily-era5-statistics/C3S_Application-Documentation_ERA5-daily-statistics-v2.pdf
    
    for yr in years:
        for mn in months:
            file_name = "download_" + stat + "_" + yr + "_" + mn + ".nc"

            # Prepend the directory to the file name
            file_path = os.path.join(output_directory, file_name)

            # Check if file already exists
            if os.path.exists(file_path):
                print(f"File '{file_name}' already exists. Skipping...")
                continue

            result = c.service(
                "tool.toolbox.orchestrator.workflow",
                params={
                    "realm": "user-apps",
                    "project": "app-c3s-daily-era5-statistics",
                    "version": "master",
                    "kwargs": {
                        "dataset": "reanalysis-era5-single-levels",
                        "product_type": "reanalysis",
                        "variable": [
                            '2m_dewpoint_temperature',
                            '2m_temperature',
                            'mean_sea_level_pressure',
                            'total_precipitation',
                        ],
                        "statistic": stat,
                        "year": yr,
                        "month": mn,
                        "time_zone": "UTC+00:0",
                        "frequency": "1-hourly",
                        # Users can change the output grid resolution and selected area
                        "grid": "0.5/0.5",
                        # "area":{"lat": [10, 60], "lon": [65, 140]}
                    },
                    "workflow_name": "application"
                })

            location = result[0]['location']
            res = requests.get(location, stream=True)
            print("Writing data to " + file_name)
            with open(file_path, 'wb') as fh:
                for r in res.iter_content(chunk_size=1024):
                    fh.write(r)

In [20]:
#Usage:
years = ['2023']

GetClimate(years)

2024-04-07 18:29:53,856 INFO Welcome to the CDS
2024-04-07 18:29:53,856 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/tasks/services/tool/toolbox/orchestrator/workflow/clientid-da35c65890a9431ca5322cb14d25689e
2024-04-07 18:29:53,938 INFO Request is queued
2024-04-07 18:29:54,975 INFO Request is running
2024-04-07 18:29:58,806 INFO Request is failed
2024-04-07 18:29:58,806 ERROR Message: 
2024-04-07 18:29:58,806 ERROR Reason:  Traceback (most recent call last):
  File "/opt/cdstoolbox/cdscompute/cdscompute/cdshandlers/services/handler.py", line 59, in handle_request
    result = cached(context.method, proc, context, context.args, context.kwargs)
  File "/opt/cdstoolbox/cdscompute/cdscompute/caching.py", line 108, in cached
    result = proc(context, *context.args, **context.kwargs)
  File "/opt/cdstoolbox/cdscompute/cdscompute/services.py", line 124, in __call__
    return p(*args, **kwargs)
  File "/opt/cdstoolbox/cdscompute/cdscompute/services.py", line 60, in __ca

Exception: . Traceback (most recent call last):
  File "/opt/cdstoolbox/cdscompute/cdscompute/cdshandlers/services/handler.py", line 59, in handle_request
    result = cached(context.method, proc, context, context.args, context.kwargs)
  File "/opt/cdstoolbox/cdscompute/cdscompute/caching.py", line 108, in cached
    result = proc(context, *context.args, **context.kwargs)
  File "/opt/cdstoolbox/cdscompute/cdscompute/services.py", line 124, in __call__
    return p(*args, **kwargs)
  File "/opt/cdstoolbox/cdscompute/cdscompute/services.py", line 60, in __call__
    return self.proc(context, *args, **kwargs)
  File "/home/cds/cdsservices/services/python_service.py", line 38, in execute
    raise exceptions.InternalError(logging + traceback, '')
cdsclient.exceptions.InternalError: Traceback (most recent call last):
  File "/opt/cdstoolbox/jsonrequest/jsonrequest/requests.py", line 71, in jsonrequestcall
    resp = coding.encode(req.callable(*req.args, **req.kwargs), register=encoders, **context)
  File "/opt/cdstoolbox/cdstools/cdstools/util.py", line 367, in resample
    resample = data.resample(label=label, closed=closed, keep_attrs=keep_attrs,
AttributeError: 'list' object has no attribute 'resample'.

## Data Processing

### Climate

#### Filtering relevant coordinates

In [None]:
if not os.path.exists('../../data/coords_region.csv'):
    lon_lat = (xr.load_dataset('../../data/era5/world/2020_02_0_weather.nc')
               .to_dataframe()
               .reset_index()
               [['longitude', 'latitude']]
               .drop_duplicates()
               .assign(longitude=lambda dd: dd.longitude.apply(lambda x: x if x < 180 else x - 360))
              )