In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import math
from scipy.stats import chi2_contingency
import seaborn as sns
import folium
from folium import plugins 
from folium.plugins import HeatMap, MarkerCluster
import shapely
import json 
from shapely.geometry import Point, MultiPoint, LineString, MultiLineString, Polygon, MultiPolygon
import branca
import branca.colormap as cm
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr

In [None]:
# Base directories of the project data tree (relative to the notebook).
PATH = {
    "data_raw": "../data/raw/",
    "data_interim": "../data/interim/",
    "data_processed": "../data/processed/",
    "data_external": "../data/external/",
}

# Sub-directory of each raw data source.
SUBPATH = {
    "corona": "corona/",
    "meta": "metadata/",
    "shape": "shapefiles/",
    "weather": "weather/",
}

# File name of each raw data source.
FILENAME = {
    "corona": "de_corona.csv",
    "meta": "de_metadata.json",
    "shape": "de.geojson",
    "weather": "weather.csv",
}

# Full relative path of every raw input file: base dir + sub-dir + file name.
corona = "".join((PATH["data_raw"], SUBPATH["corona"], FILENAME["corona"]))
meta = "".join((PATH["data_raw"], SUBPATH["meta"], FILENAME["meta"]))
shape = "".join((PATH["data_raw"], SUBPATH["shape"], FILENAME["shape"]))
weather = "".join((PATH["data_raw"], SUBPATH["weather"], FILENAME["weather"]))

In [None]:
from collections import Counter

# Raw inputs keyed by data-source name.
raw_data = {}

# The corona table is read as tab-separated text.
raw_data['corona'] = pd.read_csv(corona,sep = '\t')

# The metadata and shape files are JSON documents (.json / .geojson), not
# tab-separated tables, so they are parsed with the json module instead of
# read_csv. The values stored here are the parsed JSON objects.
with open(meta, 'r', encoding = 'utf-8') as metadata_file:
    raw_data['metadata'] = json.load(metadata_file)
with open(shape, 'r', encoding = 'utf-8') as shape_file:
    raw_data['shape'] = json.load(shape_file)

# The weather table is read as tab-separated text.
raw_data['weather'] = pd.read_csv(weather,sep = '\t')

In [None]:
# Tabular sources whose dimensions are reported below.
HEADERS = ['corona', 'weather']
for i in HEADERS:
    # One line per source: its name followed by the (rows, columns) tuple.
    print(f"{i}  \tDataFrame shape: {raw_data[i].shape}")

In [None]:
def check_null_values(file):
    """Print whether the table stored under ``raw_data[file]`` has any null cell.

    Parameters
    ----------
    file : key into the module-level ``raw_data`` dict.

    Returns
    -------
    None. The verdict is written to standard output.
    """
    has_nulls = raw_data[file].isnull().values.any()
    message = (
        'There are null values in the dataset'
        if has_nulls
        else 'There are no null values in the dataset'
    )
    print(message)

In [None]:
# Report whether the corona table contains any missing cell.
check_null_values(file='corona')

In [None]:
# Keep only the weather rows whose ISO 3166-2 code starts with "DE".
# na=False makes rows with a missing code evaluate to False instead of NaN;
# a mask containing NaN cannot be used for boolean indexing.
weather_data_de = raw_data["weather"][
    raw_data["weather"]["iso3166-2"].str.startswith("DE", na=False)
]
weather_data_de

In [None]:
def dataset_checker_values(dataset,value):
    """Draw the missingno matrix of ``dataset`` with ``value`` treated as missing.

    Parameters
    ----------
    dataset : pandas DataFrame to inspect; it is not modified.
    value : sentinel to be counted as missing; every occurrence is replaced
        by NaN in a new frame before plotting.

    Returns
    -------
    None. The figure is produced by ``msno.matrix`` as a side effect.
    """
    # replace() returns a new frame, so the caller's data stays untouched.
    SA = dataset.replace(value, np.nan)
    msno.matrix(SA)

In [None]:
#dataset_checker_values(raw_data['corona'],-999)

# Association Build-up

In [None]:
# Load the country metadata (one record per region) from the JSON file.
with open(meta, 'r',encoding = 'utf-8') as j:
    contents = json.load(j)
country_metadata = pd.DataFrame(contents['country_metadata'])
sorted_country_metadata = country_metadata.sort_values(by = 'iso3166-2_name_en')

# Lookup tables keyed by the region code used in the corona data, so that the
# metadata is attached by key and not by row position. Attaching by position
# silently mixes up regions whenever the alphabetical order of the English
# names differs from the order of the region codes.
iso_code_by_region = dict(zip(country_metadata['covid_region_code'],
                              country_metadata['iso3166-2_code']))
population_by_region = dict(zip(country_metadata['covid_region_code'],
                                country_metadata['population']))

# Total of covid cases per region. numeric_only=True keeps text columns
# (e.g. the date strings) out of the sum.
corona_by_region = raw_data['corona'].groupby(by = 'region_code').sum(numeric_only = True)
# Regions without a metadata record get NaN instead of a wrong neighbour value.
corona_by_region['iso3166-2_code'] = corona_by_region.index.map(iso_code_by_region)
corona_by_region['population'] = corona_by_region.index.map(population_by_region)
#corona_by_region
    
#final_df

In [None]:
# Similar to how Michele did it: translate each corona region code into its
# ISO 3166-2 code using the metadata records.
corona_df = raw_data['corona']
ss = {
    record['covid_region_code']: record['iso3166-2_code']
    for record in contents['country_metadata']
}
# corona_df is the same object as raw_data['corona'], so the new column is
# added to the raw table as well.
corona_df['region'] = corona_df['region_code'].map(ss)
corona_df

In [None]:
# Join corona and weather rows that share the same day and region, then drop
# the join/identifier columns that are no longer needed.
daily_corona_weather = (
    corona_df
    .merge(
        weather_data_de,
        left_on=["date", "region"],
        right_on=["date", "iso3166-2"],
    )
    .drop(columns=["date", "region_code", "region"])
)
#daily_corona_weather

In [None]:
# Mean of every numeric weather variable per region. numeric_only=True skips
# text columns such as the date strings, which cannot be averaged.
weather_by_region = weather_data_de.groupby(by = 'iso3166-2').mean(numeric_only = True)
#weather_by_region

In [None]:
# Monthly mean of every numeric weather variable (index: month number).
weather_date = pd.to_datetime(weather_data_de['date'])
# numeric_only=True skips the text columns (date, region code), which cannot
# be averaged.
weather_month = weather_data_de.groupby(weather_date.dt.month).mean(numeric_only = True)
# Shift the temperature by -273.15 to obtain the Celsius column.
celsius = weather_month['TemperatureAboveGround'].subtract(273.15).to_frame()
weather_month['Temp_Celsius'] = celsius['TemperatureAboveGround']

In [None]:
#function for plotting bar plots for weather
def weather_mean_bar_plot(column,df):
    """Draw a bar chart of ``df[column]`` with one bar per row, labelled by month.

    Parameters
    ----------
    column : name of the column to plot; also used in the title.
    df : frame whose index holds month numbers (1-12), e.g. the result of a
        group-by on the month of a date column.

    Returns
    -------
    None. The chart is drawn on a newly created 16x4 figure.
    """
    import calendar

    def _month_label(value):
        # Month numbers become month names; anything else is shown verbatim.
        try:
            number = int(value)
        except (TypeError, ValueError):
            return str(value)
        if number == value and 1 <= number <= 12:
            return calendar.month_name[number]
        return str(value)

    series = df[column]
    fig,ax = plt.subplots(figsize = (16,4))
    # Draw on the axes created above rather than on the implicit current axes.
    series.plot.bar(ax = ax,color = 'silver',edgecolor = 'black',rot = 0)
    ax.set_title('Mean of' + ' '+ column)
    ax.set_ylabel('Value')
    ax.set_xlabel('Month')
    # Labels are derived from the index, so their number always matches the
    # number of bars and each bar carries the name of its own month.
    ax.set_xticklabels([_month_label(month) for month in series.index])
#weather_mean_bar_plot('SolarRadiation',weather_month)

In [None]:
# Monthly totals and means of the corona figures for the year 2020.
corona_date = pd.to_datetime(raw_data['corona']['date'])
corona_2020 = raw_data['corona'][corona_date.dt.year == 2020]
# numeric_only=True keeps text columns (date, region codes) out of the
# aggregation; they cannot be averaged.
corona_month_sum = corona_2020.groupby(corona_date.dt.month).sum(numeric_only = True)
corona_month_mean = corona_2020.groupby(corona_date.dt.month).mean(numeric_only = True)
# Positional slice: rows 2 to 11 of the monthly table.
# NOTE(review): this line used to read from `corona_month`, which is not defined
# anywhere in this notebook; the monthly totals are assumed to be the intended
# source because `right_one` is plotted with the totals bar plot - confirm.
right_one = corona_month_sum.iloc[1:11]

In [None]:
#function for plotting bar plots for corona infections
def corona_total_bar_plot(column,df):
    """Draw a bar chart of ``df[column]`` with one bar per row, labelled by month.

    Parameters
    ----------
    column : name of the column to plot; also used as the title.
    df : frame whose index holds month numbers (1-12), e.g. the result of a
        group-by on the month of a date column.

    Returns
    -------
    None. The chart is drawn on a newly created 16x4 figure.
    """
    import calendar

    def _month_label(value):
        # Month numbers become month names; anything else is shown verbatim.
        try:
            number = int(value)
        except (TypeError, ValueError):
            return str(value)
        if number == value and 1 <= number <= 12:
            return calendar.month_name[number]
        return str(value)

    series = df[column]
    fig,ax = plt.subplots(figsize = (16,4))
    # Draw on the axes created above rather than on the implicit current axes.
    series.plot.bar(ax = ax,color = 'silver',edgecolor = 'black',rot = 0)
    ax.set_title(column)
    ax.set_ylabel('Value')
    ax.set_xlabel('Month')
    # Labels are derived from the index, so their number always matches the
    # number of bars and each bar carries the name of its own month.
    ax.set_xticklabels([_month_label(month) for month in series.index])
#corona_total_bar_plot('confirmed_addition',right_one)
#right_one

# Associations

In [None]:
# Corona figures and weather conditions, both grouped by month, joined on the
# shared 'date' key (once with the monthly totals, once with the monthly means).
df_monthly_sum = pd.merge(
    corona_month_sum, weather_month, left_on='date', right_on='date'
)
df_monthly_mean = pd.merge(
    corona_month_mean, weather_month, left_on='date', right_on='date'
)

In [None]:
# Pearson association between daily new confirmed cases and each weather variable.
weather_headers = ['RelativeHumiditySurface', 'SolarRadiation', 'Surfacepressure', 'TemperatureAboveGround',
             'Totalprecipitation', 'UVIndex', 'WindSpeed'] #'Temp_Celsius']
# NOTE(review): presumably a Bonferroni-style correction of the 0.001 level for
# the number of weather variables times the three correlation passes
# (Pearson, Spearman, log-Pearson) - confirm.
significance_threshold = 0.001 / (len(weather_headers) * 3)
corrs = []
pvalues = []
for var in weather_headers:
    corr, pvalue = pearsonr(daily_corona_weather['confirmed_addition'], daily_corona_weather[var])
    # Keep every result; without this only the last variable's values survive.
    corrs.append(corr)
    pvalues.append(pvalue)

# One row per weather variable: coefficient, p-value and significance flag.
pearson_results = pd.DataFrame({'corr': corrs, 'pvalue': pvalues}, index = weather_headers)
pearson_results['significant'] = pearson_results['pvalue'] < significance_threshold

In [None]:
# Spearman association between daily new confirmed cases and each weather variable.
spearman_corrs = []
spearman_pvalues = []
for var in weather_headers:
    corr, pvalue = spearmanr(daily_corona_weather['confirmed_addition'], daily_corona_weather[var])
    # Keep every result; without this only the last variable's values survive.
    spearman_corrs.append(corr)
    spearman_pvalues.append(pvalue)

# One row per weather variable: coefficient, p-value and significance flag.
spearman_results = pd.DataFrame({'corr': spearman_corrs, 'pvalue': spearman_pvalues},
                                index = weather_headers)
spearman_results['significant'] = spearman_results['pvalue'] < significance_threshold

In [None]:
# Logarithmic association: Pearson correlation between log(daily new confirmed
# cases) and each weather variable.
# The logarithm is only defined for positive counts: log(0) is -inf and the log
# of a negative value is NaN, either of which makes the correlation undefined.
# Rows with a non-positive count are therefore left out.
positive_cases = daily_corona_weather[daily_corona_weather['confirmed_addition'] > 0]
log_confirmed = np.log(positive_cases['confirmed_addition'])
log_corrs = []
log_pvalues = []
for var in weather_headers:
    corr, pvalue = pearsonr(log_confirmed, positive_cases[var])
    # Keep every result; without this only the last variable's values survive.
    log_corrs.append(corr)
    log_pvalues.append(pvalue)

# One row per weather variable: coefficient, p-value and significance flag.
log_pearson_results = pd.DataFrame({'corr': log_corrs, 'pvalue': log_pvalues},
                                   index = weather_headers)
log_pearson_results['significant'] = log_pearson_results['pvalue'] < significance_threshold