# Group 10 - First Year Project
### Data Science at ITU 
## Project 2 - Correlation between Covid-19's spread and weather conditions

### Description

This notebook contains all the code developed for Project 2 - Correlation between Covid-19's spread and weather conditions.

Contact/Group: 
- Florian Micliuc (flmi@itu.dk)
- Louis Caspar Brandt (locb@itu.dk)
- Iben Mai Huse (ibhu@itu.dk)
- Katalin Literati-Dobos (klit@itu.dk)
- Ruben Jonsman (rubj@itu.dk)

#  Library imports

In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import math
from collections import Counter
from scipy.stats import chi2_contingency
import seaborn as sns
import folium
from folium import plugins 
from folium.plugins import HeatMap, MarkerCluster
import shapely
import json 
from shapely.geometry import Point, MultiPoint, LineString, MultiLineString, Polygon, MultiPolygon
import branca
import branca.colormap as cm
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr

# Functions

In [None]:
def check_null_values(file):
    """Print whether ``file`` contains at least one null value.

    Parameters
    ----------
    file : pandas.DataFrame
        Table to inspect; it is only read, never modified.

    Returns
    -------
    None
        The verdict is written to standard output.
    """
    has_nulls = file.isnull().values.any()
    verdict = ('There are null values in the dataset' if has_nulls
               else 'There are no null values in the dataset')
    print(verdict)

In [None]:
def dataset_checker_values(dataset,value):
    """Show where a sentinel value occurs in ``dataset``.

    Every occurrence of ``value`` is replaced by NaN in a new frame, which is
    then drawn with missingno's matrix plot.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to inspect. It is not modified.
    value : scalar
        Sentinel to treat as missing (the notebook tries -999 and -1).
    """
    # replace() without inplace returns a new frame, so the caller's data
    # stays untouched and no explicit copy is needed.
    masked = dataset.replace(value, np.nan)
    msno.matrix(masked)

In [None]:
def weather_mean_bar_plot(column,df):
    """Draw a bar chart of ``df[column]`` titled 'Mean of <column>'.

    Parameters
    ----------
    column : str
        Name of the column to plot.
    df : pandas.DataFrame
        Frame with one row per month. When the index holds month numbers
        (1-12) the tick labels are derived from it; otherwise the original
        fixed February-December labels are used.
    """
    month_names = ['January','February','March','April','May','June','July',
                   'August','September','October','November','December']
    fig,ax = plt.subplots(figsize = (16,4))
    # Draw on the axes created above explicitly rather than relying on
    # pyplot's "current axes" state.
    df[column].plot.bar(ax = ax,color = 'silver',edgecolor = 'black',rot = 0)
    ax.set_title('Mean of' + ' '+ column)
    ax.set_ylabel('Value')
    ax.set_xlabel('Month')
    # A fixed list of 11 names only fits a frame that starts in February and
    # ends in December; deriving the names from the index keeps every bar
    # labelled with its own month.
    index_values = list(df.index)
    if all(isinstance(m, (int, np.integer)) and 1 <= m <= 12 for m in index_values):
        tick_labels = [month_names[int(m) - 1] for m in index_values]
    else:
        tick_labels = month_names[1:]
    ax.set_xticklabels(tick_labels)

In [None]:
def corona_total_bar_plot(column,df):
    """Draw a bar chart of ``df[column]`` titled with the column name.

    Parameters
    ----------
    column : str
        Name of the column to plot; also used as the title.
    df : pandas.DataFrame
        Frame with one row per month. When the index holds month numbers
        (1-12) the tick labels are derived from it; otherwise the original
        fixed February-December labels are used.
    """
    month_names = ['January','February','March','April','May','June','July',
                   'August','September','October','November','December']
    fig,ax = plt.subplots(figsize = (16,4))
    # Draw on the axes created above explicitly rather than relying on
    # pyplot's "current axes" state.
    df[column].plot.bar(ax = ax,color = 'silver',edgecolor = 'black',rot = 0)
    ax.set_title(column)
    ax.set_ylabel('Value')
    ax.set_xlabel('Month')
    # A fixed list of 11 names only fits a frame that starts in February and
    # ends in December; deriving the names from the index keeps every bar
    # labelled with its own month.
    index_values = list(df.index)
    if all(isinstance(m, (int, np.integer)) and 1 <= m <= 12 for m in index_values):
        tick_labels = [month_names[int(m) - 1] for m in index_values]
    else:
        tick_labels = month_names[1:]
    ax.set_xticklabels(tick_labels)

In [None]:
def corona_line_plot(df,x,y,title):
    """Draw a silver line plot of column ``y`` against column ``x`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    x, y : str
        Column names for the horizontal and vertical axes.
    title : str
        Plot title. The axis labels are fixed to 'Date' / 'Corona Cases'.
    """
    line_options = dict(
        x = x,
        y = y,
        rot = 45,
        figsize = (20,6),
        color = 'silver',
        title = title,
        ylabel = 'Corona Cases',
        xlabel = 'Date',
    )
    df.plot.line(**line_options)

# Task 0 - Data cleaning and filtering

## Loading data
#Description

In [None]:
# Root folders of the data tree, one per processing stage.
PATH = {
    "data_raw": "../data/raw/",
    "data_interim": "../data/interim/",
    "data_processed": "../data/processed/",
    "data_external": "../data/external/",
}

# Sub-folder of each kind of input inside a root folder.
SUBPATH = {
    "corona": "corona/",
    "meta": "metadata/",
    "shape": "shapefiles/",
    "weather": "weather/",
}

# File name of each kind of input.
FILENAME = {
    "corona": "de_corona.csv",
    "meta": "de_metadata.json",
    "shape": "de.geojson",
    "weather": "weather.csv",
}

# Full path of every raw input: <raw root> + <sub-folder> + <file name>.
corona, meta, shape, weather = (
    PATH["data_raw"] + SUBPATH[kind] + FILENAME[kind]
    for kind in ("corona", "meta", "shape", "weather")
)

In [None]:
raw_data = {}

from datetime import datetime


def dateparse(x):
    """Parse a 'YYYY-MM-DD' string into a datetime."""
    return datetime.strptime(x, '%Y-%m-%d')


# Raw strings keep the regex `\s+` free of invalid-escape warnings.
raw_data['de_corona'] = pd.read_csv(corona, sep=r'\s+', parse_dates=['date'], date_parser=dateparse)
raw_data['weather'] = pd.read_csv(weather, sep=r'\s+', parse_dates=['date'], date_parser=dateparse)

# Supplementary, tab-separated weather file. Its path is built from the same
# PATH/SUBPATH entries as the other inputs: the previous hard-coded
# '../Data/Raw/...' spelling differs in case and would not resolve on a
# case-sensitive file system.
additional = pd.read_csv(PATH["data_raw"] + SUBPATH["weather"] + 'weather2.csv',
                         sep='\t', parse_dates=['date'], date_parser=dateparse)
# ignore_index=True gives the combined table one unique running index instead
# of repeating the row labels of both files.
raw_data['weather'] = pd.concat([raw_data['weather'], additional], axis=0, ignore_index=True)

with open(meta, 'r', encoding='utf-8') as f:
    country_metadata = json.load(f)

In [None]:
HEADERS = ['de_corona','weather']
# One line per loaded table: its key followed by its (rows, columns) shape.
for table_name in HEADERS:
    print(f"{table_name}  \tDataFrame shape: {raw_data[table_name].shape}")

 ### Data type insight
 #Description

## Filtering the weather dataset

In [None]:
# Keep only the rows whose ISO 3166-2 code starts with "DE".
# na=False makes rows with a missing code count as "no match" instead of
# producing a non-boolean mask; reset_index is chained (not in-place) so the
# result is a fresh frame rather than a modified slice of the raw table.
weather_data_de = (
    raw_data["weather"]
    .loc[raw_data["weather"]["iso3166-2"].str.startswith("DE", na=False)]
    .reset_index(drop=True)
)

## Relational data table
#Description

## Sanity check
#Description

### Check for null values

In [None]:
# Null check on both tables: the COVID table first, then the German weather table.
for table in (raw_data['de_corona'], weather_data_de):
    check_null_values(table)

### Checking and visualizing for weird values

In [None]:
#dataset_checker_values(weather_data_de,-999) #-999

In [None]:
#dataset_checker_values(weather_data_de,-1) #-1

In [None]:
#dataset_checker_values(raw_data['de_corona'],-999) #-999

In [None]:
#dataset_checker_values(raw_data['de_corona'],-1) #-1

## Checking for duplicates, should we do that?

# Task 1 - Single Variable Analysis

In [None]:
# Following the approach Michele used: add a 'region' column holding the
# ISO 3166-2 code that the metadata lists for each COVID region code.
# corona_df is the same object as raw_data['de_corona'], so the column is
# added to the stored table as well.
corona_df = raw_data['de_corona']
ss = {
    entry['covid_region_code']: entry['iso3166-2_code']
    for entry in country_metadata['country_metadata']
}
corona_df['region'] = corona_df['region_code'].map(ss)

In [None]:
# Total COVID figures per region, with ISO code and population attached.
meta_df = pd.DataFrame(country_metadata['country_metadata'])
sorted_country_metadata = meta_df.sort_values(by = 'iso3166-2_name_en')

# numeric_only=True keeps the date and text columns out of the sum.
corona_by_region = raw_data['de_corona'].groupby(by = 'region_code').sum(numeric_only = True)

# Attach the metadata by looking up each region code. Pasting the sorted
# metadata on by position only works if sorting by English name happens to
# give the same order as sorting by region code.
iso_code_by_region = dict(zip(meta_df['covid_region_code'], meta_df['iso3166-2_code']))
population_by_region = dict(zip(meta_df['covid_region_code'], meta_df['population']))
region_codes = corona_by_region.index.to_series()
corona_by_region['iso3166-2_code'] = region_codes.map(iso_code_by_region)
corona_by_region['population'] = region_codes.map(population_by_region)

In [None]:
# Pair every COVID row with the weather row of the same date and region
# (default inner join), then drop the key columns that are no longer needed.
daily_corona_weather = (
    corona_df
    .merge(weather_data_de, left_on = ["date", "region"], right_on = ["date", "iso3166-2"])
    .drop(columns = ["date", "region_code", "region"])
)

# Monthly Analysis

In [None]:
# Monthly mean of the weather variables, 2020 only.
weather_date = pd.to_datetime(weather_data_de['date'])
weather_2020 = weather_data_de[weather_date.dt.year == 2020]
# numeric_only=True averages just the numeric columns; the region code (text)
# and the date itself cannot be averaged meaningfully and make mean() raise
# on recent pandas versions.
weather_month = weather_2020.groupby(weather_date.dt.month).mean(numeric_only = True)
# Subtracting 273.15 converts Kelvin to degrees Celsius
# (assumes TemperatureAboveGround is stored in Kelvin - TODO confirm).
celsius = pd.DataFrame(weather_month['TemperatureAboveGround'].subtract(273.15))
weather_month['Temp_Celsius'] = celsius['TemperatureAboveGround']

In [None]:
#weather_mean_bar_plot('SolarRadiation',weather_month)

In [None]:
# Monthly totals and monthly means of the COVID figures, 2020 only.
corona_date = pd.to_datetime(raw_data['de_corona']['date'])
corona_2020 = raw_data['de_corona'][corona_date.dt.year == 2020]
monthly_groups = corona_2020.groupby(corona_date.dt.month)
# numeric_only=True keeps the date and text columns (region codes) out of
# the aggregation.
corona_month_sum = monthly_groups.sum(numeric_only = True)
corona_month_mean = monthly_groups.mean(numeric_only = True)
# February-December, selected by month number. The bar-plot helpers label the
# bars starting with February, and a label-based slice stays correct even if
# January is absent from the data (a positional [1:12] would then drop
# February instead).
right_one = corona_month_sum.loc[2:12]

In [None]:
#corona_total_bar_plot('confirmed_addition',right_one)

In [None]:
#corona_total_bar_plot('deceased_addition',right_one)

# Region and Weather Mean

In [None]:
# Mean of every numeric weather variable per region; numeric_only=True keeps
# the date column out of the average.
weather_by_region = weather_data_de.groupby(by = 'iso3166-2').mean(numeric_only = True)

# Task 2 - Associations

In [None]:
# Monthly COVID figures (totals, then means) joined with the monthly weather
# means on 'date'.
df_monthly_sum = pd.merge(corona_month_sum, weather_month, on = 'date')
df_monthly_mean = pd.merge(corona_month_mean, weather_month, on = 'date')

In [None]:
# Weather variables tested against the COVID counts.
weather_headers = [
    'RelativeHumiditySurface',
    'SolarRadiation',
    'Surfacepressure',
    'TemperatureAboveGround',
    'Totalprecipitation',
    'UVIndex',
    'WindSpeed',
]  # 'Temp_Celsius' is deliberately left out
# Base level 0.001 divided by (number of variables x 3) - presumably a
# Bonferroni-style correction for three tests per variable; TODO confirm.
significance_threshold = 0.001 / (len(weather_headers) * 3)

In [None]:
# Pearson correlation between daily confirmed cases and each weather variable.
# The coefficients and p-values are collected (previously the two lists were
# created but never filled) and shown as one table.
corrs = []
pvalues = []
for var in weather_headers:
    corr, pvalue = pearsonr(daily_corona_weather['confirmed_addition'], daily_corona_weather[var])
    corrs.append(corr)
    pvalues.append(pvalue)

pearson_results = pd.DataFrame({'corr': corrs, 'pvalue': pvalues}, index = weather_headers)
pearson_results['significant'] = pearson_results['pvalue'] < significance_threshold
pearson_results

In [None]:
# Spearman rank correlation between daily confirmed cases and each weather
# variable. Results are kept in a table instead of being discarded.
spearman_records = []
for var in weather_headers:
    corr, pvalue = spearmanr(daily_corona_weather['confirmed_addition'], daily_corona_weather[var])
    spearman_records.append({'corr': corr, 'pvalue': pvalue})

spearman_results = pd.DataFrame(spearman_records, index = weather_headers)
spearman_results['significant'] = spearman_results['pvalue'] < significance_threshold
spearman_results

In [None]:
# Pearson correlation between log(daily confirmed cases) and each weather
# variable. The logarithm is only defined for positive counts: zeros become
# -inf and negative corrections become NaN, which corrupts the coefficient,
# so those rows are excluded first.
positive_cases = daily_corona_weather[daily_corona_weather['confirmed_addition'] > 0]
log_confirmed = np.log(positive_cases['confirmed_addition'])

log_pearson_records = []
for var in weather_headers:
    corr, pvalue = pearsonr(log_confirmed, positive_cases[var])
    log_pearson_records.append({'corr': corr, 'pvalue': pvalue})

log_pearson_results = pd.DataFrame(log_pearson_records, index = weather_headers)
log_pearson_results['significant'] = log_pearson_results['pvalue'] < significance_threshold
log_pearson_results

In [None]:
# Ordinary least squares of the daily COVID figures on the weather variables.
df = sm.add_constant(daily_corona_weather)

# The regressors must actually contain the intercept column when
# hasconst=True is passed; previously only the weather columns were selected,
# so the model was fitted without an intercept while being told it had one.
exog = df[weather_headers].assign(const = 1.0)

est_confirmed = sm.OLS(df["confirmed_addition"], exog, hasconst = True).fit()
#print(est_confirmed.summary())

est_deceased = sm.OLS(df["deceased_addition"], exog, hasconst = True).fit()
#print(est_deceased.summary())

# `est` keeps pointing at the last fit, as before.
est = est_deceased

In [None]:
# Rows inside the lockdown window, both end dates included, so the window
# matches the `Lockdown` column defined further down (which uses >=).
# With a strict > the first lockdown day belonged to neither mask.
mask = (corona_df['date'] >= "2020-11-02") & (corona_df['date'] <= "2021-03-07")
# Rows before the lockdown started.
mask1 = (corona_df['date'] < "2020-11-02")

In [None]:
# corona_df carries a region column, so a date can occur on several rows;
# plotting the raw rows joins those points into a zig-zag. Sum per date first
# so the line shows one value per day.
daily_confirmed = corona_df.groupby('date', as_index = False)['confirmed_addition'].sum()
corona_line_plot(daily_confirmed,'date','confirmed_addition','Corona lockdown plot')
# Lockdown start as days since 1970-01-01, the numeric scale the previous
# hard-coded value used (18570 is 2020-11-04, two days late).
lockdown_start_day = (pd.Timestamp("2020-11-02") - pd.Timestamp("1970-01-01")).days
plt.axvline(x = lockdown_start_day,color = 'red',ls = '--');

In [None]:
# Confirmed cases inside the lockdown window. A date can occur on several rows
# (one per region), so the values are summed per date before plotting;
# otherwise the line zig-zags between regions.
lockdown_confirmed = corona_df[mask].groupby('date', as_index = False)['confirmed_addition'].sum()
corona_line_plot(lockdown_confirmed,'date','confirmed_addition','Corona lockdown plot')

In [None]:
# Deaths over the whole period, summed per date so the line shows one value
# per day instead of jumping between the per-region rows.
daily_deceased = corona_df.groupby('date', as_index = False)['deceased_addition'].sum()
corona_line_plot(daily_deceased,'date','deceased_addition','Corona lockdown plot')
# Lockdown start as days since 1970-01-01, the numeric scale the previous
# hard-coded value used (18570 is 2020-11-04, two days late).
lockdown_start_day = (pd.Timestamp("2020-11-02") - pd.Timestamp("1970-01-01")).days
plt.axvline(x = lockdown_start_day,color = 'red',ls = '--');

In [None]:
# Deaths inside the lockdown window, summed per date before plotting so the
# line does not zig-zag between the per-region rows.
lockdown_deceased = corona_df[mask].groupby('date', as_index = False)['deceased_addition'].sum()
corona_line_plot(lockdown_deceased,'date','deceased_addition','Corona lockdown plot')

# Dummy Variable

In [None]:
# Dummy variable: True for rows dated within the lockdown window (both end
# dates included), False otherwise. Stored as a real boolean: the strings
# 'True' / 'False' are both truthy and cannot be used as a numeric regressor.
in_lockdown = (corona_df["date"] >= "2020-11-02") & (corona_df["date"] <= "2021-03-07")
corona_df['Lockdown'] = in_lockdown

# Task 3 - Map visualization

In [None]:
metadata_entries = country_metadata["country_metadata"]

# Step 1: translate the region codes of the COVID table into ISO 3166-2 codes
# through a lookup built from the metadata, applied with Series.map.
region_map = {
    entry["covid_region_code"]: entry["iso3166-2_code"]
    for entry in metadata_entries
}
raw_data["de_corona"]["region"] = raw_data["de_corona"]["region_code"].map(region_map)

# Step 2: total confirmed cases and total deaths per region.
grouped_by_region = raw_data["de_corona"].groupby(by = "region")
corona_df_by_region = grouped_by_region["confirmed_addition"].sum().reset_index()
dead_by_region = grouped_by_region["deceased_addition"].sum().reset_index()

# Step 3: population per ISO 3166-2 code, again taken from the metadata,
# then cases relative to population.
population_map = {
    entry["iso3166-2_code"]: entry["population"]
    for entry in metadata_entries
}
corona_df_by_region["population"] = corona_df_by_region["region"].map(population_map)
corona_df_by_region["ratio"] = (
    corona_df_by_region["confirmed_addition"] / corona_df_by_region["population"]
)

In [None]:
# Choropleth of the total confirmed cases per region.
n_cases = folium.Map(location = [51.5, 10.7], zoom_start = 6)

n_cases_layer = dict(
    geo_data = shape,
    name = "cases",
    data = corona_df_by_region,
    columns = ["region", "confirmed_addition"],
    key_on = "properties.iso_3166_2",  # GeoJSON property matched against 'region'
    fill_color = "OrRd",
    fill_opacity = 0.7,
    line_opacity = 0.2,
    highlight = True,
    legend_name = "Number of Cases of Covid-19",
)
folium.Choropleth(**n_cases_layer).add_to(n_cases)
n_cases

In [None]:
# Choropleth of the total deaths per region.
dead = folium.Map(location = [51.5, 10.7], zoom_start = 6)

folium.Choropleth(
    geo_data = shape,
    # Layer name describes what is shown (it was copied over as "cases").
    name = "deceased",
    data = dead_by_region,
    columns = ["region", "deceased_addition"],
    key_on = "properties.iso_3166_2",
    fill_color = "Greys",
    fill_opacity = 0.7,
    line_opacity = 0.2,
    highlight=True,
    legend_name = "Number of Deceased with a Covid-19 infection").add_to(dead)
dead

In [None]:
# Choropleth of the population per region.
p_cases = folium.Map(location = [51.5, 10.7], zoom_start = 6)

population_layer = dict(
    geo_data = shape,
    name = "population",
    data = corona_df_by_region,
    columns = ["region", "population"],
    key_on = "properties.iso_3166_2",
    fill_color = "YlGn",
    fill_opacity = 0.7,
    line_opacity = 0.2,
    highlight = True,
    legend_name = "Population",
)
folium.Choropleth(**population_layer).add_to(p_cases)
p_cases

In [None]:
# Choropleth of confirmed cases relative to population, with a choice of
# base maps and a layer switcher.
r_cases = folium.Map(location = [51.5, 10.7], zoom_start = 6)

# NOTE(review): the 'stamen...' built-in tile names may not be available in
# recent folium releases - confirm against the installed version.
tiles = ['stamenwatercolor', 'cartodbpositron', 'openstreetmap', 'stamenterrain']
for tile in tiles:
    folium.TileLayer(tile).add_to(r_cases)

ratio_layer = dict(
    geo_data = shape,
    name = "cases",
    data = corona_df_by_region,
    columns = ["region", "ratio"],
    key_on = "properties.iso_3166_2",
    fill_color = "OrRd",
    fill_opacity = 0.7,
    line_opacity = 0.2,
    legend_name = "Number of Cases pr. population of region",
    highlight = True,
)
folium.Choropleth(**ratio_layer).add_to(r_cases)

folium.LayerControl().add_to(r_cases)

r_cases

In [None]:
# Mean UV index per region, divided by 24 because a day has 24 hours. This
# includes the night, when the UV index is very small or nonexistent; it
# could be argued that 16-18 (the hours with sunlight) is the better divisor.
# The German weather table built earlier is used here: `interim_data` is
# never defined in this notebook and raised a NameError.
UV_by_region = weather_data_de.groupby(by = "iso3166-2")["UVIndex"].mean().reset_index()
UV_by_region["UVIndex_scaled"] = UV_by_region["UVIndex"]/24

In [None]:
# Choropleth of the scaled mean UV index per region.
uv_index = folium.Map(location = [51.5, 10.7], zoom_start = 6)

uv_layer = dict(
    geo_data = shape,
    name = "uv-index",
    data = UV_by_region,
    columns = ["iso3166-2", "UVIndex_scaled"],
    key_on = "properties.iso_3166_2",
    fill_color = "OrRd",
    fill_opacity = 0.7,
    line_opacity = 0.2,
    highlight = True,
    legend_name = "UVIndex",
    smooth_factor = 0,
)
folium.Choropleth(**uv_layer).add_to(uv_index)
uv_index

In [None]:
# Mean temperature above ground per region. The German weather table built
# earlier is used here: `interim_data` is never defined in this notebook and
# raised a NameError.
temp_by_region = weather_data_de.groupby(by = "iso3166-2")["TemperatureAboveGround"].mean().reset_index()
# NOTE(review): the division by 24 mirrors the UV-index cell; for a
# temperature it looks unintended - confirm what scaling is wanted.
temp_by_region["TemperatureAboveGround_scaled"] = temp_by_region["TemperatureAboveGround"]/24

In [None]:
# Choropleth of the scaled mean temperature above ground per region.
TAG = folium.Map(location = [51.5, 10.7], zoom_start = 6)

folium.Choropleth(
    geo_data = shape,
    # Layer name describes what is shown (it was copied over as "uv-index").
    name = "temperature",
    data = temp_by_region,
    columns = ["iso3166-2", "TemperatureAboveGround_scaled"],
    key_on = "properties.iso_3166_2",
    fill_color = "OrRd",
    fill_opacity = 0.7,
    line_opacity = 0.2,
    highlight=True,
    legend_name = "average Temperature Above Ground on a daily basis").add_to(TAG)
TAG

In [None]:
# Add calendar and per-capita columns to the stored COVID table.
# `corona_table` is the same object as raw_data['de_corona'], so the columns
# are written to the stored frame itself.
corona_table = raw_data['de_corona']
corona_table['month'] = corona_table['date'].dt.strftime('%b')   # abbreviated month name
corona_table['year'] = corona_table['date'].dt.strftime('%Y')    # four-digit year, as text
corona_table["population"] = corona_table["region"].map(population_map)
corona_table["confirmed_addition_ratio"] = (
    corona_table["confirmed_addition"] / corona_table["population"]
)

In [None]:
# Totals per (month, region), one table per year. numeric_only=True keeps the
# date and text columns (year, region code) out of the sum.
year_of_row = raw_data["de_corona"]["year"]
cases_2020_monthly = raw_data["de_corona"][year_of_row == "2020"].groupby(["month", "region"]).sum(numeric_only = True)
cases_2021_monthly = raw_data["de_corona"][year_of_row == "2021"].groupby(["month", "region"]).sum(numeric_only = True)

In [None]:
# One choropleth per month of confirmed cases relative to population, keyed
# "<Mon>-<year>" (e.g. "Jan-2020"). Both years share one loop body, and each
# map gets its own name so the earlier `n_cases` map is not overwritten.
confirmed_addition_ratio_maps = dict()
for year, monthly_totals in (("2020", cases_2020_monthly), ("2021", cases_2021_monthly)):
    is_year = raw_data["de_corona"]["year"] == year
    for month in raw_data["de_corona"].loc[is_year, "month"].unique():
        month_map = folium.Map(location = [51.5, 10.7], zoom_start = 6)

        folium.Choropleth(
            geo_data = shape,
            name = "cases",
            data = monthly_totals.loc[month].reset_index(),
            columns = ["region", "confirmed_addition_ratio"],
            key_on = "properties.iso_3166_2",
            fill_color = "OrRd",
            fill_opacity = 0.7,
            line_opacity = 0.2,
            highlight = True,
            nan_fill_color = "purple",  # regions without a value for this month
            legend_name = "Number of Cases of Covid-19 pr. population of region in {} {}".format(month, year)).add_to(month_map)
        confirmed_addition_ratio_maps[month + "-" + year] = month_map

In [None]:
# One choropleth per month of the confirmed cases, keyed "<Mon>-<year>"
# (e.g. "Jan-2020"). Both years share one loop body, and each map gets its
# own name so the earlier `n_cases` map is not overwritten.
confirmed_addition_maps = dict()
for year, monthly_totals in (("2020", cases_2020_monthly), ("2021", cases_2021_monthly)):
    is_year = raw_data["de_corona"]["year"] == year
    for month in raw_data["de_corona"].loc[is_year, "month"].unique():
        month_map = folium.Map(location = [51.5, 10.7], zoom_start = 6)

        folium.Choropleth(
            geo_data = shape,
            name = "cases",
            data = monthly_totals.loc[month].reset_index(),
            columns = ["region", "confirmed_addition"],
            key_on = "properties.iso_3166_2",
            fill_color = "OrRd",
            fill_opacity = 0.7,
            line_opacity = 0.2,
            highlight = True,
            nan_fill_color = "purple",  # regions without a value for this month
            legend_name = "Number of Cases of Covid-19 in {} {}".format(month, year)).add_to(month_map)
        confirmed_addition_maps[month + "-" + year] = month_map

In [None]:
# One choropleth per month of the deaths, keyed "<Mon>-<year>"
# (e.g. "Jan-2020"). Both years share one loop body, and each map gets its
# own name so the earlier `n_cases` map is not overwritten.
deceased_addition_maps = dict()
for year, monthly_totals in (("2020", cases_2020_monthly), ("2021", cases_2021_monthly)):
    is_year = raw_data["de_corona"]["year"] == year
    for month in raw_data["de_corona"].loc[is_year, "month"].unique():
        month_map = folium.Map(location = [51.5, 10.7], zoom_start = 6)

        folium.Choropleth(
            geo_data = shape,
            # Layer name describes what is shown (it was copied over as "cases").
            name = "deceased",
            data = monthly_totals.loc[month].reset_index(),
            columns = ["region", "deceased_addition"],
            key_on = "properties.iso_3166_2",
            fill_color = "Greys",
            fill_opacity = 0.7,
            line_opacity = 0.2,
            highlight = True,
            nan_fill_color = "purple",  # regions without a value for this month
            legend_name = "Number of deceased with a Covid-19 infection in {} {}".format(month, year)).add_to(month_map)
        deceased_addition_maps[month + "-" + year] = month_map

All the monthly maps can be accessed like this:
```python
deceased_addition_maps["Jan-2020"]
confirmed_addition_maps["Jan-2020"]
confirmed_addition_ratio_maps["Jan-2020"]
```
Regions with missing data, or with no recorded data for that month, are coloured purple.
The month and year in the key can be changed.

In [None]:
# Display the confirmed-cases map stored under the key "Jan-2020".
confirmed_addition_maps["Jan-2020"]

In [None]:
# Display the deaths map stored under the key "Jan-2020".
deceased_addition_maps["Jan-2020"]

In [None]:
# Display the cases-per-population map stored under the key "Jan-2020".
confirmed_addition_ratio_maps["Jan-2020"]

In [None]:
# Daily confirmed cases as a bar chart. The table can hold several rows per
# date (one per region); drawing them directly stacks the bars on top of each
# other at the same x position, so the values are summed per date first.
daily_confirmed_total = raw_data["de_corona"].groupby("date")["confirmed_addition"].sum()

fig = plt.figure(figsize=(15,5))
axes = fig.add_axes([0,0,1,1])
axes.bar(daily_confirmed_total.index, daily_confirmed_total.values)
axes.set_title("Daily confirmed Covid-19 cases")
axes.set_xlabel("Date")
axes.set_ylabel("Confirmed cases");
#axes.set_yscale('log');

# Kata's stuff

In [None]:
# drop=True discards the old index instead of inserting it as a new 'index'
# column, so this cell can be re-run without adding columns or failing.
raw_data['de_corona'].reset_index(drop=True, inplace=True)
# Abbreviated month name of every row, used as the grouping variable below.
raw_data['de_corona']['month'] = raw_data['de_corona']['date'].dt.strftime('%b')
# Preview only the first rows rather than printing the whole column.
raw_data['de_corona']['month'].head()

In [None]:
# Distribution of the daily confirmed cases for each month label.
fig, ax = plt.subplots()
fig.set_size_inches((12,4))
boxplot_options = dict(x='month', y='confirmed_addition', data=raw_data['de_corona'], ax=ax)
sns.boxplot(**boxplot_options)
plt.show()