# Group 10 - First Year Project
### Data Science at ITU 
## Project 2 - Correlation between Covid-19's spread and weather conditions

# Description

This notebook contains all the code developed for Project 2 - Correlation between Covid-19's spread and weather conditions.

Contact/Group: 
- Florian Micliuc (flmi@itu.dk)
- Louis Caspar Brandt (locb@itu.dk)
- Iben Mai Huse (ibhu@itu.dk)
- Katalin Literati-Dobos (klit@itu.dk)
- Ruben Jonsman (rubj@itu.dk)

#  Library imports

In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import math
from collections import Counter
from scipy.stats import chi2_contingency
import seaborn as sns
import folium
from folium import plugins 
from folium.plugins import HeatMap, MarkerCluster
import shapely
import json 
from shapely.geometry import Point, MultiPoint, LineString, MultiLineString, Polygon, MultiPolygon
import branca
import branca.colormap as cm

# Functions

In [None]:
def check_null_values(file):
    """Print whether the frame stored under ``raw_data[file]`` contains any null value.

    ``file`` is a key of the notebook-level ``raw_data`` dict. Nothing is returned;
    the result is reported with a single printed sentence.
    """
    has_nulls = raw_data[file].isnull().values.any()
    message = (
        'There are null values in the dataset'
        if has_nulls
        else 'There are no null values in the dataset'
    )
    print(message)

In [None]:
def dataset_checker_values(dataset,value):
    """Plot a missingness matrix of ``dataset`` with ``value`` treated as missing.

    Every occurrence of ``value`` is replaced by NaN in a new frame, which is then
    passed to ``msno.matrix``. The caller's ``dataset`` is not modified and nothing
    is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.
    value : scalar
        Placeholder value (for example -999 or -1) to treat as missing.
    """
    # DataFrame.replace returns a new frame by default, so no explicit copy is needed.
    masked = dataset.replace(value, np.nan)
    msno.matrix(masked)

# Task 0 - Data cleaning and filtering

## Loading data
#Description

In [None]:
# Base directories of the project data layout (relative paths).
PATH = {
    "data_raw": "../data/raw/",
    "data_interim": "../data/interim/",
    "data_processed": "../data/processed/",
    "data_external": "../data/external/",
}

# Sub-folder of each data source inside a base directory.
SUBPATH = {
    "corona": "corona/",
    "meta": "metadata/",
    "shape": "shapefiles/",
    "weather": "weather/",
}

# File name of each data source.
FILENAME = {
    "corona": "de_corona.csv",
    "meta": "de_metadata.json",
    "shape": "de.geojson",
    "weather": "weather.csv",
}

# Full path of every raw input file: <raw dir> + <sub-folder> + <file name>.
corona, meta, shape, weather = (
    PATH["data_raw"] + SUBPATH[source] + FILENAME[source]
    for source in ("corona", "meta", "shape", "weather")
)

In [None]:
# Container for every loaded frame, keyed by dataset name.
raw_data = {}

from datetime import datetime


def dateparse(x):
    """Parse a 'YYYY-MM-DD' string into a ``datetime``."""
    return datetime.strptime(x, '%Y-%m-%d')


# The separator is a regular expression (any run of whitespace). It is written as a
# raw string because '\s' in a normal string literal is an invalid escape sequence
# that newer Python versions warn about.
raw_data['de_corona'] = pd.read_csv(corona, sep=r'\s+', parse_dates=['date'], date_parser=dateparse)
raw_data['weather'] = pd.read_csv(weather, sep=r'\s+', parse_dates=['date'], date_parser=dateparse)

In [None]:
# Keys of the frames in raw_data whose shape is reported below.
HEADERS = ['de_corona','weather']
for frame_key in HEADERS:
    # One line per frame: its key followed by the frame's .shape tuple.
    print(f"{frame_key}  \tDataFrame shape: {raw_data[frame_key].shape}")

 ### Data type insight
 #Description

# Filtering the weather dataset

In [None]:
# Keep only the weather rows whose 'iso3166-2' code starts with "DE".
# na=False makes rows with a missing code count as non-matching; without it the
# mask contains NaN and boolean indexing raises instead of filtering.
is_de_region = raw_data["weather"]["iso3166-2"].str.startswith("DE", na=False)
weather_data_de = raw_data["weather"][is_de_region]

# Relational data table
#Description

# Sanity check
#Description

### Check for null values

In [None]:
# Report nulls for each loaded frame. Note that 'weather' is checked on the whole
# raw dataset here, not on the DE-filtered subset (this is expected to change).
for dataset_key in ('de_corona', 'weather'):
    check_null_values(dataset_key)

### Checking for and visualizing suspicious placeholder values (-999 and -1)

In [None]:
# Treat -999 as missing in the DE-filtered weather data and plot the missingness matrix.
dataset_checker_values(weather_data_de,-999)

In [None]:
# Treat -1 as missing in the DE-filtered weather data and plot the missingness matrix.
dataset_checker_values(weather_data_de,-1)

In [None]:
# Treat -999 as missing in the corona data and plot the missingness matrix.
dataset_checker_values(raw_data['de_corona'],-999)

In [None]:
# Treat -1 as missing in the corona data and plot the missingness matrix.
dataset_checker_values(raw_data['de_corona'],-1)

# Checking for duplicates (TODO: decide whether this is needed)

# Confirmed-case additions by month (Kata's section)

In [None]:
# Move the current index into an 'index' column only once. An unguarded
# reset_index inserts another column ('level_0') on a second run of this cell and
# raises on the third, so the cell would not be safe to re-run.
corona_df = raw_data['de_corona']
if 'index' not in corona_df.columns:
    corona_df = corona_df.reset_index()

# Add the abbreviated month name of each row's date ('%b') as a 'month' column.
# assign() returns a new frame, so re-running simply overwrites the column.
raw_data['de_corona'] = corona_df.assign(month=corona_df['date'].dt.strftime('%b'))

# Show only the first few values as a quick check of the new column.
raw_data['de_corona']['month'].head()

In [None]:
# Box plot of 'confirmed_addition' per 'month' on a 12x4 inch figure.
fig, ax = plt.subplots(figsize=(12, 4))
sns.boxplot(data=raw_data['de_corona'], x='month', y='confirmed_addition', ax=ax)
plt.show()