In [None]:
!pip install matplotlib
!pip install networkx
!pip install pandas

In [None]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Load the raw records: one JSON file with per-province figures and one with
# per-region figures.
# The encoding is passed explicitly so decoding does not depend on the platform's
# default text encoding; 'utf-8-sig' reads UTF-8 with or without a leading BOM.
# NOTE(review): assumes both files are UTF-8 encoded - confirm.
with open('./dati-json/dpc-covid19-ita-province.json', encoding='utf-8-sig') as f:
    covid19_provinces_data = json.load(f)

with open('./dati-json/dpc-covid19-ita-regioni.json', encoding='utf-8-sig') as f:
    covid19_regions_data = json.load(f)

In [None]:
# Build the per-province table straight from the list of JSON records, so every
# value is matched to its column by key rather than by its position in the record.
# Rows are then ordered by province code; mergesort is stable, so the rows of one
# province keep the order they have in the source file.
provinces_sorted = pd.DataFrame(covid19_provinces_data).sort_values('sigla_provincia', kind='mergesort')

# Index the rows by (province code, date), e.g. ('FI', '2020-02-24T...').
# The two levels are deliberately left unnamed: 'sigla_provincia' and 'data' also
# remain ordinary columns, and an index level sharing a column's name would make
# groupby('sigla_provincia') ambiguous.
provinces_sorted.index = pd.MultiIndex.from_arrays([
    provinces_sorted['sigla_provincia'].tolist(),
    provinces_sorted['data'].tolist(),
])

# Drop the columns that are not used in the analysis below
dati = provinces_sorted.drop(columns=['denominazione_provincia', 'stato', 'codice_provincia', 'note_it', 'note_en', 'codice_regione'])
dati

In [None]:
# Group by 'sigla_provincia', sum 'totale_casi' and plot the 15 provinces with the
# highest sum. Only 'totale_casi' is aggregated, so the result does not depend on
# how the installed pandas version treats string columns in sum().
# NOTE(review): if 'totale_casi' is a running total per day, summing it over days
# overstates the count and .max() may be what is wanted - confirm.
top_provinces_ax = (
    dati.groupby(['sigla_provincia'])[['totale_casi']]
    .sum()
    .nlargest(15, 'totale_casi')
    .plot(kind="bar")
)
# Set the title on the Axes returned by .plot() instead of relying on pyplot's
# "current axes"; the number in the title matches the nlargest(15) above.
top_provinces_ax.set_title("Top 15 provinces with most cases")

In [None]:
# Province codes ("sigle"), in the order they first appear in `dati`
provinces = dati.sigla_provincia.unique()

# For every province: 'totale_casi' summed per month.
# The month key is the first 7 characters of the 'data' timestamp:
#   2020-05-15T17:00:00 ---> 2020-05
cases_per_month = {}
for sigla in provinces:
    province_rows = dati.loc[sigla]
    month_key = province_rows.data.str[:7]
    cases_per_month[sigla] = province_rows.groupby(month_key)['totale_casi'].sum()

# One column per province and one row per month; the index name is blanked only to
# avoid an extra header line in the display. Transposing then gives one row per
# province and one column per month, which is easier to read.
cpm = pd.DataFrame(cases_per_month).rename_axis("").T
cpm

In [None]:
grid_size = (2, 2)  # one panel per month, for the 4 months considered
colors = ['g', 'r', 'b', 'y']  # one bar colour per panel
fig, axes = plt.subplots(nrows=grid_size[0], ncols=grid_size[1], sharey=True, figsize=(10, 10))
cities_with_worst_cases = []  # per plotted month: Series of the 5 provinces with the highest value

# Plot at most one month per panel. Pairing panels, months and colours with zip()
# (instead of indexing cpm.columns with a manual counter) means that fewer months
# than panels leaves panels unused instead of raising IndexError.
panels = axes.ravel()
months_shown = list(cpm.columns[:panels.size])
for ax, month, color in zip(panels, months_shown, colors):
    # The 5 provinces with the highest value in the current month
    worst_cases = cpm.nlargest(5, month)[month]
    worst_cases.plot(ax=ax, kind='bar', legend=False, color=color)
    ax.set_title('Number of cases in ' + month)
    cities_with_worst_cases.append(worst_cases)

# Hide any panel that did not receive a month
for unused_ax in panels[len(months_shown):]:
    unused_ax.set_visible(False)

fig.tight_layout()
fig.subplots_adjust(bottom=0.1)  # leave room for the shared legend under the panels
fig.legend(labels=months_shown, loc="lower center", ncol=4)

In [None]:
# Province codes that appear in at least one monthly top-5. They are sorted so the
# plotting order (and therefore the line colours and legend order) is the same on
# every run; iterating a plain set gives no such guarantee.
interested_city = sorted({code for top5 in cities_with_worst_cases for code in top5.index})

# One line per province: 'totale_casi' against the row offset from that province's
# first record.
fig, ax = plt.subplots()
for city in interested_city:
    case_per_day = dati.loc[city]['totale_casi']
    ax.plot(range(len(case_per_day)), case_per_day, label=city)

# Call the label setter; assigning a string to plt.ylabel would overwrite the
# pyplot function and leave the axis unlabelled.
ax.set_ylabel("number of cases")

# Vertical red markers for the start and end of the lockdown.
# NOTE(review): the x offsets (15, 75) and the y extents are hard-coded in the
# original analysis - confirm they match the dates present in the data.
ax.plot((15, 15), (0, 10000), 'r-')
ax.text(15, 11000, "Lockdown", color="r")
ax.plot((75, 75), (0, 22000), 'r-')
ax.text(57, 16000, "End Lockdown", color="r")
fig.tight_layout()
ax.legend()


In [None]:
# Preview only the first raw province records; displaying the whole list floods
# the notebook output.
covid19_provinces_data[:5]

In [None]:
# For tomorrow maybe

# One Series per region, keyed by the region name. A region that occurs in several
# records keeps only its last record, because later entries replace earlier ones
# under the same key.
dataframe_data = {
    region['denominazione_regione']: pd.Series(region)
    for region in covid19_regions_data
}
# Transpose so that every region is a row and every field a column
regions_data_frame = pd.DataFrame(dataframe_data).transpose()
# Display the table without the descriptive columns
# (regions_data_frame itself is left unchanged).
descriptive_columns = ['data', 'denominazione_regione', 'stato', 'note_it', 'note_en']
regions_data_frame.drop(columns=descriptive_columns)

In [None]:
# Preview only the first raw region records; displaying the whole list floods
# the notebook output.
covid19_regions_data[:5]

In [None]:
# CLAU: from here down
# Cleaned-up view of the province data
province_data = pd.DataFrame(covid19_provinces_data)

# Leave out the rows whose province name is the "being defined/updated" placeholder,
# and the columns that are not useful for the analysis below.
placeholder_name = "In fase di definizione/aggiornamento"
unneeded_columns = ["long", "lat", "note_it", "note_en", "codice_provincia", "codice_regione", "stato"]
refined_province_data = (
    province_data
    .loc[province_data.denominazione_provincia != placeholder_name]
    .drop(columns=unneeded_columns)
)
refined_province_data.head()


In [None]:
# Re-index the cleaned province table by its 'data' column and preview the first rows
refined_province_data_by_data = refined_province_data.set_index(keys="data")
refined_province_data_by_data.head(n=5)

In [None]:
# Re-index the cleaned province table by its 'denominazione_provincia' column and
# preview the first rows
refined_province_data_by_province = refined_province_data.set_index(keys="denominazione_provincia")
refined_province_data_by_province.head(n=5)

In [None]:
# Re-index the cleaned province table by the pair ('data', 'denominazione_provincia')
# and preview the first rows
date_and_province_keys = ["data", "denominazione_provincia"]
refined_province_data_by_data_and_province = refined_province_data.set_index(keys=date_and_province_keys)
refined_province_data_by_data_and_province.head(n=5)

In [None]:
# Group on 'data' to see how many cases there are day by day.
# (Note: grouping by region to show the number of cases at a given date would be
#  pointless, since that figure is already available in the regions file.)
# Only 'totale_casi' is summed: the remaining columns hold text, and sum() over
# text columns is either dropped or turned into string concatenation depending on
# the pandas version.
total_cases_by_date = (
    refined_province_data_by_data_and_province
    .groupby(['data'])[['totale_casi']]
    .sum()
)
total_cases_by_date.head()

In [None]:
# Last rows of the per-date totals
total_cases_by_date.tail(n=5)

In [None]:
# Bar chart of the per-date totals for the whole of Italy
ax = total_cases_by_date.plot(kind="bar", lw=2, title="Total cases in Italy by date")
ax.set_xlabel("date")

# Thin out the x axis: keep one tick (and its date label) every 5 dates, hide the rest.
# TODO: maybe format the dates so they are easier to read, e.g. cut the time part
# and keep only day, month and year
for position, tick in enumerate(ax.xaxis.get_major_ticks()):
    if position % 5:
        tick.set_visible(False)

# Let matplotlib rotate and align the date tick labels
plt.gcf().autofmt_xdate()


In [None]:
# Average of 'totale_casi' over the provinces, day by day (to be decided whether to
# keep it: the resulting chart has the same shape as the per-date totals).
# The column is selected explicitly because the frame also holds text columns, and
# recent pandas versions raise a TypeError when mean() is applied to them.
average_number_cases_by_date = (
    refined_province_data_by_data_and_province
    .groupby(['data'])[['totale_casi']]
    .mean()
)
average_number_cases_by_date.head()

In [None]:
# Group by 'sigla_provincia', sum 'totale_casi' and plot the 15 provinces with the
# highest sum. Only 'totale_casi' is aggregated, so the result does not depend on
# how the installed pandas version treats string columns in sum().
# NOTE(review): if 'totale_casi' is a running total per day, summing it over days
# overstates the count and .max() may be what is wanted - confirm.
top_refined_ax = (
    refined_province_data.groupby(['sigla_provincia'])[['totale_casi']]
    .sum()
    .nlargest(15, 'totale_casi')
    .plot(kind="bar")
)
# The number in the title matches the nlargest(15) above
top_refined_ax.set_title("Top 15 provinces with most cases")