In [46]:
import numpy as np
from pathlib import Path
import pandas as pd
import kmapper
from sklearn import datasets
from datetime import date, timedelta, datetime
from sklearn.preprocessing import normalize
import networkx as nx
import IPython
import tkinter

In [47]:
data_folder = Path("csse_covid_19_data/csse_covid_19_daily_reports/")

# make days array based on inputs
pandemic_start = date(2020, 1, 22)

# don't do 3-21 or earlier
start_date = date(2020, 3, 22)
end_date = date(2020, 4, 19)

delete_location = True
delete_unassigned = True
normalize_data = True
sort_by_location = False

delta = end_date - start_date

In [48]:
for day in range(delta.days + 1):
    # get the day
    date = start_date + timedelta(days=day)
    # convert to string/file name
    day_file = date.strftime("%m-%d-%Y") + ".csv"

    file_to_open = data_folder / day_file
    raw_data = pd.read_csv(file_to_open, header=0, delimiter=',', encoding=None, usecols=(
        "Admin2", "Province_State", "Country_Region", "Lat", "Long_", "Confirmed"))

    # calculating days since covid hit US
    days_since_start = date - pandemic_start

    # adding another column with days since covid hit the US
    raw_data['Days since start'] = days_since_start.days  # number of days
    raw_data['Date'] = date.strftime("%m-%d-%Y")
    raw_data = raw_data[raw_data['Lat'].notna()] #drop rows where Lat is empty
    raw_data = raw_data[raw_data['Long_'].notna()] #drop rows where Long_ is empty
    raw_data = raw_data.fillna(value = "N/A")
    
    if day == 0:
        final_array = raw_data.copy(deep=False)  # make shallow copy
    else:
        final_array = final_array.append(raw_data)
        #print(final_array.iloc[-1])

In [49]:
final_array.drop(
    final_array[final_array['Country_Region'] != "US"].index, inplace=True)

if delete_unassigned:
    final_array.drop(
        final_array[final_array['Admin2'] == "Unassigned"].index, inplace=True)
    final_array.drop(
        final_array[final_array.Admin2.str.contains("Out of")].index, inplace=True)

if sort_by_location:
    final_array = final_array.sort_values(['Province_State', 'Admin2'])

if delete_location:
    #matrix: "Admin2", "Province_State", "Country_Region", "Lat", "Long_", "Confirmed", "days since start", "date"
    temp = final_array.to_numpy()
    indices = np.empty(len(temp), dtype=object)
    for i in range(len(temp)):
        indices[i] = str(temp[i, 0]) + ", " + str(temp[i, 1]) + ", " + str(temp[i, 7])

    print(indices)
    final_array = final_array.drop(
        columns=["Admin2", "Province_State", "Country_Region", "Date"])
    final_array = final_array.to_numpy()
    
    if normalize_data: #normalize the columns (axis = 0)            
        data = normalize(final_array, axis=0, norm='l2')
    else:
        data = final_array.copy()
else:
    data = final_array.copy()

['New York City, New York, 03-22-2020' 'Nassau, New York, 03-22-2020'
 'Westchester, New York, 03-22-2020' ... 'Salt Lake, Utah, 04-19-2020'
 'San Jacinto, Texas, 04-19-2020' 'San Juan, Utah, 04-19-2020']


In [50]:
km = kmapper.KeplerMapper()
lens = km.project(data)
graph = km.map(X=data, lens=lens, cover=kmapper.Cover(n_cubes=100, perc_overlap=0.08))

In [51]:
# nx_graph = kmapper.adapter.to_nx(graph)


url = 'make_circles_keplermapper_output.html'


km.visualize(graph,
                 path_html=url,
                 title="COVID-19 Dataset", custom_tooltips = indices)

iframe = '<iframe src=' + url + ' width=1000 height=800></iframe>'
IPython.display.HTML(iframe)


# import matplotlib.pyplot as plt
# %matplotlib inline
# #matplotlib.use('TKAgg',warn=False, force=True)
# kmapper.draw_matplotlib(graph)
# plt.show()



In [52]:
# nx.draw(nx_graph)

In [53]:
print(data)

[[ 4.28344558e-03 -3.25298871e-03  2.10763968e-02  3.26363909e-03]
 [ 4.28064992e-03 -3.23618506e-03  4.14803749e-03  3.26363909e-03]
 [ 4.32500221e-03 -3.24357297e-03  4.08909170e-03  3.26363909e-03]
 ...
 [ 4.27282217e-03 -4.92188985e-03  3.52583187e-03  4.78667067e-03]
 [ 3.21299211e-03 -4.18489233e-03  2.18317763e-05  4.78667067e-03]
 [ 3.95356113e-03 -4.82873810e-03  4.80299078e-05  4.78667067e-03]]
