In [18]:
import numpy as np
from pathlib import Path
import pandas as pd
import kmapper
from sklearn import datasets
from datetime import date, timedelta, datetime
from sklearn.preprocessing import normalize
import networkx as nx
import IPython
import tkinter

In [19]:
data_folder = Path("csse_covid_19_data/csse_covid_19_daily_reports/")

# make days array based on inputs
pandemic_start = date(2020, 1, 22)

# don't do 3-21 or earlier
start_date = date(2020, 3, 22)
end_date = date(2020, 4, 19)

delete_location = True
delete_unassigned = True
normalize_data = True

delta = end_date - start_date

In [20]:
for day in range(delta.days + 1):
    # get the day
    date = start_date + timedelta(days=day)
    # convert to string/file name
    day_file = date.strftime("%m-%d-%Y") + ".csv"

    file_to_open = data_folder / day_file
    raw_data = pd.read_csv(file_to_open, header=0, delimiter=',', encoding=None, usecols=(
        "Admin2", "Province_State", "Country_Region", "Lat", "Long_", "Confirmed"))

    # calculating days since covid hit US
    days_since_start = date - pandemic_start

    # adding another column with days since covid hit the US
    raw_data['Days since start'] = days_since_start.days  # number of days
    raw_data['Date'] = date.strftime("%m-%d-%Y")
    raw_data = raw_data[raw_data['Lat'].notna()] #drop rows where Lat is empty
    raw_data = raw_data[raw_data['Long_'].notna()] #drop rows where Long_ is empty
    raw_data = raw_data.fillna(value = "N/A")
    #raw_data = raw_data.dropna()  # be careful b/c maybe too early
    np_raw_data = raw_data.to_numpy()

    rows_to_delete = []

    counter = 0
    for i in np_raw_data:
        if i[2] != "US":  # hardcoded index for checking state
            rows_to_delete.append(counter)
        else:
            if delete_unassigned:  # delete unassigned row if need be
                if i[0] == "Unassigned":  # hardcoded index for checking county
                    rows_to_delete.append(counter)
                elif "Out of" in str(i[0]):
                    rows_to_delete.append(counter)
        counter = counter + 1

    if day == 0:
        # 0 refers to deleting the rows
        final_array = np.delete(np_raw_data, rows_to_delete, 0)
    else:
        # 0 refers to deleting the rows
        selected_data = np.delete(np_raw_data, rows_to_delete, 0)
        final_array = np.vstack((final_array, selected_data))

In [21]:
if delete_location:
    #matrix: "Admin2", "Province_State", "Country_Region", "Lat", "Long_", "Confirmed", "days since start", "date"
    
    indices = np.empty(len(final_array), dtype=object)
    for i in range(len(final_array)):
        indices[i] = str(final_array[i, 0]) + ", " + str(final_array[i, 1]) + ", " + str(final_array[i, 7])

    print(indices)
    selected_data1 = np.delete(final_array, [0, 1, 2, 7], 1)  # deleting locations
    
    if normalize_data:            
        data = normalize(selected_data1, axis=0, norm='l2')
    else:
        data = selected_data1.copy()
else:
    data = final_array.copy()

['New York City, New York, 03-22-2020' 'Nassau, New York, 03-22-2020'
 'Westchester, New York, 03-22-2020' ... 'N/A, Puerto Rico, 04-19-2020'
 'N/A, Recovered, 04-19-2020' 'N/A, Virgin Islands, 04-19-2020']


ValueError: could not convert string to float: 'N/A'

In [22]:
km = kmapper.KeplerMapper()
lens = km.project(data)
graph = km.map(X=data, lens=lens, cover=kmapper.Cover(n_cubes=1000, perc_overlap=0.08))

In [11]:
# nx_graph = kmapper.adapter.to_nx(graph)


url = 'make_circles_keplermapper_output.html'


km.visualize(graph,
                 path_html=url,
                 title="COVID-19 Dataset", custom_tooltips = indices)

iframe = '<iframe src=' + url + ' width=1000 height=800></iframe>'
IPython.display.HTML(iframe)


# import matplotlib.pyplot as plt
# %matplotlib inline
# #matplotlib.use('TKAgg',warn=False, force=True)
# kmapper.draw_matplotlib(graph)
# plt.show()



In [12]:
# nx.draw(nx_graph)

In [13]:
print(data)

[[ 3.76013454e-03 -2.85329321e-03  2.01181608e-02  2.87918266e-03]
 [ 3.75768043e-03 -2.83855423e-03  3.95944744e-03  2.87918266e-03]
 [ 3.79661417e-03 -2.84503438e-03  3.90318161e-03  2.87918266e-03]
 ...
 [ 3.68968571e-03 -3.95085847e-03  6.25175912e-06  4.22280124e-03]
 [ 2.49046616e-03 -3.82526541e-03  1.25035182e-05  4.22280124e-03]
 [ 2.66244673e-03 -3.84804796e-03  2.08391971e-06  4.22280124e-03]]
