# We're now making the full POI Version of lacounty_covid.json


## Connect to Drive

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab helper module).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import glob
import re
import os
import pandas as pd
import json

from datetime import date, datetime, timedelta

# Raise pandas' display limits to 100 columns and 500 rows.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)

In [None]:
# Work inside the case-file folder; get_file_list() globs relative to the
# current working directory.
os.chdir('/content/drive/My Drive/safegraph_data/LA_Covid_Cases_Files')

## Functions

In [None]:
def clean_education_file(df):
  """
  Input: dataframe
  Output: dataframe
  Function adds a 'total_cases' column (confirmed staff + confirmed students)
  to the education dataframe in place, then returns a two-column dataframe
  holding only the location name and that total.
  """
  staff_cases = df['total_confirmed_staff']
  student_cases = df['total_confirmed_students']
  df['total_cases'] = staff_cases.add(student_cases)
  return df[['location_name', 'total_cases']]

In [None]:
def clean_residual_congregate_and_acute_file(df):
  """
  Input: dataframe
  Output: dataframe
  Function adds a 'total_cases' column (confirmed staff + confirmed residents)
  to the residual congregate and acute care settings dataframe in place, then
  returns a two-column dataframe holding only the location name and that total.
  """
  staff_cases = df['number_of_confirmed_staff']
  resident_cases = df['number_of_confirmed_residents']
  df['total_cases'] = staff_cases.add(resident_cases)
  return df[['location_name', 'total_cases']]

In [None]:
def clean_non_residential_settings_file(df):
  """
  Input: dataframe
  Output: dataframe
  Function adds a 'total_cases' column to the non residential settings
  dataframe in place and returns only the location name and that total.
  The total is confirmed staff plus confirmed non-staff when the non-staff
  column is present, and confirmed staff alone otherwise.
  """
  combined_cases = df['total_confirmed_staff']
  if 'total_confirmed_non_staff' in df.columns:
    combined_cases = combined_cases + df['total_confirmed_non_staff']
  df['total_cases'] = combined_cases
  return df[['location_name', 'total_cases']]

In [None]:
def get_file_list():
  """
  Input: Nothing
  Output: list
  Function returns the names of the csv files found in the current working
  directory.
  """
  return list(glob.iglob("*.csv"))

In [None]:
def get_dates(file_list):
  """
  Input: List
  Output: List
  Function returns the distinct date prefixes of the given file names, i.e.
  the text before the first underscore of each name. The order of the
  returned list is not defined.
  """
  unique_prefixes = {file_name.split('_', 1)[0] for file_name in file_list}
  return list(unique_prefixes)

In [None]:
#### Function created post usc-research meet up

def get_non_cumulative_poi(df):
  """
  Input: dataframe
  Output: list
  Function returns the names of the POIs whose 'total_cases' values, once the
  POI's rows are ordered by 'date', are not monotonically non-decreasing.
  Names are returned in order of first appearance in the dataframe.
  """
  dip_index_list = []

  for poi_name in df['location_name'].unique():
    # Names are matched through their string form, as the rest of the
    # pipeline does.
    poi_rows = df[df['location_name'] == f'{poi_name}'].sort_values('date')

    # Series.is_monotonic was removed in pandas 2.0; is_monotonic_increasing
    # is the same check and exists in older releases as well.
    if not poi_rows['total_cases'].is_monotonic_increasing:
      dip_index_list.append(poi_name)

  return dip_index_list

In [None]:
#### Function created post usc-research meet up

def make_monotonic(poi_df, max_value):
  """
  Input: dataframe, number
  Output: dataframe
  Function finds the first row whose 'total_cases' equals max_value and
  overwrites 'total_cases' with max_value on every row after it. The
  dataframe is modified in place and also returned. When no row equals
  max_value the dataframe is returned untouched.
  The row labels are expected to be ascending integers (the caller resets the
  index first), because the rows to overwrite are selected by label slice.
  """
  at_peak = poi_df['total_cases'] == max_value

  # One pass is enough: the overwrite is identical for every row that equals
  # max_value, so there is no need to repeat it per matching row.
  if at_peak.any():
    first_peak = int(poi_df.index[at_peak.to_numpy()][0])
    poi_df.loc[first_peak + 1:, ['total_cases']] = max_value

  return poi_df

In [None]:
#### Function created post usc-research meet up

def ensure_monotonicity(poi_vec):
  """
  Input: dataframe
  Output: dataframe
  Function returns a new dataframe with columns 'date', 'location_name' and
  'total_cases' in which every 'total_cases' value that is lower than the
  value on the row before it has been raised to that previous value. Rows are
  walked in their existing order, so the input should already be ordered by
  date. The input must carry a 0..n-1 index (the caller resets it) because the
  corrected values are joined back by index. The result is sorted by 'date'.
  An empty input yields an empty dataframe with the same three columns.
  """
  # Rebuild the counts as a fresh single-column frame so the input is left
  # untouched; building from a dict keeps this valid for zero rows too.
  values_df = pd.DataFrame({'total_cases': list(poi_vec['total_cases'].values)})

  for position in range(1, len(values_df)):
    previous_cases = values_df.at[position - 1, 'total_cases']
    if previous_cases > values_df.at[position, 'total_cases']:
      # A dip: carry the previous (already corrected) count forward.
      values_df.at[position, 'total_cases'] = previous_cases

  labels_df = poi_vec.loc[:, ['date', 'location_name']]
  output_df = pd.concat([labels_df, values_df], axis=1).sort_values('date', ascending=True)

  return output_df

In [None]:
#### Function created post usc-research meet up

def make_poi_cumulative(non_cumulative_poi_df, total_poi_df):
  """
  Input: list, dataframe
  Output: dataframe
  Function takes the names of the POIs whose case counts are not cumulative
  (non_cumulative_poi_df, despite its name, is indexed and iterated as a
  sequence of names) and the dataframe holding every POI. For each name it
  pulls that POI's rows ordered by date, flattens everything after the peak
  with make_monotonic, removes any remaining dips with ensure_monotonicity,
  and stacks the corrected rows. The result has columns 'date',
  'location_name', 'total_cases' and a fresh 0..n-1 index; it is empty (with
  those columns) when no names are given.
  """
  col_names = ['date', 'location_name', 'total_cases']
  corrected_frames = []

  for poi_name in non_cumulative_poi_df:
    poi_vec = total_poi_df[total_poi_df['location_name'] == f'{poi_name}'].sort_values('date')
    max_value = poi_vec['total_cases'].max()
    # Both helpers address rows by 0..n-1 labels.
    poi_vec = poi_vec.reset_index(drop=True)

    temp_df = make_monotonic(poi_df=poi_vec, max_value=max_value)
    corrected_frames.append(ensure_monotonicity(temp_df))

  if not corrected_frames:
    return pd.DataFrame(columns=col_names)

  # DataFrame.append was removed in pandas 2.0; a single concat also avoids
  # re-copying the accumulated rows on every iteration.
  return pd.concat(corrected_frames, ignore_index=True)

In [None]:
def expand_missing_POI_data(POI_name, df):
  """
  Input: string, dataframe
  Output: dataframe or 0
  Function accepts the point of interest and the dataframe that contains all
  the points of interest ('date' as 'mm-dd-YYYY' strings). It checks whether
  the POI's latest row is older than yesterday (the data lags one day). If so,
  the POI's latest row(s) are copied once for every missing day up to and
  including yesterday, and the copies are returned as a dataframe with
  columns 'date', 'location_name', 'total_cases' and a 0..n-1 index.
  Returns the integer 0 when there is nothing to fill: the POI already reaches
  yesterday (or later), or the POI has no rows at all.
  """
  sub_df = df[df['location_name'] == POI_name]
  if sub_df.empty:
    return 0

  # Compare real dates rather than 'mm-dd-YYYY' strings, whose text order is
  # wrong across a year boundary.
  parsed_dates = pd.to_datetime(sub_df['date'], format='%m-%d-%Y')
  latest_stamp = parsed_dates.max()
  last_poi_date = latest_stamp.date()

  target_date = date.today() - timedelta(days=1)  # recent data is 1 day behind...
  needed_days = (target_date - last_poi_date).days
  if needed_days <= 0:
    return 0

  last_row_info = sub_df.loc[parsed_dates == latest_stamp, ['location_name', 'total_cases']]

  filled_days = []
  for offset in range(1, needed_days + 1):
    next_day = (last_poi_date + timedelta(days=offset)).strftime('%m-%d-%Y')
    day_rows = last_row_info.copy()
    day_rows.insert(0, 'date', next_day)
    filled_days.append(day_rows)

  # DataFrame.append was removed in pandas 2.0; concat is the replacement.
  remaining_data_df = pd.concat(filled_days, ignore_index=True)
  return remaining_data_df

In [None]:
# Collect the csv files in the working directory and the distinct date
# prefixes found in their names.
files_list = get_file_list()
dates = get_dates(files_list)

In [None]:
# The date prefix of a file name is read as MMDD (see the 'date' column
# below), so every row is stamped with the year the notebook is run in.
year = datetime.now().strftime("%Y")

# Filename marker (regular expression) -> function that reduces that kind of
# file to 'location_name' / 'total_cases'. A file is checked against every
# marker, exactly like the three separate checks this table replaces.
section_cleaners = [
  ('.Non-Residential.', clean_non_residential_settings_file),
  ('.Educational.', clean_education_file),
  ('.Residual_Congregate.', clean_residual_congregate_and_acute_file),
]

day_df = pd.DataFrame(columns=['date', 'location_name', 'total_cases'])
# DataFrame.append was removed in pandas 2.0. Frames are gathered in lists and
# concatenated once; the empty seed frames stay first in each list so the
# result is what the chained appends produced.
day_frames = [day_df]

for day in dates:
  day_data = pd.DataFrame(columns=['location_name', 'total_cases'])
  section_frames = [day_data]
  for f in files_list:
    if f.split('_')[0:4][0] != day:
      continue
    for marker, cleaner in section_cleaners:
      if re.search(marker, f):
        f_data = pd.read_csv(f)
        section_frames.append(cleaner(f_data))
  day_data = pd.concat(section_frames)
  day_data['date'] = day[0]+day[1]+'-'+day[2]+day[3]+f'-{year}'
  ready_day_data = day_data.reindex(columns = ['date', 'location_name', 'total_cases'])
  day_frames.append(ready_day_data)

day_df = pd.concat(day_frames)

#### Per usc-research meet up: Ensure each POI is cumulative


In [None]:
# Obtain the non-monotonic POIs. Despite the _df suffix this is a plain list
# of location names.
non_cumulative_poi_df = get_non_cumulative_poi(day_df)

In [None]:
# Separate cumulative and non-cumulative POIs: keep the rows whose location is
# not in the non-monotonic list, ordered by location and then date.
cumulative_poi_df = day_df[~day_df.location_name.isin(non_cumulative_poi_df)].sort_values(['location_name', 'date'])

In [None]:
# Make the non-cumulative POIs monotonic; the corrected rows come back as one
# dataframe.
corrected_poi_df = make_poi_cumulative(non_cumulative_poi_df = non_cumulative_poi_df, total_poi_df = day_df)

In [None]:
# Produce the final dataframe: stack the already-cumulative rows on top of the
# corrected rows and order everything by location and then date.
final_poi_df = pd.concat([cumulative_poi_df, corrected_poi_df], axis=0).sort_values(['location_name', 'date'])

# Release the intermediates now that they have been merged.
del non_cumulative_poi_df, cumulative_poi_df, corrected_poi_df

#### Fix POIs with missing data

In [None]:
pois = list(final_poi_df['location_name'].unique())

# Names of the POIs that needed filler rows.
poi_names_missing_df = []
# For each such POI: its existing rows followed by its filler rows.
extended_frames = []

for poi in pois:
  poi = str(poi)
  result_df = expand_missing_POI_data(POI_name = poi, df=final_poi_df)
  # The helper returns the integer 0 when there is nothing to fill. Test the
  # type instead of `is not 0`, which relies on int identity and triggers a
  # SyntaxWarning on Python 3.8+.
  if isinstance(result_df, pd.DataFrame):
    # track which pois need updating
    poi_names_missing_df.append(poi)
    # extract the existing poi data
    poi_existing_df = final_poi_df[final_poi_df['location_name']==poi]
    # merge the extracted data with the filling data
    extended_frames.append(pd.concat([poi_existing_df, result_df]))

if extended_frames:
  # Rebuild the main frame as "POIs left alone" + "POIs that were extended".
  # Rows are selected by name, not dropped by index label: the labels are not
  # unique here, so a label-based drop would also remove other POIs' rows.
  untouched_df = final_poi_df[~final_poi_df['location_name'].isin(poi_names_missing_df)]
  # DataFrame.append was removed in pandas 2.0; concat is the replacement.
  final_poi_df = pd.concat([untouched_df] + extended_frames, ignore_index=True)

#### Monotonicity

In [None]:

final_poi_df = final_poi_df.reset_index(drop=True)

test_dict = {}
# Key given to the first exported day; each later day gets the next integer.
# NOTE(review): 16 presumably lines up with the day numbering used by
# lacounty_covid.json -- confirm.
count = 16

# Walk the dates in calendar order so that consecutive keys mean consecutive
# days. Iterating the bare set would hand the keys out in arbitrary order.
ordered_dates = sorted(set(final_poi_df['date']),
                       key=lambda day_text: datetime.strptime(day_text, '%m-%d-%Y'))

for i in ordered_dates:
  # One boolean filter per date instead of a row-by-row scan of the frame.
  rows_on_day = final_poi_df[final_poi_df['date'] == i]
  day_list = [[name, cases] for name, cases in
              zip(rows_on_day['location_name'].tolist(),
                  rows_on_day['total_cases'].tolist())]
  to_string = str(count)
  test_dict[to_string] = day_list
  count+=1

## Export data and save

In [None]:
# Move to the parent data folder; the JSON export is written relative to the
# current working directory.
os.chdir('/content/drive/My Drive/safegraph_data/')

In [None]:
# Write the per-day dictionary (day key -> list of [location_name, total_cases])
# to lacounty_covid_poi.json, overwriting any existing file.
with open("lacounty_covid_poi.json", "w") as write_file:
  json.dump(test_dict, write_file)

## We're now making a single-POI version of lacounty_covid.json for testing purposes


In [None]:
# single_poi_test = {}
# count = 16
# POI_sample = "Temple Park Convalescent Hospital"

# for i in final_poi_df['date'].unique():
#   df = final_poi_df[final_poi_df['date']==i]
#   df = df.reset_index(drop=True)
#   length = df[df['date']==i].shape[0]

#   for row in range(0, length):
#     item = df.loc[row, :].values
#     if (re.search(POI_sample, item[1])):
#       to_string = str(count)
#       single_poi_test[to_string] = [[item[1], str(item[2])]]
#       count+=1

In [None]:
# os.chdir('/content/drive/My Drive/safegraph_data/SINGLE_POI_SAMPLE')

# with open('single_poi.json', 'w') as outfile:
#   json.dump(single_poi_test, outfile)

In [None]:
# # final_poi_df.sort_values('total_cases').tail(500)
# # final_poi_df.groupby(['location_name']).sum()
# # final_poi_df.loc[7800:8000, :] # Holiday Manor Care Center
# len(final_poi_df)
# # poi_names_missing_df
# # final_poi_df[final_poi_df['location_name']=='Astoria Nursing and Rehab Center']