# Training on Mobility Data
In this notebook we will train models on the mobility data.

In [0]:
import os
import pandas as pd
from google.colab import auth
from datetime import datetime
auth.authenticate_user()
!wandb login
!gcloud source repos clone github_aistream-peelout_flow-forecast --project=gmap-997
os.chdir('/content/github_aistream-peelout_flow-forecast')
!python setup.py develop
!pip install -r requirements.txt
!mkdir data
from flood_forecast.trainer import train_function
!pip install git+https://github.com/CoronaWhy/task-geo.git
!pip install git+https://github.com/coronawhy/task-ts
import wandb

/bin/bash: wandb: command not found
$ git clone https://github.com/AIStream-Peelout/flow-forecast
Cloning into '/content/github_aistream-peelout_flow-forecast'...
remote: Total 3943 (delta 2553), reused 3943 (delta 2553)[K
Receiving objects: 100% (3943/3943), 2.71 MiB | 15.67 MiB/s, done.
Resolving deltas: 100% (2553/2553), done.
Project [gmap-997] repository [github_aistream-peelout_flow-forecast] was cloned to [/content/github_aistream-peelout_flow-forecast].
running develop
running egg_info
creating flood_forecast.egg-info
writing flood_forecast.egg-info/PKG-INFO
writing dependency_links to flood_forecast.egg-info/dependency_links.txt
writing requirements to flood_forecast.egg-info/requires.txt
writing top-level names to flood_forecast.egg-info/top_level.txt
writing manifest file 'flood_forecast.egg-info/SOURCES.txt'
package init file 'flood_forecast/__init__.py' not found (or not a regular file)
package init file 'flood_forecast/transformer_xl/__init__.py' not found (or not a regu

Now that we have all the basic things setup lets define our config file. Since mobility data has a longer lag we will need to define our parameter sweeps to have a longer lookback window.

In [0]:
def make_config_file(file_path, df_len, weight_path=None):
  run = wandb.init(project="covid_forecast", entity="covid")
  wandb_config = wandb.config
  train_number = df_len * .7
  validation_number = df_len *.9
  config_default={                 
    "model_name": "CustomTransformerDecoder",
    "model_type": "PyTorch",
    "model_params": {
        "seq_length":wandb_config["forecast_history"],
        "n_time_series":9,
        "output_seq_length":wandb_config["out_seq_length"],
        "n_layers_encoder": wandb_config["number_encoder_layers"],
        "use_mask": wandb_config["use_mask"]
    },
    "dataset_params":
    {  "class": "default",
       "training_path": file_path,
       "validation_path": file_path,
       "test_path": file_path,
       "batch_size":wandb_config["batch_size"],
       "forecast_history":wandb_config["forecast_history"],
       "forecast_length":wandb_config["out_seq_length"],
       "train_end": int(train_number),
       "valid_start":int(train_number+1),
       "valid_end": int(validation_number),
       "target_col": ["new_cases"],
       "relevant_cols": ["new_cases", "month", "weekday", "mobility_retail_recreation",	"mobility_grocery_pharmacy",	"mobility_parks",	"mobility_transit_stations",	"mobility_workplaces",	"mobility_residential"],
       "scaler": "StandardScaler", 
       "interpolate": False
    },
    "training_params":
    {
       "criterion":"MSE",
       "optimizer": "Adam",
       "optim_params":
       {

       },
       "lr": wandb_config["lr"],
       "epochs": 10,
       "batch_size":wandb_config["batch_size"]
    
    },
    "GCS": True,
    "early_stopping":
    {
        "patience":3
    },
    "sweep":True,
    "wandb":False,
    "forward_params":{},
   "metrics":["MSE"],
   "inference_params":
   {     
         "datetime_start":"2020-04-21",
          "hours_to_forecast":10, 
          "test_csv_path":file_path,
          "decoder_params":{
              "decoder_function": "simple_decode", 
            "unsqueeze_dim": 1
          },
          "dataset_params":{
             "file_path": file_path,
             "forecast_history":wandb_config["forecast_history"],
             "forecast_length":wandb_config["out_seq_length"],
             "relevant_cols": ["new_cases", "month", "weekday", "mobility_retail_recreation",	"mobility_grocery_pharmacy",	"mobility_parks",	"mobility_transit_stations",	"mobility_workplaces",	"mobility_residential"],
             "target_col": ["new_cases"],
             "scaling": "StandardScaler",
             "interpolate_param": False
          }
      }, 
      "weight_path_add":{
      "excluded_layers":["out_length_lay.weight", "out_length_lay.bias", "dense_shape.weight", "dense_shape.bias"]
      }
  }
  if weight_path: 
    config_default["weight_path"] = weight_path
  wandb.config.update(config_default)
  return config_default

wandb_sweep_config_full = {
  "name": "Default sweep",
  "method": "grid",
  "parameters": {
        "batch_size": {
            "values": [2, 5, 10, 20]
        },
        "lr":{
            "values":[0.001, 0.002, 0.0001, .01]
        },
        "forecast_history":{
            "values":[10, 11, 15]
        },
        "out_seq_length":{
            "values":[1, 2, 3, 4, 5]
        },
        "number_encoder_layers":
        {
            "values":[1, 2, 3]
        },
        "use_mask":{
            "values":[True, False]
        }
    }
}

In [0]:
os.environ['MODEL_BUCKET'] = "coronaviruspublicdata"
os.environ["ENVIRONMENT_GCP"] = "Colab"
os.environ["GCP_PROJECT"] = "gmap-997"
!gsutil cp gs://predict_cfs/experiments/25_May_202010_29PM_model.pth .
!gsutil cp gs://coronaviruspublicdata/experiments/26_May_202012_59AM_model.pth .		

Copying gs://predict_cfs/experiments/25_May_202010_29PM_model.pth...
/ [1 files][  4.7 MiB/  4.7 MiB]                                                
Operation completed over 1 objects/4.7 MiB.                                      
Copying gs://coronaviruspublicdata/experiments/26_May_202012_59AM_model.pth...
/ [1 files][  2.7 MiB/  2.7 MiB]                                                
Operation completed over 1 objects/2.7 MiB.                                      


## Loading the Data


In [0]:
import glob
from corona_ts.data_utils.data_crawler import load_data
from corona_ts.data_utils.data_creator import loop_through_locations
from corona_ts.data_utils.data_creator import region_df_format
!mkdir dir /usr/local/lib/python3.6/dist-packages/data
df = load_data()
df['full_county'] = df['region'] + "_" + df['sub_region']
important_cities_list = ["United_States__California__Los_Angeles_County", "United_States__Illinois__Cook_County", "United_States__Arizona__Maricopa_County", "United_States__Massachusetts__Middlesex_County", "United_States__Texas__Dallas_County", "United_States__Texas__Harris_County", "United_States__Florida__Miami Dade_County", "United_States__California__Riverside_County", "United_States__Colorado__Denver_County", "United_States__Ohio__Cuyahoga_County", "United_States__New York__Queens_County", "United_States__New York__Bronx_County"]

mkdir: cannot create directory ‘dir’: File exists
mkdir: cannot create directory ‘/usr/local/lib/python3.6/dist-packages/data’: File exists


  exec(code_obj, self.user_global_ns, self.user_ns)
  mobility_df = fetch_mobility_data()
2020-05-28 17:04:03.821 | INFO     | corona_ts.data_utils.data_crawler:_treat_mobility_missing_values:121 - Treat mobility missing values.


In [0]:
#df_list = loop_through_locations(df)
df['country'] = df['country'].str.replace(" ","_")
df['sub_region'] = df['country'] +"__"+df['region'].str.replace(" ", "_") + "__"+df['sub_region'].str.replace(" ", "_")

In [0]:
def loop_special_counties(special_counties_list):
  for county in special_counties_list:
    region = region_df_format(df,county)
    file_path, len_df, file_path2 = format_corona_data(region, county)
    sweep_id = wandb.sweep(wandb_sweep_config_full,'covid', 'covid_forecast')
    wandb.agent(sweep_id, lambda: train_function("PyTorch", make_config_file(file_path2, len(region), weight_path="25_May_202010_29PM_model.pth")))
loop_special_counties(important_cities_list)

In [0]:
df.tail()

Unnamed: 0,level,country,region,sub_region,date,lat,long,cases,deaths,recovered,active,tested,hospitalized,discharged,mobility_retail_recreation,mobility_grocery_pharmacy,mobility_parks,mobility_transit_stations,mobility_workplaces,mobility_residential,full_county
164400,sub_region,United States,North Dakota,United States__North Dakota__Grand Forks County,2020-03-29,47.9335,-97.3975,0,0,0,0,400,0,0,-46.0,-20.0,-65.0,-38.0,-28.0,-2.0,North Dakota_Grand Forks County
164401,sub_region,United States,Georgia,United States__Georgia__Wilcox County,2020-04-13,31.989,-83.3945,24,0,0,0,0,0,0,0.0,-6.0,0.0,0.0,-30.0,0.0,Georgia_Wilcox County
164402,sub_region,United States,Colorado,United States__Colorado__Routt County,2020-03-23,40.461,-107.0345,3,0,0,0,0,0,0,-71.0,-34.0,-44.0,-65.0,-51.0,-2.0,Colorado_Routt County
164403,sub_region,United States,California,United States__California__Mono County,2020-04-08,38.088,-118.741,20,1,0,0,103,0,0,-100.0,-68.0,-63.0,-69.0,-57.0,0.0,California_Mono County
164404,sub_region,United States,Florida,United States__Florida__St. Johns County,2020-03-17,29.9375,-81.4515,2,0,0,0,0,0,0,-1.0,25.0,22.0,9.0,-30.0,9.0,Florida_St. Johns County


In [0]:
df[df['sub_region']==['United_States__California__Los_Angeles_County']]

ValueError: ignored

### Pre-training on all counties
In order to make all the weights transferable we will need to redfine our Wandb sweep.

In [0]:
wandb_sweep_config_transfer = {
  "name": "Default sweep",
  "method": "grid",
  "parameters": {
        "batch_size": {
            "values": [2, 3, 4, 5]
        },
        "lr":{
            "values":[0.001, 0.002, 0.004, 0.01]
        },
        "forecast_history":{
            "values":[10, 11, 12]
        },
        "out_seq_length":{
            "values":[1, 2, 3, 4]
        }
    }
}

def get_most_recent_file(file_path):
  list_of_files = glob.glob(file_path+"/*.pth") # * means all if need specific format then *.csv
  if len(list_of_files) > 1:
    latest_file = max(list_of_files, key=os.path.getctime)
    return latest_file
  return None

def format_corona_data(region_df:pd.DataFrame, region_name:str):
  """
  Format data for a specific region into 
  a format that can be used with flow forecast. 
  """
  if region_name == 'county':
    region_name = region_df['full_county'].iloc[0]
  elif region_name=='state':
    region_name = region_df['state'].iloc[0]
  #else:
    #region_name = region_df['country'].iloc[0]
  region_df['month'] = pd.to_datetime(region_df['date']).map(lambda x: x.month)
  d = pd.to_datetime(region_df['date'])
  region_df['weekday'] = d.map(lambda x: x.weekday())
  region_df['datetime'] = region_df.date
  region_df.index = region_df.date
  region_df = region_df.sort_index()
  region_df = region_df.fillna(0)
  region_df['new_cases'] = region_df['cases'].diff()
  region_df.iloc[0]['new_cases'] = 0
  region_df= region_df.fillna(0)
  region_df.to_csv(region_name+".csv")
  print(region_df.head(9))
  return region_df, len(region_df), region_name+".csv"


def run_full_geo_code(df_list, start_index, end_index, use_transfer=True):
  for i in range(start_index, end_index):
    file_path, len_df, file_path_name = format_corona_data(df_list[i], 'county')
    latest_file = get_most_recent_file("model_save")
    sweep_full = wandb.sweep(wandb_sweep_config_transfer, project="covid_forecast", entity='covid')
    if use_transfer and len(os.listdir("model_save"))>1:
      print("using transfer")
      wandb.agent(sweep_full, lambda:train_function("PyTorch", make_config_file(file_path_name, len_df, weight_path=latest_file)))
    else:
      wandb.agent(sweep_full, lambda:train_function("PyTorch", make_config_file(file_path_name, len_df)))

The purpose of this experiment is mainly to tune the MultiHead weights

## Using fine-tuned weights