DATA SOURCES:
    
https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/
    
https://www.kaggle.com/muonneutrino/us-census-demographic-data?select=acs2015_county_data.csv
    
https://www.census.gov/library/publications/2011/compendia/usa-counties-2011.html#LND
    
https://www.census.gov/library/publications/2011/compendia/usa-counties-2011/basic-info-file-formats.html#part09
    
https://www.census.gov/population/www/censusdata/density.html


In [1]:
#embeddings work better without population density
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder, RobustScaler, MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import datetime
from collections import OrderedDict
import difflib 
import umap

def get_count_on_nth_day(df_series,nth_day):
    """
    Returns number of new cases (cumulative) on nth day
    
    INPUTS:
    df_series -> series to apply the fn to
    nth_day -> nth day since first occurance 
    """
    cumulative_count = df_series.cumsum()
    if cumulative_count.sum()>0:
        first_case_idx = cumulative_count[cumulative_count>0].index[0]
        nth_day_idx = first_case_idx+nth_day
        if nth_day_idx<=cumulative_count.index[-1]:
            cases_on_nth_day = cumulative_count.iloc[cumulative_count.index==nth_day_idx].values[0]
        else:
            cases_on_nth_day = cumulative_count.iloc[-1]
    else:
        cases_on_nth_day = 0
        
    return cases_on_nth_day

"""
based on :

https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/
"""

class TabularDataset(Dataset):
  def __init__(self, data, cat_cols=None, output_col=None):
    """
    Characterizes a Dataset for PyTorch

    Parameters
    ----------

    data: pandas data frame
      The data frame object for the input data. It must
      contain all the continuous, categorical and the
      output columns to be used.

    cat_cols: List of strings
      The names of the categorical columns in the data.
      These columns will be passed through the embedding
      layers in the model. These columns must be
      label encoded beforehand. 

    output_col: string
      The name of the output variable column in the data
      provided.
    """

    self.n = data.shape[0]

    if output_col:
      self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
    else:
      self.y =  np.zeros((self.n, 1))

    self.cat_cols = cat_cols if cat_cols else []
    self.cont_cols = [col for col in data.columns
                      if col not in self.cat_cols + [output_col]]

    if self.cont_cols:
      self.cont_X = data[self.cont_cols].astype(np.float32).values
    else:
      self.cont_X = np.zeros((self.n, 1))

    if self.cat_cols:
      self.cat_X = data[cat_cols].astype(np.int64).values
    else:
      self.cat_X =  np.zeros((self.n, 1))

  def __len__(self):
    """
    Denotes the total number of samples.
    """
    return self.n

  def __getitem__(self, idx):
    """
    Generates one sample of data.
    """
    return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]


class FeedForwardNN(nn.Module):

  def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
               output_size, emb_dropout, lin_layer_dropouts):

    """
    Parameters
    ----------

    emb_dims: List of two element tuples
      This list will contain a two element tuple for each
      categorical feature. The first element of a tuple will
      denote the number of unique values of the categorical
      feature. The second element will denote the embedding
      dimension to be used for that feature.

    no_of_cont: Integer
      The number of continuous features in the data.

    lin_layer_sizes: List of integers.
      The size of each linear layer. The length will be equal
      to the total number
      of linear layers in the network.

    output_size: Integer
      The size of the final output.

    emb_dropout: Float
      The dropout to be used after the embedding layers.

    lin_layer_dropouts: List of floats
      The dropouts to be used after each linear layer.
    """

    super().__init__()

    # Embedding layers
    self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                     for x, y in emb_dims])

    no_of_embs = sum([y for x, y in emb_dims])
    self.no_of_embs = no_of_embs
    self.no_of_cont = no_of_cont

    # Linear Layers
    first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
                                lin_layer_sizes[0])

    self.lin_layers =\
     nn.ModuleList([first_lin_layer] +\
          [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
           for i in range(len(lin_layer_sizes) - 1)])
    
    for lin_layer in self.lin_layers:
      nn.init.kaiming_normal_(lin_layer.weight.data)

    # Output Layer
    self.output_layer = nn.Linear(lin_layer_sizes[-1],
                                  output_size)
    nn.init.kaiming_normal_(self.output_layer.weight.data)

    # Batch Norm Layers
    self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
    self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                    for size in lin_layer_sizes])

    # Dropout Layers
    self.emb_dropout_layer = nn.Dropout(emb_dropout)
    self.droput_layers = nn.ModuleList([nn.Dropout(size)
                                  for size in lin_layer_dropouts])

  def forward(self, cont_data, cat_data):

    if self.no_of_embs != 0:
      x = [emb_layer(cat_data[:, i])
           for i,emb_layer in enumerate(self.emb_layers)]
      x = torch.cat(x, 1)
      x = self.emb_dropout_layer(x)

    if self.no_of_cont != 0:
      normalized_cont_data = self.first_bn_layer(cont_data)

      if self.no_of_embs != 0:
        x = torch.cat([x, normalized_cont_data], 1) 
      else:
        x = normalized_cont_data

    for lin_layer, dropout_layer, bn_layer in\
        zip(self.lin_layers, self.droput_layers, self.bn_layers):
      
      x = F.relu(lin_layer(x))
      x = bn_layer(x)
      x = dropout_layer(x)

    x = self.output_layer(x)

    return x

def get_county_from_df(county, state, df):
    return df.query("County=='" + county + "' and State=='" +state+"'")

def get_most_similar_counties(target_embed, test_df, target_name):
    test_df = test_df.dropna()
    test_df[target_name] = test_df['embeddings'].map(lambda x: cosine_similarity(x.reshape(1,-1), target_embed.reshape(1,-1)))
    return test_df.sort_values(by=target_name)

data_dir=r'C:\Users\Cafral\Desktop\kaggle\team_ts_forecast\embeddings\data'

# Loading and creating the dataset

In [2]:
# census data
census_data = pd.read_csv(data_dir+r'\county_data\datasets_7001_312628_acs2015_county_data.csv')

#land area
land_area = pd.read_excel(data_dir+r'\county_data\LND01.xls',
                          usecols=['STCOU','LND010190D'],)
land_area.columns = ['CensusId','area']

#merging the data
census_data = pd.merge(census_data,land_area,on=['CensusId'])

#compute population density
census_data['pop_density'] = census_data['TotalPop']/census_data['area']

census_data['State County'] = census_data['State'] +" "+census_data['County']

#task ts covid data
task_ts_covid_data = pd.read_csv(data_dir+r'\ts_covid_data\2020-06-26_corona_data.csv')

#keeping only data for the us
task_ts_covid_data = task_ts_covid_data[task_ts_covid_data['country']=='United States']

#keeping only sub region data
task_ts_covid_data = task_ts_covid_data[task_ts_covid_data['level']=='sub_region']

task_ts_covid_data.sort_values(['region','sub_region','date'],inplace=True)
task_ts_covid_data.reset_index(inplace=True,drop=True)

#creating lat-log data
lat_long_data = task_ts_covid_data[['region','sub_region','lat','long']].drop_duplicates()

In [3]:
#get count on nth day since reporting of first instance
nth_day = 30
column_name = 'cases'
grouped_covid_data = task_ts_covid_data.groupby(['region','sub_region'])[column_name].apply(get_count_on_nth_day,nth_day=nth_day).reset_index()
grouped_covid_data.rename(columns={f'{column_name}':f'{column_name}_on_day{nth_day}'},inplace=True)

In [4]:
grouped_covid_data = pd.merge(grouped_covid_data,lat_long_data,on=['region','sub_region'])

In [5]:
grouped_covid_data['sub_region'] = grouped_covid_data['sub_region'].str.replace(' County',"")
grouped_covid_data['sub_region'] = grouped_covid_data['sub_region'].str.replace(' Parish',"")
grouped_covid_data['State County'] = grouped_covid_data['region'] +" "+grouped_covid_data['sub_region']

In [6]:
#merging the data
merged_data= pd.merge(census_data,grouped_covid_data[['State County',f'{column_name}_on_day{nth_day}']],
                       right_on=['State County'],left_on=['State County'])
merged_data.dropna(inplace=True)

#disparity in num of counties
print('No. counties (county df):',census_data['State County'].nunique(),'\n'
      'No. counties (covid df):',grouped_covid_data['State County'].nunique(),'\n'
      'No. counties (merged df):',merged_data['State County'].nunique(),'\n'
      'Missing county:',set(grouped_covid_data['State County']) - set(census_data['State County']),
     )

#keeping only relevant data
cat_cols = ['TotalPop', 
            'pop_density',
            'Men', 'Women', 'Hispanic',
                 'White', 'Black', 'Native', 'Asian', 'Pacific',
                 'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
                 'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
                 'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
                 'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
                 'SelfEmployed', 'FamilyWork', 'Unemployment']#'area','pop_density', 

#len(relevant_cols)

target_var = [f'{column_name}_on_day{nth_day}']

train_df =merged_data[cat_cols+target_var]
train_df.replace([np.inf, -np.inf], np.nan,inplace=True)
train_df.dropna(inplace=True)

No. counties (county df): 3140 
No. counties (covid df): 2734 
No. counties (merged df): 2733 
Missing county: {'South Dakota Oglala Lakota'}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Preprocessing the data

In [7]:
#scaling vars
"""

#robust_scaler = RobustScaler()
#train_df = robust_scaler.fit_transform(train_df)
#minmax_scaler = MinMaxScaler()
#train_df = minmax_scaler.fit_transform(train_df)
scaled_independent_vars[target_var] = train_df[target_var]

"""
columns = train_df.columns
std_scaler = RobustScaler()#MinMaxScaler()#StandardScaler()
scaled_independent_vars = std_scaler.fit_transform(train_df)
scaled_independent_vars = pd.DataFrame(scaled_independent_vars)
scaled_independent_vars.columns = columns

In [8]:
#encoding vars
train_df = scaled_independent_vars

In [9]:
label_encoders = {}
for cat_col in cat_cols:
        label_encoders[cat_col] = LabelEncoder()
        train_df[cat_col] = label_encoders[cat_col].fit_transform(train_df[cat_col])

In [10]:
#training data for visualization
train_df_viz = train_df.merge(merged_data[['State','County']],
                              left_on=[train_df.index],
                              right_on=[merged_data.index])

train_df_viz =train_df_viz[train_df_viz.columns[1:]]
train_df_viz

Unnamed: 0,TotalPop,pop_density,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,...,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,cases_on_day30,State,County
0,1804,1866,1731,1775,26,431,178,4,10,0,...,149,1757,169,137,35,0,65,0.266484,Alabama,Autauga
1,2370,1902,2316,2335,45,504,95,6,7,0,...,148,2311,248,51,38,4,64,0.531602,Alabama,Baldwin
2,1192,880,1219,1093,46,162,385,2,4,0,...,125,921,151,136,53,1,161,0.452340,Alabama,Barbour
3,1027,1049,1053,937,22,418,203,4,1,0,...,172,887,201,89,47,4,72,0.486505,Alabama,Bibb
4,1830,1848,1775,1791,86,552,15,3,1,0,...,232,1705,253,63,22,4,66,0.148958,Alabama,Blount
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2727,1642,90,1636,1564,157,472,8,2,9,5,...,94,1727,219,105,17,0,43,-0.069696,Wyoming,Sublette
2728,1016,121,1005,960,150,485,2,4,19,0,...,28,1339,246,40,55,0,15,1.127434,Wyoming,Sweetwater
2729,955,271,934,917,89,551,2,4,1,0,...,88,1074,187,118,36,0,33,-0.196789,Wyoming,Teton
2730,218,72,208,231,139,497,7,3,2,0,...,9,291,138,143,56,5,57,-0.159891,Wyoming,Uinta


# Model

In [11]:
torch.manual_seed(0)
np.random.seed(0)

#building the model
dataset = TabularDataset(data=train_df, cat_cols=cat_cols,output_col=target_var)

batchsize = 10 #10
dataloader = DataLoader(dataset, batchsize, shuffle=True,)

cat_dims = [int(train_df[col].nunique()) for col in cat_cols]
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

model = FeedForwardNN(emb_dims, 
                      no_of_cont=0, 
                      lin_layer_sizes=[2, 1],
                      output_size=1, 
                      emb_dropout=0,#0.04,
                      lin_layer_dropouts=[0,0],#0.001,0.01
                     )

#training the model
no_of_epochs = 100 #1000
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(no_of_epochs):
    for y,cont_x, cat_x in dataloader:

        # Forward Pass
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y)

        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        loss_item = loss.item() / len(train_df) 
        #print(loss_item)
    # display the epoch training loss
    print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, no_of_epochs, loss_item))    

#

epoch : 1/100, loss = 0.001666
epoch : 2/100, loss = 0.000201
epoch : 3/100, loss = 0.003902
epoch : 4/100, loss = 0.003596
epoch : 5/100, loss = 9.564246
epoch : 6/100, loss = 0.001044
epoch : 7/100, loss = 0.005819
epoch : 8/100, loss = 0.007092
epoch : 9/100, loss = 0.003335
epoch : 10/100, loss = 0.006843
epoch : 11/100, loss = 0.004874
epoch : 12/100, loss = 0.007523
epoch : 13/100, loss = 0.002534
epoch : 14/100, loss = 0.004530
epoch : 15/100, loss = 0.002037
epoch : 16/100, loss = 0.004572
epoch : 17/100, loss = 0.007786
epoch : 18/100, loss = 0.005503
epoch : 19/100, loss = 0.007290
epoch : 20/100, loss = 0.005824
epoch : 21/100, loss = 0.005895
epoch : 22/100, loss = 0.007186
epoch : 23/100, loss = 0.006951
epoch : 24/100, loss = 0.005154
epoch : 25/100, loss = 0.004431
epoch : 26/100, loss = 0.007114
epoch : 27/100, loss = 0.006061
epoch : 28/100, loss = 0.004458
epoch : 29/100, loss = 0.006167
epoch : 30/100, loss = 0.007023
epoch : 31/100, loss = 0.007012
epoch : 32/100, l

In [43]:
"""
model.eval()

for name, param in model.named_parameters():
    print(name,param.shape)
"""

'\nmodel.eval()\n\nfor name, param in model.named_parameters():\n    print(name,param.shape)\n'

# Cosine similarity

In [12]:
#MERGING EMBEDDINGS TOGETHER

base_data = train_df_viz[['State','County']]

for layer_num,col_name in zip(range(0,32),cat_cols):
    df = pd.DataFrame([tensor_row.detach().numpy() for tensor_row in model.emb_layers[layer_num].weight])    
    map_to_county = pd.merge(train_df_viz[['State','County',col_name]],
                             df,left_on=col_name,#train_df_viz.index
                             right_on=df.index)
    if len(map_to_county)>0:
        base_data = base_data.merge(map_to_county,on=['State','County'])
    else:
        pass
    
base_data.drop(columns=['TotalPop'],inplace=True)    

In [13]:
cosine_similarity(get_county_from_df("Aroostook", "Maine", base_data).values[0][2:].reshape(1,-1),
                  get_county_from_df("Broward", "Florida", base_data).values[0][2:].reshape(1,-1))

array([[0.67270631]])

In [14]:
cosine_similarity(get_county_from_df("Aroostook", "Maine", base_data).values[0][2:].reshape(1,-1),
                  get_county_from_df("New York", "New York", base_data).values[0][2:].reshape(1,-1))

array([[0.99232508]])

In [15]:
embeddings_df = base_data[['State','County']]
embeddings_df['embeddings'] = [x for x in base_data[base_data.columns[2:]].values]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
new_york_ny = get_county_from_df("New York", "New York", embeddings_df).values[0][2:][0]

ny_similar = get_most_similar_counties(new_york_ny, embeddings_df, "new_yor_sim")

In [17]:
#counties least similar to ny
ny_similar.head(50)

Unnamed: 0,State,County,embeddings,new_yor_sim
1581,New Mexico,Grant,"[-0.5475952625274658, 0.47962602972984314, -1....",[[0.12372650374022576]]
2295,Texas,Hale,"[-1.336158037185669, -0.4527815282344818, -2.3...",[[0.14093499909281532]]
2369,Texas,Potter,"[-0.6584179997444153, 0.2620480954647064, -0.4...",[[0.18435065442407633]]
1582,New Mexico,Guadalupe,"[0.1728021502494812, -1.3545514345169067, 0.11...",[[0.18931511466094697]]
217,Colorado,Bent,"[-1.3654074668884277, -0.4398653507232666, -0....",[[0.1897409768474222]]
2236,Texas,Brewster,"[-0.6909804344177246, -2.1583809852600098, -1....",[[0.19923425298312095]]
370,Georgia,Clayton,"[0.5850638151168823, -0.3848890960216522, -1.4...",[[0.2045209504335937]]
240,Colorado,Las Animas,"[0.8691883087158203, 0.30765852332115173, -2.4...",[[0.20908382520083818]]
2101,South Dakota,Davison,"[-0.5370181798934937, 2.453603982925415, -0.88...",[[0.2288538159218259]]
1291,Mississippi,Jasper,"[0.3796567916870117, -2.1850745677948, -0.7291...",[[0.2310458409538341]]


In [18]:
#counties most similar
ny_similar.tail(60)

Unnamed: 0,State,County,embeddings,new_yor_sim
51,Alabama,Morgan,"[-0.19350174069404602, 0.4090951383113861, 0.2...",[[0.9948981699469059]]
2042,Rhode Island,Newport,"[-1.6319628953933716, 0.6096591353416443, -1.4...",[[0.9948992602618765]]
2605,West Virginia,Jefferson,"[-1.6044166088104248, -0.572356104850769, -0.2...",[[0.9949616706045294]]
928,Kentucky,Jackson,"[-0.7712630033493042, -1.0479376316070557, 0.1...",[[0.9949746261494391]]
1634,New York,Oneida,"[1.5458897352218628, 0.9800891280174255, -0.53...",[[0.9950051530020535]]
1975,Pennsylvania,Adams,"[1.883507251739502, -1.6423457860946655, -1.42...",[[0.9950394583952653]]
1808,Ohio,Fayette,"[-1.5053085088729858, 0.06192345544695854, 0.5...",[[0.9950596402527916]]
1110,Michigan,Benzie,"[-0.7164086699485779, -0.8618494272232056, 0.5...",[[0.995066685574515]]
866,Kansas,Scott,"[-0.2543816566467285, 1.9539371728897095, -1.0...",[[0.9950785725699556]]
605,Illinois,Rock Island,"[-3.341693639755249, -2.932926654815674, 0.827...",[[0.9951216247822674]]


In [19]:
kansas_chase = get_county_from_df("Allen", "Kansas", embeddings_df).values[0][2:][0]
kansas_similar = get_most_similar_counties(kansas_chase, embeddings_df, "kansas_allen_sim")
kansas_similar.tail(60)

Unnamed: 0,State,County,embeddings,kansas_allen_sim
903,Kentucky,Clinton,"[-0.9490610957145691, -0.4817316234111786, -2....",[[0.9568076530037397]]
2313,Texas,Hutchinson,"[0.01757507584989071, 0.6219032406806946, 0.82...",[[0.9570083256442826]]
1883,Oklahoma,Cleveland,"[-0.4391539394855499, -1.0032106637954712, -2....",[[0.957969341599314]]
235,Colorado,Jefferson,"[0.9730342030525208, -0.5554520487785339, -1.3...",[[0.9584909351856535]]
2242,Texas,Calhoun,"[-0.8625605702400208, 0.4982018768787384, -1.2...",[[0.9585320378987674]]
1950,Oregon,Douglas,"[2.3239943981170654, 1.691672921180725, -1.452...",[[0.9591027235352805]]
526,Idaho,Twin Falls,"[-1.058164358139038, 1.455926775932312, -1.515...",[[0.9605351316400692]]
1430,Missouri,Saline,"[-2.0146281719207764, -2.974017381668091, -1.2...",[[0.9610468248025397]]
1499,Nebraska,Jefferson,"[-2.215472459793091, -1.3721849918365479, -1.4...",[[0.9616444642951792]]
2562,Washington,Douglas,"[-0.790350079536438, 0.6243667006492615, -0.08...",[[0.9616755615495879]]


In [20]:
la_cali = get_county_from_df("Los Angeles", "California", embeddings_df).values[0][2:][0]
la_cali_similar = get_most_similar_counties(la_cali, embeddings_df, "la_cali_sim")
la_cali_similar.tail(60)

Unnamed: 0,State,County,embeddings,la_cali_sim
210,California,Ventura,"[1.1025316715240479, 1.9692202806472778, 1.189...",[[0.9938793796526167]]
1972,Oregon,Wasco,"[-1.4180936813354492, -1.1795707941055298, -0....",[[0.9939273219294724]]
265,Connecticut,Fairfield,"[0.45102232694625854, -0.8807605504989624, 0.0...",[[0.9940332564620028]]
215,Colorado,Arapahoe,"[-1.3381409645080566, 1.9944825172424316, -3.3...",[[0.9940496954751389]]
314,Florida,Madison,"[-3.9930503368377686, -0.16015620529651642, -0...",[[0.9940861520744665]]
2229,Texas,Bell,"[-4.142066955566406, -2.237694501876831, -2.28...",[[0.9940863928460629]]
2075,South Carolina,Lee,"[1.2311216592788696, 0.13137610256671906, -0.6...",[[0.9941330634893436]]
227,Colorado,Elbert,"[1.4158155918121338, 0.24357923865318298, -0.4...",[[0.994134757090374]]
1752,North Carolina,Vance,"[-0.452743262052536, 0.08964609354734421, -1.4...",[[0.9941403808446676]]
198,California,Santa Clara,"[-3.3662383556365967, -1.2425501346588135, -0....",[[0.9942065205950761]]


# Visualizing

In [21]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
# select a palette
from bokeh.palettes import Dark2_5
from bokeh.palettes import Spectral10, Category20c,Spectral
from bokeh.palettes import magma,linear_palette,cividis,inferno,viridis
#Turbo256
#diverging_palette,
import pandas as pd
import matplotlib
cust_cols = list(matplotlib.colors.cnames.values())
output_notebook()

In [22]:
def umap_embeddings_for_plotting(embeddings_df,num_counties,n_neighbors,min_dist,n_components,metric):
    val_arr = [ ]
    name_list = [ ]
    state_list =  []
    for embed in embeddings_df[["embeddings", "County", "State"]].values[0:num_counties]: 
        val_arr.append(embed[0])
        name_list.append(embed[1] +"_" +embed[2])
        state_list.append(embed[2])
        
    res = np.vstack(val_arr)

    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        n_components=n_components,
                        metric = metric #https://umap-learn.readthedocs.io/en/latest/parameters.html
                       )
    custer_embeddings = reducer.fit_transform(res)   
    
    return custer_embeddings,name_list, state_list

In [23]:
from sklearn.manifold import TSNE
def tsne_embeddings_for_plotting(embeddings_df,num_counties,perplexity,
                                 early_exaggeration,
                                 learning_rate,
                                 n_iter,
                                 random_state=None):
    val_arr = [ ]
    name_list = [ ]
    state_list =  []
    for embed in embeddings_df[["embeddings", "County", "State"]].values[0:num_counties]: 
        val_arr.append(embed[0])
        name_list.append(embed[1] +"_" +embed[2])
        state_list.append(embed[2])
        
    res = np.vstack(val_arr)

    tsne_embedded = TSNE(n_components=2,
                         perplexity=perplexity,
                         early_exaggeration=early_exaggeration,
                         learning_rate=learning_rate,
                         n_iter=n_iter,
                         random_state=random_state).fit_transform(res) 
    
    return tsne_embedded,name_list, state_list

In [24]:
def make_embedding_plot(low_dim_embeddings,labels,category):
    df = pd.DataFrame(low_dim_embeddings, columns=('x', 'y'))
    df['colors'] = category
    df['labels'] = labels
    datasource = ColumnDataSource(df)
    plot_figure = figure(title='UMAP projection Counties'
                         ,plot_width=800,
                         plot_height=700,
                         tools=('pan, wheel_zoom, reset'),
                         background_fill_color = "white"
                        )

    plot_figure.add_tools(HoverTool(tooltips="""
    <div>
    <div>
        <span style='font-size: 10px; color: #224499'></span>
        <span style='font-size: 10px'>@labels</span>
    </div>
    </div>
    """))

    color_mapping = CategoricalColorMapper(factors=list(set(category)), 
                                           #palette=magma
                                           palette=magma(len(set(category))+2)#[2:],
                                           #palette = Spectral[len(color_cats)+2]#[2:]
                                           #palette = Category20c[:len(color_cats)+2]#[2:]
                                           #palette= linear_palette(magma(len(set(color_cats))+2)
                                           #                        ,len(set(color_cats))+2)
                                           #palette = Category20c[20]#[2:]
                                           #palette = cust_cols[len(set(category)):len(set(category))+len(set(category))+2]
                                          )
    plot_figure.circle('x','y',source=datasource,
                       color=dict(field='colors', transform=color_mapping),
                       line_alpha=0.6,fill_alpha=0.6,
                       size=6,
                       #legend='colors'
                      )    
    show(plot_figure)

# All counties

In [25]:
embeddings_df_for_plotting = pd.merge(embeddings_df,
                                      grouped_covid_data[['region','sub_region','lat','long']],
                                      left_on = ['State','County'],
                                     right_on = ['region','sub_region'])
embeddings_df_for_plotting.sort_values(['lat','long','State'],inplace=True)
embeddings_df_for_plotting.drop(columns=['region','sub_region','lat','long'],inplace =True)

### Umap

In [26]:
custer_embeddings,name_list, state_list = umap_embeddings_for_plotting(embeddings_df,
                                                                       num_counties=2730,
                                                                       n_neighbors=5,
                                                                       min_dist=0.01,
                                                                       n_components=2,
                                                                      metric='cosine')
make_embedding_plot(custer_embeddings,name_list,state_list)



## T-sne

In [27]:
current_state = np.random.get_state()
#np.random.set_state(current_state)
custer_embeddings,name_list, state_list = tsne_embeddings_for_plotting(embeddings_df,
                                                                        num_counties=2730,
                                                                        perplexity=50,
                                                                        early_exaggeration=200,
                                                                        learning_rate=10,
                                                                        n_iter=1000,
                                                                       
                                                                       )
                                                                
make_embedding_plot(custer_embeddings,name_list,state_list)

## Plots by county

In [28]:
def generate_similarity_df(county,state,embeddings_df):
    state_county = get_county_from_df(county, state, embeddings_df).values[0][2:][0]
    similarity_df = get_most_similar_counties(state_county, embeddings_df, f'{state}_{county}_sim')
    similarity_df.sort_values([ f'{state}_{county}_sim'],ascending=False,inplace=True)
    similarity_df.drop(columns= [f'{state}_{county}_sim'],inplace=True)
    return similarity_df

In [29]:
similarity_df = generate_similarity_df("New York","New York",embeddings_df)
custer_embeddings,name_list, state_list = umap_embeddings_for_plotting(similarity_df,
                                                                       num_counties=500,
                                                                       n_neighbors=5,
                                                                       min_dist=0,
                                                                       n_components=2,
                                                                      metric='cosine')
make_embedding_plot(custer_embeddings,name_list,state_list)

In [30]:
similarity_df = generate_similarity_df("El Paso","Texas",embeddings_df)
custer_embeddings,name_list, state_list = umap_embeddings_for_plotting(similarity_df,
                                                                       num_counties=500,
                                                                       n_neighbors=15,
                                                                       min_dist=0,
                                                                       n_components=2,
                                                                      metric='cosine')
make_embedding_plot(custer_embeddings,name_list,state_list)

# Results

In [31]:

res_dir = r'C:\Users\Cafral\Desktop\kaggle\team_ts_forecast\embeddings\results\cat2vec\v2'
embeddings_df.to_pickle(res_dir+r'\county_embeddings_with_pop_density.csv')