# Training & Transformation

The tidy data collected during scrapes needs to be widened (one-hot encoded or embedded) prior to training.  FastMap can then pin a model with an `X_transformed` for future predictions.

NOTE:  The perceptron isn't behaving well with this dataset.  Moving to sklearn's logistic regression for a bit more stability and pipeline constructs.

In [1]:
# General Imports
import pandas as pd
import numpy as np
from db import get_session
from models import *

## Filter relevant data (within square boundary)

Given a model centroid, get the data within a given radius.  For earlier versions, this can be done as a square, but future methods may require redesigning the query filter statement.

In [2]:
def get_near_data(center_coord, radius):
    """Fetch search results that fall inside a square around a centre point.

    ``radius`` is applied to latitude and longitude independently, so the
    selected region is an axis-aligned square rather than a circle.  All
    four bounds are exclusive.

    Args:
        center_coord: Indexable whose item 0 is the centre latitude and
            item 1 is the centre longitude.
        radius: Half the side length of the square, in the same units as
            the coordinates.

    Returns:
        The result of ``.all()`` on the filtered query over latitude,
        longitude, category and num_unique -- presumably a list of row
        tuples; confirm against the session implementation.
    """
    lat_min, lat_max = center_coord[0] - radius, center_coord[0] + radius
    lon_min, lon_max = center_coord[1] - radius, center_coord[1] + radius

    with get_session() as session:
        selected = session.query(
            SearchResults.latitude,
            SearchResults.longitude,
            SearchResults.category,
            SearchResults.num_unique,
        )
        # Strict inequalities: rows lying exactly on the edge are excluded.
        inside_square = selected.filter(
            SearchResults.latitude > lat_min,
            SearchResults.latitude < lat_max,
            SearchResults.longitude > lon_min,
            SearchResults.longitude < lon_max,
        )
        rows = inside_square.all()
    return rows

data = get_near_data((32.715736, -117.161087), 0.5)

In [3]:
# Transform data into dataframe for encoding
columns = ['lat', 'lon', 'cat', 'num_unique']
df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,lat,lon,cat,num_unique
0,32.915736,-117.411087,auto,50
1,32.915736,-117.411087,localservices,50
2,32.915736,-117.411087,physicians,50
3,32.915736,-117.411087,restaurants,50
4,32.915736,-117.411087,realestate,50


One-hot encoding is probably not ideal, because categories change.  A simple word embedding for the category is likely a far more robust solution in the event that categories are introduced that don't match any existing one.  Then the one-hot isn't needed, just the expansion of the vector list!

In [4]:
# Quick example with get_dummies.  Not suitable for final use.
X = pd.get_dummies(df, columns=['cat']).drop(columns='num_unique').to_numpy()
y = df.num_unique.to_numpy().reshape(-1,1)

In [5]:
# Manual one-hot encoding, done by hand instead of with scikit-learn
from  read_query import list_categories
from app_global import g

def get_categories():
    """Return the category lookup, loading it at most once per ``g``.

    The first call stores the result of ``list_categories(with_id=True)``
    on the ``g`` object; later calls reuse that stored value, so anything
    that changes in the source afterwards is not picked up.
    """
    if hasattr(g, 'categories'):
        return g.categories
    g.categories = list_categories(with_id=True)
    return g.categories

def encode_cat(x):
    """One-hot encode a single category.

    Args:
        x: Key to look up in the mapping returned by ``get_categories()``.

    Returns:
        1-D float array with one slot per known category.  The slot at
        ``categories[x] - 1`` is 1 and every other slot is 0.

    Raises:
        KeyError: presumably, when ``x`` is not a known category (assumes
            the mapping behaves like a dict -- TODO confirm).
    """
    lookup = get_categories()
    one_hot = np.zeros(len(lookup))
    # The stored id is shifted down by one to get the array position;
    # ids appear to be 1-based -- TODO confirm against list_categories.
    hot_slot = lookup[x] - 1
    one_hot[hot_slot] = 1
    return one_hot

## Try to create binary vector for each row.
df.cat.apply(encode_cat)

0      [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1      [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2      [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3      [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4      [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
                             ...                        
368    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
369    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
370    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
371    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
372    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: cat, Length: 373, dtype: object

In [6]:
def transform_observation(obs):
    """Widen one three-field observation into a single flat vector.

    Args:
        obs: Indexable of exactly three items.  Items 0 and 1 are copied
            through unchanged; item 2 is passed to ``encode_cat``.

    Returns:
        1-D array made of item 0, item 1, then the one-hot category vector.

    Raises:
        AssertionError: if ``obs`` does not have exactly three items.
    """
    assert len(obs) == 3
    first, second = [obs[0]], [obs[1]]
    category_vector = encode_cat(obs[2])
    # axis=None flattens every piece before joining them end to end.
    pieces = (first, second, category_vector)
    return np.concatenate(pieces, axis=None)

def truncate_x(obs):
    """Return the first three items of ``obs`` (the feature part of a row)."""
    feature_part = slice(0, 3)
    return obs[feature_part]

def truncate_y(obs):
    """Return the final item of ``obs`` (the target part of a row)."""
    target = obs[-1]
    return target


transform_observation(truncate_x(data[0]))

array([  32.915736, -117.411087,    1.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ])

In [7]:
def split_transform(data, y_scale=50.):
    """Split raw rows into a widened feature matrix and a scaled target.

    Every row is run through ``truncate_x`` and ``transform_observation``
    to build its feature vector, and through ``truncate_y`` to pick its
    target value.

    Args:
        data: Iterable of rows.  It is materialised once up front, so a
            one-shot iterable (e.g. a generator) still yields matching
            features and targets.
        y_scale: Divisor applied to the target.  The perceptron is sigmoid
            activated, so the target is meant to be scaled into 0..1.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        Tuple ``(X, y)``.  ``X`` is a float32 array with one transformed
        observation per row; ``y`` is a float32 column vector of targets
        divided by ``y_scale``.
    """
    # Walk the input exactly once: features and targets are read from the
    # same rows and must stay aligned.
    rows = list(data)
    X = np.array(
        [transform_observation(truncate_x(row)) for row in rows],
        dtype='f',
    )
    y = np.array([truncate_y(row) for row in rows], dtype='f').reshape(-1, 1)
    return X, y / y_scale

X, y = split_transform(data)

In [8]:
display(X[0], y[0])

array([  32.915737, -117.41109 ,    1.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ,
          0.      ,    0.      ,    0.      ,    0.      ,    0.      ],
      dtype=float32)

array([1.], dtype=float32)

In [9]:
# Boom.  Hand-implemented one-hot encoding with an X, y split.  Caveats: it silently drops data
#    if a row has more than 4 fields, and raises a KeyError if a category is added to the db during a run.

## Pin model with data

Using X, y

In [10]:
import lens

In [11]:
display(X.shape, y.shape)

(373, 65)

(373, 1)

In [24]:
# Initialize  modelmap
modelmap = lens.ModelMap(
    center_coord = [32.715736, -117.161087]
)

In [25]:
# Pin model
model_info = modelmap.pin_model(
    X=X,
    y=y,
    coordinates = [32.715736, -117.161087]
)
model_info

ValueError: shapes (373,373) and (1,10) not aligned: 373 (dim 1) != 1 (dim 0)

### Save model data to database for later use

The database will maintain a list of available perceptrons.  

Note: Currently, these are stored in a temporary folder, so the database references are no longer valid after garbage collection or a system restart.  This table's data will have to be dropped at the start of each run.


In [61]:
from write_query import write_model_metadata

write_model_metadata(model_info)

### Check Cache for Model Persistence in Memory

The cache should keep a number of the most recent or most used models for quick access, so that they do not have to be loaded from disk.

In [62]:
modelmap.cache

{1577501386.5982246: {'model': <lens.perceptron.Perceptron at 0x7f7d2ef69490>,
  'geohash': '9mudjgtyuz1w'}}

### Create datapoint X for prediction of num_unique

Input X_i must match the format of X_train

In [63]:
def prep_input(x: tuple):
    """Wrap a single observation as a one-row, two-dimensional array.

    Args:
        x: The observation to convert.

    Returns:
        Array of shape ``(1, n)`` holding every element of ``x`` in order.
    """
    as_array = np.array(x)
    # -1 lets numpy work out the column count from the element count.
    return np.reshape(as_array, (1, -1))

X_test, y = split_transform(prep_input(data[0]))

Note:  X is always the first output.  Though slightly inefficient, no extra logic is particularly needed to NOT return y.

In [64]:
# Manually pull the first entry out of the cache.  Taking the first value
# directly avoids building a list of every key just to index element 0.
model = next(iter(modelmap.cache.values()))['model']

In [66]:
model.predict(X_test)*50

array([[25.]])

# General Pipeline Construction

The single perceptron did not yield good results, producing what are effectively averages instead of the desired learning.  This could be due to the sparsity of the training set or to fickle training hyperparameters.

Because of this, and given the development of the input/training set transformation functions, a move toward logistic regression will be tried.  Another thing gained by moving to Scikit-Learn is pipeline functionality out of the box and access to a number of scaling functions.

In [2]:
# Get Training Data around: 
from read_query import get_near_data

test_center = [32.715736, -117.161087]
data = get_near_data(center_coord=test_center, radius=0.5)
data[0:10]

[(32.915736, -117.411087, 'auto', 50),
 (32.915736, -117.411087, 'localservices', 50),
 (32.915736, -117.411087, 'physicians', 50),
 (32.915736, -117.411087, 'restaurants', 50),
 (32.915736, -117.411087, 'realestate', 50),
 (32.915736, -117.411087, 'farms', 14),
 (32.915736, -117.411087, 'active', 49),
 (32.915736, -117.411087, 'partyequipmentrentals', 50),
 (32.915736, -117.411087, 'plumbing', 50),
 (32.915736, -117.411087, 'eventservices', 43)]

In [3]:
import lens

X, y = lens.split_widen_data(data)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Scaling step: StandardScaler with its default settings.
standard_scaler = StandardScaler()

# NOTE(review): LogisticRegression is a classifier, so each distinct y value
# is treated as a class label -- confirm that is intended for num_unique.
lregr = LogisticRegression(solver='liblinear', random_state=420)

# Scale first, then fit/predict with the logistic model.
tpipe = Pipeline(steps=[
    ('scaler', standard_scaler),
    ('logistic', lregr),
])

In [4]:
tpipe.fit(X, y)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logistic',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=420,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [23]:
num = 65
test_data = X[num]
# print() returns None, so passing it to display() rendered a stray "None"
# after the prediction.  Print the actual value on its own instead.
print('actual:', y[num])
display(tpipe.predict(test_data.reshape(1, -1)))

actual: 38.0


array([12.], dtype=float32)

None

In [4]:
# Attempt to pin pipeline model
from lens import ModelMap

# Single definition of the pin location so the map centre and the pinned
# model cannot drift apart.
pin_center = (32.715736, -117.161087)

# Initialize modelmap.  Uses the ModelMap imported above rather than
# `lens.ModelMap`, which relied on `import lens` having run in another cell.
modelmap = ModelMap(
    center_coord=list(pin_center)
)
# Pin model
model_info = modelmap.pin_model(
    X=X,
    y=y,
    coordinates=list(pin_center)
)
model_info

{'geohash': '9mudjgtyuz1w',
 'latitude': 32.715736,
 'longitude': -117.161087,
 'radius': 0.05,
 'observations': 373,
 'file_location': '/tmp/9mudjgtyuz1w.pkl'}

In [6]:
# Manually pull the first entry out of the cache.  Taking the first value
# directly avoids building a list of every key just to index element 0.
model = next(iter(modelmap.cache.values()))['model']
model.predict(X[65].reshape(1, -1))

array([12.], dtype=float32)