In [21]:
import sys
import pandas as pd
import pickle
sys.path.append('../twc_api/')
import boto3
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
session = boto3.Session(profile_name='twc')
s3 = session.resource('s3')

In [2]:
from api.utils.transformers import *
from api.utils.aws import *

In [3]:
from scipy.stats import spearmanr

### Generate Training Data

In [4]:
generator = TrainingDataGenerator('../../Welcome-Centre-DataCorps-Data/ClientDatabaseStructure.mdb.sqlite')
training_data = generator.get_training_data(limit=1000)

In [5]:
training_data.keys()

dict_keys(['referralreason', 'referralbenefit', 'referraldietaryrequirements', 'referraldomesticcircumstances', 'referralissue', 'clientissue', 'referral', 'referraldocument', 'client'])

### Build your Transformer and Choose Features

In [6]:
features_to_split = []

transformer = TransformerPipeline([
                        ConsolidateTablesTransformer(count_encode=False),
                        AddFutureReferralTargetFeatures(),
                        TimeFeatureTransformer(break_length=28),
                        SplitCurrentAndEverTransformer(['referralissue_', 
                                                       'referraldomesticcircumstances_',
                                                        'referralreason_', 'referralbenefit_'])
                                    ], aligner=AlignFeaturesToColumnSchemaTransformer())

X, y, referral_table = transformer.fit_transform(training_data)

X = X.fillna(0)

In [11]:
X_train = X[X.index < 900]
X_test = X[X.index >= 900]
y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]

# Build a model

In [12]:
from sklearn.ensemble import ExtraTreesRegressor

In [13]:
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=5)
et.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=-1,
          oob_score=False, random_state=None, verbose=0, warm_start=False)

# Evaluate model

In [14]:
y_pred = pd.Series(et.predict(X_test), X_test.index)

In [15]:
test_referral_table = referral_table.loc[X_test.index]

In [16]:
def get_scores_per_window(x, y, group, threshold=0.50):
    corr = spearmanr(x, y)[0]
    mu_a = x.groupby(group).mean()
    mu_p = y.groupby(group).mean()
    mu_a_top = mu_a[mu_a.rank(ascending=False) / len(mu_a) < threshold]
    mu_p_top = mu_p[mu_p.rank(ascending=False) / len(mu_p) < threshold]
    overlap = mu_p_top.index.isin(mu_a_top.index).mean()
    return pd.Series([corr, overlap], index=['spearman', 'overlap'])

In [17]:
def evaluate_average_weekly_rank_correlation(test_referral_table, y_test, y_pred):
    grouped_y = test_referral_table.assign(y=y_test, pred=y_pred).set_index('referral_referraltakendate')\
        .groupby([pd.TimeGrouper('1W'), 'client_clientid'])['y'].mean()
    grouped_pred_y = test_referral_table.assign(y=y_test, pred=y_pred).set_index('referral_referraltakendate')\
        .groupby([pd.TimeGrouper('1W'), 'client_clientid'])['pred'].mean()
    time_grouped = pd.concat([grouped_y, grouped_pred_y], axis=1)
    return time_grouped.reset_index().groupby(['referral_referraltakendate']).\
        apply(lambda k: get_scores_per_window(k['y'], k['pred'], k['client_clientid'])).dropna().mean()

In [18]:
evaluate_average_weekly_rank_correlation(test_referral_table, y_test, y_pred)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  import sys
  ret = ret.dtype.type(ret / rcount)
  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


spearman    0.132473
overlap     0.208333
dtype: float64

# Save your model

In [39]:
from api.utils.models import TWCModel
import tempfile

In [46]:
def save_model_from_notebook(s3_resource, model, version=None, bucket_name=None):
    """If you are doing this outside of the app context (e.g. in notebook)"""
    tf = tempfile.NamedTemporaryFile(delete=False)
    with open(tf.name, 'wb') as fh:
        pickle.dump({'model': model, 'version': version}, fh)
    bucket = s3_resource.Bucket(bucket_name)
    bucket.upload_file(tf.name, 'twc_model_' + str(version))
    os.remove(tf.name)

model = TWCModel()
model.transformer = transformer
model.model = et
save_model_from_notebook(s3, model, version='12345', bucket_name='twc-models')

In [None]:
model2.load('etmodel.p')

### Test your model works on json test file

In [None]:
with open('../api/twc_sample_request.json') as f:
    json_string = f.read()

In [None]:
p = ParseJSONToTablesTransformer()
example_tables = p.fit_transform(json_string)

In [None]:
model2.transformer.fit_transform(example_tables)