In [1]:
import sys
import pandas as pd
import pickle
sys.path.append('../')

%load_ext autoreload
%autoreload

In [2]:
from api.utils.transformers import *

In [3]:
from scipy.stats import spearmanr

### Generate Training Data

In [4]:
generator = TrainingDataGenerator('../../Welcome-Centre-DataCorps-Data/ClientDatabaseStructure.mdb.sqlite')
training_data = generator.get_training_data(limit=100000)

In [5]:
training_data.keys()

dict_keys(['referraldocument', 'referralissue', 'client', 'referraldietaryrequirements', 'referraldomesticcircumstances', 'referralbenefit', 'clientissue', 'referral', 'referralreason'])

### Build your Transformer and Choose Features

In [6]:
features_to_split = []

transformer = TransformerPipeline([
                        ConsolidateTablesTransformer(count_encode=False),
                        AddFutureReferralTargetFeatures(),
                        TimeFeatureTransformer(break_length=28),
                        SplitCurrentAndEverTransformer(['referralissue_', 
                                                       'referraldomesticcircumstances_',
                                                        'referralreason_', 'referralbenefit_'])
                                    ], aligner=AlignFeaturesToColumnSchemaTransformer())

X, y, referral_table = transformer.fit_transform(training_data)

X = X.fillna(0)

100%|██████████| 107/107 [00:15<00:00,  6.97it/s]


In [7]:
X_train = X[X.index < 13500]
X_test = X[X.index >= 13500]
y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]

# Build a model

In [8]:
from sklearn.ensemble import ExtraTreesRegressor

In [9]:
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=500)
et.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
          oob_score=False, random_state=None, verbose=0, warm_start=False)

# Evaluate model

In [10]:
y_pred = pd.Series(et.predict(X_test), X_test.index)

In [11]:
test_referral_table = referral_table.loc[X_test.index]

In [12]:
def get_scores_per_window(x, y, group, threshold=0.50):
    corr = spearmanr(x, y)[0]
    mu_a = x.groupby(group).mean()
    mu_p = y.groupby(group).mean()
    mu_a_top = mu_a[mu_a.rank(ascending=False) / len(mu_a) < threshold]
    mu_p_top = mu_p[mu_p.rank(ascending=False) / len(mu_p) < threshold]
    overlap = mu_p_top.index.isin(mu_a_top.index).mean()
    return pd.Series([corr, overlap], index=['spearman', 'overlap'])

In [15]:
def evaluate_average_weekly_rank_correlation(test_referral_table, y_test, y_pred):
    grouped_y = test_referral_table.assign(y=y_test, pred=y_pred).set_index('referral_referraltakendate')\
        .groupby([pd.TimeGrouper('1W'), 'client_clientid'])['y'].mean()
    grouped_pred_y = test_referral_table.assign(y=y_test, pred=y_pred).set_index('referral_referraltakendate')\
        .groupby([pd.TimeGrouper('1W'), 'client_clientid'])['pred'].mean()
    time_grouped = pd.concat([grouped_y, grouped_pred_y], axis=1)
    return time_grouped.reset_index().groupby(['referral_referraltakendate']).\
        apply(lambda k: get_scores_per_window(k['y'], k['pred'], k['client_clientid'])).dropna().mean()

In [16]:
evaluate_average_weekly_rank_correlation(test_referral_table, y_test, y_pred)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


spearman    0.498007
overlap     0.657884
dtype: float64

In [23]:
referral_table[referral_table['client_clientid']==1872]

Unnamed: 0,referral_statusid,referral_referralonhold,referral_referraltakendate,referral_referralreadydate,referral_referralcollecteddate,referral_referralworkerid,referral_referralhandedworkerid,referral_clientid,referral_partnername,referral_partnerid,...,referral_referralnotes,referral_updatetimestamp,client_clientid,reference_date,futurereferraltargetfeature_futurereferralcount,futurereferraltargetfeature_futurereferralscore,futurereferraltargetfeature_futurereferralgaps,timefeature_referralnumber,timefeature_burstnumber,timefeature_totalreferralsforclient
3238,6,0,2014-08-26 10:58:00,,,4,,1872,,0.0,...,This was a test,12/03/14 09:48:55,1872.0,NaT,0.0,0.0,0.0,1.0,1.0,2
16763,6,0,2017-02-03 15:23:00,,,4,,1872,,,...,Test data only,02/03/17 15:28:52,1872.0,NaT,0.0,0.0,0.0,2.0,2.0,2


# Save your model

In [17]:
from api.utils.models import TWCModel

In [18]:
%pwd

'/Users/davidsykes/Git-repos/welcome-centre/Welcome-Centre-DataCorps-Code/notebooks'

In [19]:
model = TWCModel()
model.transformer = transformer
model.model = et
model.save('../api/etmodel.p')

In [None]:
model2 = TWCModel()

In [None]:
model2.load('etmodel.p')

### Test your model works on json test file

In [None]:
with open('../api/twc_sample_request.json') as f:
    json_string = f.read()

In [None]:
p = ParseJSONToTablesTransformer()
example_tables = p.fit_transform(json_string)

In [None]:
model2.transformer.fit_transform(example_tables)