In [1]:
import sys
import pandas as pd
import pickle
sys.path.append('../')

In [2]:
from api.utils.transformers import *

In [3]:
from scipy.stats import spearmanr

### Generate Training Data

In [4]:
generator = TrainingDataGenerator('../../Welcome-Centre-DataCorps-Data/ClientDatabaseStructure.mdb.sqlite')
training_data = generator.get_training_data(limit=100000)

### Build your Transformer and Choose Features

In [8]:
features_to_split = []

transformer = TransformerPipeline([
            ConsolidateTablesTransformer(),
            AddFutureReferralTargetFeatures(),
            TimeFeatureTransformer(break_length=28),
            SplitCurrentAndEverTransformer(['ReferralIssue_', 'ReferralDomesticCircumstances_',
                                            'ReferralReason_', 'ReferralBenefit_'])
        ], aligner=AlignFeaturesToColumnSchemaTransformer())

X, y, referral_table = transformer.fit_transform(training_data)

X = X.fillna(0)

100%|██████████| 107/107 [00:14<00:00,  7.53it/s]


In [9]:
X_train = X[X.index < 13500]
X_test = X[X.index >= 13500]
y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]

# Build a model

In [10]:
from sklearn.ensemble import ExtraTreesRegressor

In [11]:
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=500)
et.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
          oob_score=False, random_state=None, verbose=0, warm_start=False)

# Evaluate model

In [12]:
y_pred = pd.Series(et.predict(X_test), X_test.index)

In [13]:
test_referral_table = referral_table.loc[X_test.index]

In [14]:
def get_scores_per_window(x, y, group, threshold=0.50):
    corr = spearmanr(x, y)[0]
    mu_a = x.groupby(group).mean()
    mu_p = y.groupby(group).mean()
    mu_a_top = mu_a[mu_a.rank(ascending=False) / len(mu_a) < threshold]
    mu_p_top = mu_p[mu_p.rank(ascending=False) / len(mu_p) < threshold]
    overlap = mu_p_top.index.isin(mu_a_top.index).mean()
    return pd.Series([corr, overlap], index=['spearman', 'overlap'])

In [15]:
def evaluate_average_weekly_rank_correlation(test_referral_table, y_test, y_pred):
    grouped_y = test_referral_table.assign(y=y_test, pred=y_pred).set_index('Referral_ReferralTakenDate')\
        .groupby([pd.TimeGrouper('1W'), 'Client_ClientId'])['y'].mean()
    grouped_pred_y = test_referral_table.assign(y=y_test, pred=y_pred).set_index('Referral_ReferralTakenDate')\
        .groupby([pd.TimeGrouper('1W'), 'Client_ClientId'])['pred'].mean()
    time_grouped = pd.concat([grouped_y, grouped_pred_y], axis=1)
    return time_grouped.reset_index().groupby(['Referral_ReferralTakenDate']).\
        apply(lambda k: get_scores_per_window(k['y'], k['pred'], k['Client_ClientId'])).dropna().mean()

In [16]:
evaluate_average_weekly_rank_correlation(test_referral_table, y_test, y_pred)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


spearman    0.508887
overlap     0.666668
dtype: float64

# Save your model

In [None]:
from api.utils.models import TWCModel

In [None]:
model = TWCModel()
model.transformer = transformer
model.model = et
model.save('etmodel.p')

In [None]:
model2 = TWCModel()

In [None]:
model2.load('etmodel.p')

### Test your model works on json test file

In [None]:
with open('../api/twc_sample_request.json') as f:
    json_string = f.read()

In [None]:
p = ParseJSONToTablesTransformer()
example_tables = p.fit_transform(json_string)

In [None]:
model2.transformer.fit_transform(example_tables)