In [39]:
import sys
import pandas as pd
import pickle
sys.path.append('../')

In [40]:
from api.utils.transformers import *

In [41]:
from scipy.stats import spearmanr

### Generate Training Data

In [42]:
generator = TrainingDataGenerator('../../Welcome-Centre-DataCorps-Data/ClientDatabaseStructure.mdb.sqlite')
training_data = generator.get_training_data(limit=100000)

### Build your Transformer and Choose Features

In [43]:
features_to_split = []

transformer = TransformerPipeline([
            ConsolidateTablesTransformer(count_encode=True),
            AddFutureReferralTargetFeatures(),
            TimeFeatureTransformer(break_length=28),
            TimeWindowFeatures(windows=[1,4,16,32,52]),
            SplitCurrentAndEverTransformer(['ReferralIssue_', 'ReferralDomesticCircumstances_',
                                            'ReferralReason_', 'ReferralBenefit_'])
        ], aligner=AlignFeaturesToColumnSchemaTransformer())

X, y, referral_table = transformer.fit_transform(training_data)

X = X.fillna(0)

100%|██████████| 107/107 [00:12<00:00,  8.62it/s]
  lambda k: k.groupby(pd.TimeGrouper('1W', convention='e')).size())


In [44]:
referral_table['Referral_ReferralTakenDate'].max()

X = X[referral_table['Referral_ReferralTakenDate'] < pd.to_datetime('2016-04-11')]

sorted_dates = referral_table.loc[X.index]['Referral_ReferralTakenDate'].sort_values()
test_start_date = sorted_dates.iloc[int(len(sorted_dates) * 0.8)]

X_train = X[referral_table['Referral_ReferralTakenDate'] < test_start_date]
X_test = X[referral_table['Referral_ReferralTakenDate'] >= test_start_date]
y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]

  
  if __name__ == '__main__':


# Build a model

In [7]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import RidgeCV

In [8]:
weight = 1/X_train['TimeFeature_IndexInBurst']

In [None]:
eth_columns = X_train.filter(like='Eth').columns
country_columns = X_train.filter(like='CountryID').columns

In [63]:
X_train_censored = X_train.drop(eth_columns, axis=1).drop(country_columns, axis=1)
X_test_censored = X_test.drop(eth_columns, axis=1).drop(country_columns, axis=1)
X_train_no_eth = X_train.drop(eth_columns, axis=1)
X_test_no_eth = X_test.drop(eth_columns, axis=1)
X_train_no_country = X_train.drop(country_columns, axis=1)
X_test_no_country = X_test.drop(country_columns, axis=1)

In [18]:
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=500)
et.fit(X_train, y_train, sample_weight=weight.loc[X_train.index])

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
          oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
pd.Series(et.feature_importances_, X_train.columns).sort_values()

ReferralDietaryRequirements_34              0.000000e+00
ReferralReason_11_Ever                      0.000000e+00
ReferralIssue_47_Current                    0.000000e+00
ReferralDietaryRequirements_36              0.000000e+00
ReferralReason_11_Current                   0.000000e+00
ReferralIssue_47_Ever                       0.000000e+00
ReferralBenefit_2_Current                   8.738555e-09
ReferralBenefit_2_Ever                      1.036778e-08
ClientIssue_17                              1.193749e-08
ReferralIssue_9_Current                     1.273332e-08
ReferralIssue_9_Ever                        4.934163e-08
Client_AddressLocalityId_8                  3.918752e-07
ReferralIssue_2_Ever                        7.835760e-07
ReferralIssue_13_Current                    8.420719e-07
ReferralReason_12_Current                   1.336026e-06
ReferralIssue_31_Ever                       2.380954e-06
ReferralReason_12_Ever                      4.171768e-06
ClientIssue_15                 

# Evaluate model

In [23]:
y_pred = pd.Series(et.predict(X_test), X_test.index)

In [24]:
test_referral_table = referral_table.loc[X_test.index]

In [25]:
def get_scores_per_window(x, y, group, threshold=0.1):
    corr = spearmanr(x, y)[0]
    mu_a = x.groupby(group).mean()
    mu_p = y.groupby(group).mean()
    mu_a_top = mu_a[mu_a.rank(ascending=False) / len(mu_a) < threshold]
    mu_p_top = mu_p[mu_p.rank(ascending=False) / len(mu_p) < threshold]
    overlap = mu_p_top.index.isin(mu_a_top.index).mean()
    return pd.Series([corr, overlap], index=['spearman', 'overlap'])

In [26]:
small_referral_table = test_referral_table[test_referral_table['TimeFeature_ReferralNumber']<6]

In [70]:
def get_weekly_scores(referral_table):
    grouped_y = referral_table.assign(y=y_test, pred=y_pred).set_index('Referral_ReferralTakenDate')\
        .groupby([pd.TimeGrouper('1W'), 'Client_ClientId'])['y'].mean()
    grouped_pred_y = referral_table.assign(y=y_test, pred=y_pred).set_index('Referral_ReferralTakenDate')\
            .groupby([pd.TimeGrouper('1W'), 'Client_ClientId'])['pred'].mean()
    combined = pd.concat([grouped_y, grouped_pred_y], axis=1)

    results = []

    for i in combined.index.get_level_values(0).unique():
        row = (i,)
        row += (len(combined.loc[i]),)
        row += (spearmanr(combined.loc[i]['y'], combined.loc[i]['pred'])[0],)
        ranked = (combined.loc[i].rank(ascending=False) / len(combined.loc[i]))
        for threshold in [0.1, 0.25, 0.5]:
            severe = ranked < threshold
            row += (severe[severe['pred']]['y'].mean(),)

        results.append(row)
    return pd.DataFrame(results, columns=['date', 'n', 'corr', 'prec10', 'prec25', 'prec50'])

# Evaluate Different Models

Base

In [75]:
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=500)
et.fit(X_train, y_train, sample_weight=weight.loc[X_train.index])
y_pred = pd.Series(et.predict(X_test), X_test.index)
test_referral_table = referral_table.loc[X_test.index]
small_referral_table = test_referral_table[test_referral_table['TimeFeature_ReferralNumber']<6]
results_big = get_weekly_scores(test_referral_table)
results_small = get_weekly_scores(small_referral_table)
print(results_small.mean())
print(results_big.mean())

  
  This is separate from the ipykernel package so we can avoid doing imports until


n         67.217391
corr       0.353627
prec10     0.346273
prec25     0.457589
prec50     0.593056
dtype: float64
n         104.130435
corr        0.542407
prec10      0.606599
prec25      0.621573
prec50      0.715268
dtype: float64


Remove both country and ethnicity

In [77]:
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=500)
et.fit(X_train_censored, y_train, sample_weight=weight.loc[X_train.index])
y_pred = pd.Series(et.predict(X_test_censored), X_test.index)
test_referral_table = referral_table.loc[X_test.index]
small_referral_table = test_referral_table[test_referral_table['TimeFeature_ReferralNumber']<6]
results_big = get_weekly_scores(test_referral_table)
results_small = get_weekly_scores(small_referral_table)
print(results_small.mean())
print(results_big.mean())

  
  This is separate from the ipykernel package so we can avoid doing imports until


n         67.217391
corr       0.321240
prec10     0.280176
prec25     0.451590
prec50     0.586696
dtype: float64
n         104.130435
corr        0.514443
prec10      0.587945
prec25      0.603878
prec50      0.702585
dtype: float64


Remove just ethnicity

In [78]:
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=500)
et.fit(X_train_no_eth, y_train, sample_weight=weight.loc[X_train.index])
y_pred = pd.Series(et.predict(X_test_no_eth), X_test.index)
test_referral_table = referral_table.loc[X_test.index]
small_referral_table = test_referral_table[test_referral_table['TimeFeature_ReferralNumber']<6]
results_big = get_weekly_scores(test_referral_table)
results_small = get_weekly_scores(small_referral_table)
print(results_big.mean())
print(results_small.mean())

  
  This is separate from the ipykernel package so we can avoid doing imports until


n         104.130435
corr        0.541455
prec10      0.574901
prec25      0.629613
prec50      0.712592
dtype: float64
n         67.217391
corr       0.365617
prec10     0.347101
prec25     0.486795
prec50     0.603661
dtype: float64


Remove just country

In [79]:
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=500)
et.fit(X_train_no_country, y_train, sample_weight=weight.loc[X_train.index])
y_pred = pd.Series(et.predict(X_test_no_country), X_test.index)
test_referral_table = referral_table.loc[X_test.index]
small_referral_table = test_referral_table[test_referral_table['TimeFeature_ReferralNumber']<6]
results_big = get_weekly_scores(test_referral_table)
results_small = get_weekly_scores(small_referral_table)
print(results_big.mean())
print(results_small.mean())

  
  This is separate from the ipykernel package so we can avoid doing imports until


n         104.130435
corr        0.531337
prec10      0.605786
prec25      0.616999
prec50      0.712088
dtype: float64
n         67.217391
corr       0.342183
prec10     0.314700
prec25     0.447631
prec50     0.596016
dtype: float64
