In [1]:
import sys
import pandas as pd
import pickle
sys.path.append('../')

In [2]:
from api.utils.transformers import (FullTransformer, TrainingDataGenerator, ParseJSONToTablesTransformer)

### Generate Training Data

In [3]:
generator = TrainingDataGenerator('../../Welcome-Centre-DataCorps-Data/ClientDatabaseStructure.mdb.sqlite')
training_data = generator.get_training_data(limit=100000)

### Build your Transformer and Choose Features

In [4]:
features_to_split = []
column_schema = [
 'Referral_LivingWithPartner',
 'ReferralIssue_1',
 'ReferralIssue_2',
 'ReferralIssue_3',
 'ReferralIssue_4',
 'ReferralIssue_7',
 'ReferralIssue_8',
 'ReferralIssue_9',
 'ReferralIssue_10',
 'ReferralIssue_11',
 'ReferralIssue_12',
 'ReferralIssue_13',
 'ReferralIssue_14',
 'ReferralIssue_15',
 'ReferralIssue_16',
 'ReferralIssue_17',
 'ReferralIssue_18',
 'ReferralDocument_1',
 'ReferralDocument_2',
 'ReferralDocument_3',
 'ReferralDocument_4',
 'ReferralDocument_5',
 'ReferralBenefit_1',
 'ReferralBenefit_2',
 'ReferralBenefit_3',
 'ReferralBenefit_4',
 'ReferralBenefit_5',
 'ReferralBenefit_7',
 'ReferralBenefit_8',
 'ReferralBenefit_9',
 'ReferralBenefit_10',
 'ReferralBenefit_11',
 'ReferralBenefit_12',
 'ReferralBenefit_13',
 'ReferralBenefit_14',
 'ReferralReason_1',
 'ReferralReason_3',
 'ReferralReason_5',
 'ReferralReason_7',
 'ReferralReason_8',
 'ReferralReason_9',
 'ReferralReason_11',
 'ReferralReason_12',
 'ReferralReason_13',
 'ReferralReason_14',
 'ReferralReason_16',
 'ReferralReason_18',
 'ReferralReason_19',
 'ReferralReason_21',
 'ReferralReason_24',
 'ReferralReason_25',
 'ReferralReason_26',
 'ReferralReason_27',
 'ReferralReason_28',
 'ReferralReason_29',
 'ReferralReason_30',
 'ReferralReason_31',
 'ReferralReason_32',
 'ReferralReason_33',
 'ReferralReason_34',
 'ReferralReason_35',
 'ReferralReason_36',
 'ReferralReason_37',
 'ReferralReason_38',
 'ReferralReason_39',
 'ReferralReason_40',
 'ReferralReason_41',
 'ReferralReason_42',
 'ReferralReason_43',
 'ReferralReason_44',
 'ReferralReason_45',
 'ReferralReason_46',
 'ReferralReason_47',
 'ReferralReason_48',
 'ReferralReason_49',
 'ReferralReason_51',
 'ReferralReason_52',
 'ReferralReason_53',
 'ReferralReason_54',
 'ReferralReason_55',
 'ReferralReason_56',
 'ReferralReason_57',
 'ReferralReason_58',
 'ReferralReason_59',
 'ReferralDietaryRequirements_1',
 'ReferralDietaryRequirements_2',
 'ReferralDietaryRequirements_3',
 'ReferralDietaryRequirements_4',
 'ReferralDietaryRequirements_5',
 'ReferralDietaryRequirements_6',
 'ReferralDietaryRequirements_7',
 'ReferralDietaryRequirements_8',
 'ReferralDietaryRequirements_9',
 'ReferralDietaryRequirements_10',
 'ReferralDietaryRequirements_11',
 'ReferralDietaryRequirements_12',
 'ReferralDomesticCircumstances_1',
 'ReferralDomesticCircumstances_2',
 'ReferralDomesticCircumstances_3',
 'ReferralDomesticCircumstances_4',
 'ReferralDomesticCircumstances_5',
 'ReferralDomesticCircumstances_6',
                ]
transformer = FullTransformer([], column_schema)
X, y, referral_table = transformer.fit_transform(training_data)

X = X.fillna(0)

100%|██████████| 107/107 [00:18<00:00,  5.84it/s]


In [5]:
X_train = X[X.index < 13500]
X_test = X[X.index >= 13500]
y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]

# Build a model

In [6]:
from sklearn.ensemble import ExtraTreesRegressor

In [7]:
et = ExtraTreesRegressor(n_jobs=-1, n_estimators=500)
et.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
          oob_score=False, random_state=None, verbose=0, warm_start=False)

# Evaluate model

In [8]:
def get_scores(x, y, group, threshold=0.50):
    corr = spearmanr(x, y)[0]
    mu_a = x.groupby(group).mean()
    mu_p = y.groupby(group).mean()
    mu_a_top = mu_a[mu_a.rank(ascending=False) / len(mu_a) < threshold]
    mu_p_top = mu_p[mu_p.rank(ascending=False) / len(mu_p) < threshold]
    overlap = mu_p_top.index.isin(mu_a_top.index).mean()
    return pd.Series([corr, overlap])

# Save your model

In [9]:
from api.utils.models import TWCModel

In [10]:
model = TWCModel()
model.transformer = transformer
model.model = et
model.save('etmodel.p')

In [11]:
model2 = TWCModel()

In [12]:
model2.load('etmodel.p')

### Test your model works on json test file

In [13]:
with open('../api/twc_sample_request.json') as f:
    json_string = f.read()

In [14]:
p = ParseJSONToTablesTransformer()
example_tables = p.fit_transform(json_string)

In [17]:
model2.transformer.fit_transform(example_tables)

100%|██████████| 25/25 [00:00<00:00, 50.95it/s]


(    Referral_LivingWithPartner  ReferralIssue_1  ReferralIssue_2  \
 0                            0              NaN              NaN   
 1                            0              NaN              NaN   
 2                            0              NaN              1.0   
 3                            0              NaN              NaN   
 4                            0              NaN              NaN   
 5                            0              NaN              NaN   
 6                            0              NaN              NaN   
 7                            0              NaN              NaN   
 8                            0              NaN              NaN   
 9                            0              NaN              NaN   
 10                           0              NaN              NaN   
 11                           0              NaN              NaN   
 12                           0              NaN              NaN   
 13                           0   