In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

In [2]:
train = pd.read_pickle("data/train.pkl")
test = pd.read_pickle("data/test_by_ID.pkl")

In [3]:
def calculate_pmf(df, idx):
    pmf = pd.Series(0, index=idx)
    counts = df["STATUS"].value_counts()
    pmf[counts.index] = counts / df.shape[0]
    return pmf

# get PMF by ID for training sets
statuses = np.arange(-6,2)
train_by_ID = train.groupby("ID")


In [4]:
pmfs = pd.DataFrame(
    [
        calculate_pmf(group, idx=statuses)
        for _, group in train_by_ID
    ],
    index=set(train["ID"])
)
pmfs

Unnamed: 0,-6,-5,-4,-3,-2,-1,0,1
5112591,0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.785714
5112592,0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.785714
5112593,0.0,0.0,0.0,0.0,0.000000,0.235294,0.647059,0.117647
5112594,0.0,0.0,0.0,0.0,0.000000,0.250000,0.750000,0.000000
5112597,0.0,0.0,0.0,0.0,0.000000,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...
5111201,0.0,0.0,0.0,0.0,0.000000,0.642857,0.000000,0.357143
5111203,0.0,0.0,0.0,0.0,0.000000,0.000000,1.000000,0.000000
5111204,0.0,0.0,0.0,0.0,0.000000,0.888889,0.000000,0.111111
5111205,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000


In [5]:
pmfs.index.name = "ID"
pmfs

Unnamed: 0_level_0,-6,-5,-4,-3,-2,-1,0,1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5112591,0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.785714
5112592,0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.785714
5112593,0.0,0.0,0.0,0.0,0.000000,0.235294,0.647059,0.117647
5112594,0.0,0.0,0.0,0.0,0.000000,0.250000,0.750000,0.000000
5112597,0.0,0.0,0.0,0.0,0.000000,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...
5111201,0.0,0.0,0.0,0.0,0.000000,0.642857,0.000000,0.357143
5111203,0.0,0.0,0.0,0.0,0.000000,0.000000,1.000000,0.000000
5111204,0.0,0.0,0.0,0.0,0.000000,0.888889,0.000000,0.111111
5111205,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000


In [6]:
len(set(train["ID"])), pmfs.shape

(36287, (36287, 8))

In [7]:
predictors = [
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "FLAG_MOBIL",
    "FLAG_WORK_PHONE",
    "FLAG_PHONE",
    "FLAG_EMAIL",
    "OCCUPATION_TYPE",
    "CNT_FAM_MEMBERS"
]

train_by_ID = train[["ID", "STATUS"]+predictors].drop_duplicates("ID")
train_by_ID.set_index("ID", inplace=True)
train_by_ID

Unnamed: 0_level_0,STATUS,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5143180,1,M,Y,N,0,225000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-17473,-2631,1,0,0,0,Managers,1.0
5115801,-1,F,N,Y,0,202500.0,Working,Secondary / secondary special,Single / not married,House / apartment,-10752,-590,1,1,0,0,,1.0
5115803,-1,F,N,Y,0,202500.0,Working,Secondary / secondary special,Single / not married,House / apartment,-10752,-590,1,1,0,0,,1.0
5115805,-1,F,N,Y,0,202500.0,Working,Secondary / secondary special,Single / not married,House / apartment,-10752,-590,1,1,0,0,,1.0
5115806,1,F,N,Y,0,202500.0,Working,Secondary / secondary special,Single / not married,House / apartment,-10752,-590,1,1,0,0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5118331,-2,F,N,N,7,157500.0,Working,Secondary / secondary special,Married,House / apartment,-13827,-1649,1,1,1,0,Cleaning staff,9.0
5061211,-1,M,Y,Y,14,225000.0,Working,Secondary / secondary special,Separated,House / apartment,-17754,-1689,1,0,0,0,Drivers,15.0
5061207,0,M,Y,Y,14,225000.0,Working,Secondary / secondary special,Separated,House / apartment,-17754,-1689,1,0,0,0,Drivers,15.0
5061210,-1,M,Y,Y,14,225000.0,Working,Secondary / secondary special,Separated,House / apartment,-17754,-1689,1,0,0,0,Drivers,15.0


In [13]:
dtr_byid = DecisionTreeRegressor()
dtr_byid.fit(X=pd.get_dummies(train_by_ID.loc[:, predictors]), y=pmfs)

actual_pmfs = pd.read_pickle("model_evaluation/actual_pmfs.pkl").values
pred_pmfs = dtr_byid.predict(pd.get_dummies(test[predictors]))
pred_pmfs_normed = pred_pmfs / np.sum(pred_pmfs)
model_msqe = np.sqrt(np.mean((pred_pmfs - actual_pmfs)**2, axis=0))

nullhypo_preds = pd.Series(0, index=statuses)
pcts = train_by_ID["STATUS"].value_counts() / train_by_ID.shape[0]
nullhypo_preds[pcts.index] = pcts
nullhypo_preds.sort_index(inplace=True)
nullhypo_msqe = np.sqrt(np.mean((nullhypo_preds.values - actual_pmfs)**2, axis=0))

In [14]:
model_msqe, nullhypo_msqe

(array([0.03052437, 0.00816284, 0.00986344, 0.01792789, 0.08210412,
        0.44012571, 0.39650742, 0.42741394]),
 array([0.02763189, 0.0079183 , 0.00953302, 0.01698319, 0.07643279,
        0.39024408, 0.34973879, 0.37957839]))

Regressor consistently worse than the null hypothesis when training by ID.  Let's try letting each month count as its own row instead.

In [8]:
for status in statuses:
    train[status] = 0.0
train.set_index("ID", inplace=True)

In [9]:
train.loc[:, statuses] = pmfs
train

Unnamed: 0_level_0,Unnamed: 0,MONTHS_BALANCE,STATUS,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,OCCUPATION_TYPE,CNT_FAM_MEMBERS,-6,-5,-4,-3,-2,-1,0,1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5143180,724670,-1,1,M,Y,N,0,225000.0,Commercial associate,Secondary / secondary special,...,Managers,1.0,0.0,0.0,0.0,0.0,0.000000,0.333333,0.000000,0.666667
5115801,587357,-23,-1,F,N,Y,0,202500.0,Working,Secondary / secondary special,...,,1.0,0.0,0.0,0.0,0.0,0.166667,0.666667,0.166667,0.000000
5115801,587361,-27,-1,F,N,Y,0,202500.0,Working,Secondary / secondary special,...,,1.0,0.0,0.0,0.0,0.0,0.166667,0.666667,0.166667,0.000000
5115803,587393,-43,-1,F,N,Y,0,202500.0,Working,Secondary / secondary special,...,,1.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000
5115805,587432,-38,-1,F,N,Y,0,202500.0,Working,Secondary / secondary special,...,,1.0,0.0,0.0,0.0,0.0,0.000000,0.250000,0.041667,0.708333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105054,523726,-4,-1,F,N,Y,19,112500.0,Working,Secondary / secondary special,...,Waiters/barmen staff,20.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000
5105054,523724,-2,-1,F,N,Y,19,112500.0,Working,Secondary / secondary special,...,Waiters/barmen staff,20.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000
5105054,523725,-3,-1,F,N,Y,19,112500.0,Working,Secondary / secondary special,...,Waiters/barmen staff,20.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000
5105054,523727,-5,-1,F,N,Y,19,112500.0,Working,Secondary / secondary special,...,Waiters/barmen staff,20.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000


In [10]:
train.to_pickle("data/train_w_pmfs.pkl")

In [20]:
dtr = DecisionTreeRegressor()
dtr.fit(X=pd.get_dummies(train.loc[:, predictors]), y=train.loc[:, statuses])

pred_pmfs = dtr.predict(pd.get_dummies(test[predictors]))
pred_pmfs_normed = pred_pmfs / np.sum(pred_pmfs)
actual_pmfs = pd.read_pickle("model_evaluation/actual_pmfs.pkl").values
model_msqe = np.sqrt(np.mean((pred_pmfs - actual_pmfs)**2, axis=0))

nullhypo_preds = pd.Series(0, index=statuses)
pcts = train["STATUS"].value_counts() / train.shape[0]
nullhypo_preds[pcts.index] = pcts
nullhypo_msqe = np.sqrt(np.mean((nullhypo_preds.values - actual_pmfs)**2, axis=0))

model_msqe, nullhypo_msqe

(array([0.0311273 , 0.008242  , 0.00987196, 0.01819711, 0.08342693,
        0.45568614, 0.41410282, 0.44536023]),
 array([0.0276366 , 0.00791825, 0.00953134, 0.01698369, 0.07645037,
        0.39924717, 0.34991572, 0.39188796]))

The regression model still performs worse than the null hypothesis.

Try a list of XGBoostRegressors, one for each status.

In [21]:
from xgboost import XGBRegressor

xg_regressors = {
    target_status: XGBRegressor(n_estimators=10)
    for target_status in range(-6,2)
}

for target_status, regressor in xg_regressors.items():
    print("training...")
    regressor.fit(X=pd.get_dummies(train.loc[:, predictors]), y=train.loc[:, target_status])

training...
training...
training...
training...
training...
training...
training...
training...


In [26]:
predictions = np.array([
    regressor.predict(pd.get_dummies(test[predictors]))
    for regressor in xg_regressors.values()
]).T

In [28]:
np.sqrt(np.mean((predictions - actual_pmfs)**2, axis=0))

array([0.03181428, 0.01615155, 0.01700233, 0.02216119, 0.07815231,
       0.39183402, 0.35184989, 0.38114774])

This also performs worse than the null hypothesis.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()
gbr.fit()