In [None]:
import loadAndClean
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [None]:
# Load the prepared dataset through the project-local `loadAndClean` module.
# NOTE(review): presumably returns a pandas DataFrame (later cells index it by
# column name and call .groupby on slices of it) -- confirm against the module.
X = loadAndClean.loadAndClean()

In [None]:
# Regression target: the Medicare payment column of the loaded frame.
# NOTE: X itself still contains this column, so any model fit on X must be
# given an explicit predictor subset (as the later cells do) to avoid leaking
# the target into the features.
y = X['Average Medicare Payments Num']

# 50/50 hold-out split, stratified on the DRG code. The seed is pinned so that
# "Restart Kernel -> Run All" reproduces the same split (and therefore the same
# scores) on every run.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.5, stratify=np.array(X['DRG Code']), random_state=0)

In [None]:
def crossVal(clf, X, y, cv=3):
    """Score `clf` on `cv` independent, DRG-stratified 50/50 random splits.

    Each round draws a fresh train/test split of (X, y), refits the same `clf`
    object on the training half and records the mean squared error on the
    test half. Every round's MSE is printed, followed by the average.

    Parameters
    ----------
    clf : object exposing fit(X, y) and predict(X)
    X : feature table; must contain a 'DRG Code' column, which is used as the
        stratification label
    y : target values aligned with X
    cv : number of random splits to evaluate (default 3)

    Returns
    -------
    list
        The per-round MSE values, in the order they were computed.
    """
    scores = []
    # The stratification labels do not change between rounds, so build the
    # array once instead of on every iteration.
    drg_labels = np.array(X['DRG Code'])
    for _ in range(cv):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X, y, test_size=0.5, stratify=drg_labels)
        clf.fit(X_train, y_train)
        fold_mse = mean_squared_error(y_test, clf.predict(X_test))
        scores.append(fold_mse)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(fold_mse)
    # Two spaces after the colon reproduce the output of the original
    # Python 2 statement `print "Average MSE: ", value`.
    print("Average MSE:  %s" % np.mean(scores))
    return scores

In [None]:
class baseline(object):
    """Constant predictor: always answers with the mean of the training target."""

    def __init__(self):
        # Flipped to True by fit(); predict() yields None until then.
        self.has_fit = False

    def fit(self, X_train, y_train):
        """Remember the mean of `y_train`; `X_train` is accepted but unused."""
        self.average_value = y_train.mean()
        self.has_fit = True

    def predict(self, X_test):
        """Return one copy of the stored mean per row of `X_test`.

        Returns None when fit() has not been called yet.
        """
        if not self.has_fit:
            return None
        n_rows = len(X_test)
        return np.ones((n_rows,)) * self.average_value

# Fit the constant-mean baseline on the training half and report its MSE on
# the held-out half (the last expression is the cell's displayed output).
alg = baseline()
alg.fit(X_train, y_train)
predictions = alg.predict(X_test)
mean_squared_error(y_test, predictions)

In [None]:
class grouped_baseline(object):
    """Per-DRG mean predictor.

    fit() stores the mean training cost of every 'DRG Code' value; predict()
    looks that mean up for each test row. A DRG code that never appeared in
    the training data falls back to the overall training mean instead of
    raising KeyError.
    """

    def __init__(self):
        # Flipped to True by fit(); predict() yields None until then.
        self.has_fit = False

    def fit(self, X_train, y_train):
        """Learn the mean of `y_train` for each value of X_train['DRG Code'].

        `X_train` is not modified. `y_train` is attached as a column, so a
        Series is aligned on the index and a plain sequence is taken
        positionally.
        """
        # Only the grouping column is needed; copying just that column avoids
        # duplicating the whole feature frame.
        frame = X_train[['DRG Code']].copy()
        frame['Cost'] = y_train

        # Fallback used by predict() for DRG codes absent from the training set.
        self.overall_cost = frame['Cost'].mean()

        # Average the cost for each DRG. Grouping by the scalar column name
        # (not a one-element list) keeps the dictionary keys as plain DRG
        # codes; iterating a groupby built from a length-1 list yields
        # 1-tuples on pandas >= 2.0, which would break every lookup below.
        self.drg_costs = frame.groupby('DRG Code')['Cost'].mean().to_dict()

        self.has_fit = True

    def predict(self, X_test):
        """Return the learned mean cost for each row's 'DRG Code'.

        The result has one value per row of `X_test` and keeps its index.
        Returns None when fit() has not been called yet.
        """
        if not self.has_fit:
            return None
        fallback = self.overall_cost
        return X_test['DRG Code'].apply(
            lambda code: self.drg_costs.get(code, fallback))

# Fit the per-DRG mean baseline on the training half and report its MSE on
# the held-out half. Rebinds the kernel-level names `alg` and `predictions`.
alg = grouped_baseline()
alg.fit(X_train, y_train)
predictions = alg.predict(X_test)
mean_squared_error(y_test, predictions)

In [None]:
# Random forest on location plus DRG code only; selecting `predictors`
# explicitly keeps the target column (still present in X_train) out of the
# feature matrix.
predictors = ['Latitude','Longitude','DRG Code']
# NOTE(review): no random_state is passed, so the forest (and its MSE) will
# differ between runs -- confirm whether that is intended.
alg = RandomForestRegressor(n_estimators=50, n_jobs=4, verbose=3)
alg.fit(X_train[predictors], y_train)
predictions = alg.predict(X_test[predictors])
mean_squared_error(y_test, predictions)

| Model    | MSE
| :---     | --:
|baseline  | 56,454,378.43
|grouped_bl| 8,402,783.52
|RFR       | 5,250,844.69

In [None]:
# NOTE(review): this cell does not create a model of its own. It refits
# whatever `alg` is currently bound in the kernel -- on a top-to-bottom run
# that is the RandomForestRegressor from the cell above, making this a repeat
# of that experiment. It looks like a leftover whose model-construction line
# was removed; confirm and either restore the intended model or delete the cell.
predictors = ['Latitude','Longitude','DRG Code']

alg.fit(X_train[predictors], y_train)
predictions = alg.predict(X_test[predictors])
mean_squared_error(y_test, predictions)

In [None]:
# Gradient-boosted trees on the same three predictors, scored with crossVal.
# crossVal draws its own stratified 50/50 splits from the full frame, so the
# X_train / X_test split made earlier in the notebook is not used here.
predictors = ['Latitude','Longitude','DRG Code']
alg = xgb.XGBRegressor(n_estimators=5000)
crossVal(alg,X[predictors], X['Average Medicare Payments Num'])