# Purpose

I've been out of this data for awhile so lets first get re-acquainted with how this data is formatted. Then move on with the task of predicting hospital X for the final year Y.

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

os.getcwd()
for dirs, full_path, files in os.walk('../data'):
    csvs = [x for x in files if '.csv' in x]
    inpatient_csvs = [x for x in csvs if 'inpatient' in x.lower()]
    outpatient_csvs = [x for x in csvs if 'outpatient' in x.lower()]
    inpatient_csvs.sort()
    outpatient_csvs.sort()
    
inpatient_csvs, outpatient_csvs

(['Medicare_Provider_Charge_Inpatient_DRG100_FY2011.csv',
  'Medicare_Provider_Charge_Inpatient_DRG100_FY2012.csv',
  'Medicare_Provider_Charge_Inpatient_DRG100_FY2013.csv',
  'Medicare_Provider_Charge_Inpatient_DRGALL_FY2014.csv',
  'Medicare_Provider_Charge_Inpatient_DRGALL_FY2015.csv'],
 ['Medicare_Provider_Charge_Outpatient_APC28_CY2015.csv',
  'Medicare_Provider_Charge_Outpatient_APC30_CY2011.csv',
  'Medicare_Provider_Charge_Outpatient_APC30_CY2012.csv',
  'Medicare_Provider_Charge_Outpatient_APC30_CY2013.csv',
  'Medicare_Provider_Charge_Outpatient_APC32_CY2014.csv'])

# Feature extraction

In [2]:
# Inpatient
df = pd.read_csv('../data/'+inpatient_csvs[0])
display(df.sample(3)), df.columns

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region (HRR) Description,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments
64160,292 - HEART FAILURE & SHOCK W CC,190017,OPELOUSAS GENERAL HEALTH SYSTEM,539 EAST PRUDHOMME STREET,OPELOUSAS,LA,70570,LA - Lafayette,92,17970.75,6782.380435,6093.554348
129065,638 - DIABETES W CC,420043,UPSTATE CAROLINA MEDICAL CENTER,1530 N LIMESTONE ST,GAFFNEY,SC,29340,SC - Spartanburg,13,28542.30769,4895.384615,3761.769231
19653,189 - PULMONARY EDEMA & RESPIRATORY FAILURE,210057,SHADY GROVE ADVENTIST HOSPITAL,9901 MEDICAL CTR DR,ROCKVILLE,MD,20850,DC - Washington,34,25243.05882,23728.41176,22906.26471


(None, Index(['DRG Definition', 'Provider Id', 'Provider Name',
        'Provider Street Address', 'Provider City', 'Provider State',
        'Provider Zip Code', 'Hospital Referral Region (HRR) Description',
        'Total Discharges', 'Average Covered Charges', 'Average Total Payments',
        'Average Medicare Payments'],
       dtype='object'))

# Data description

For Inpatient data, each row is an observed hospital's avg covered charges, avg total payments and avg medicare payments for that fiscal year. My model will utilize features that consist of each of those quantitative features extracted and used as a column in the training df.

In [8]:
def inpatientFeatureExtractor(fydf):
    """Extract features for every hospital for that whole fiscal year."""
    hospitals = fydf['Provider Id'].drop_duplicates().tolist()
    all_hosp_df = pd.DataFrame()
    for hosp_id in tqdm_notebook(hospitals):
        hosp_id_values = fydf[fydf['Provider Id']==hosp_id]
        hosp_id_values.loc[:,'drg'] = hosp_id_values['DRG Definition'].str.split(" - ").str[0].astype(int)
        hosp_id_values = hosp_id_values[['drg','Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']]
        hosp_id_values.columns = ['drg','total_discharges','avg_covered_charges','avg_total_payments','avg_medicare_payments']
        drg_df = pd.DataFrame()
        for drg in hosp_id_values['drg']:
            drg_vals = hosp_id_values[hosp_id_values['drg']==drg]
            drg_df.loc[hosp_id,str(drg)+'_total_discharges_imp'] = drg_vals['total_discharges'].values[0]
            drg_df.loc[hosp_id,str(drg)+'_avg_covered_charges_imp'] = drg_vals['avg_covered_charges'].values[0]
            drg_df.loc[hosp_id,str(drg)+'_avg_total_payments_imp'] = drg_vals['avg_total_payments'].values[0]
            drg_df.loc[hosp_id,str(drg)+'_avg_medicare_payments_imp'] = drg_vals['avg_medicare_payments'].values[0]
        all_hosp_df = pd.concat([all_hosp_df,drg_df],axis=1,sort=False)
    return all_hosp_df

#inpatientFeatureExtractor(df)

In [9]:
test_extract_df = inpatientFeatureExtractor(df.iloc[:5000,:])

HBox(children=(IntProgress(value=0, max=2078), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


KeyboardInterrupt: 

In [11]:
df[["Provider Id","DRG Definition","Total Discharges","Average Covered Charges","Average Total Payments","Average Medicare Payments"]].stack()

0       Provider Id                                                     10001
        DRG Definition               039 - EXTRACRANIAL PROCEDURES W/O CC/MCC
        Total Discharges                                                   91
        Average Covered Charges                                       32963.1
        Average Total Payments                                        5777.24
        Average Medicare Payments                                     4763.74
1       Provider Id                                                     10005
        DRG Definition               039 - EXTRACRANIAL PROCEDURES W/O CC/MCC
        Total Discharges                                                   14
        Average Covered Charges                                       15131.9
        Average Total Payments                                        5787.57
        Average Medicare Payments                                     4976.71
2       Provider Id                                             