# Health Care Predictions

In [175]:
# libraries needed
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

## Import Dataset

In [176]:
# Raw-GitHub URL of the financial and utilization report CSV; passed to pd.read_csv when the data is loaded.
dataURL = 'https://raw.githubusercontent.com/CS133-DataVisualization/term-project-discombobulated-incompacitators-team-3/refs/heads/main/data/2022-23%20financial%20and%20utilization%20report.csv'

In [177]:
# Read the CSV and promote its 'index' column to the row index in one chained step.
health = pd.read_csv(dataURL).set_index(['index'])
# Bare last expression so the notebook renders the (truncated) frame.
health

Unnamed: 0_level_0,FAC_NO,FAC_NAME,YEAR_QTR,BEG_DATE,END_DATE,OP_STATUS,COUNTY_NAME,HSA,HFPA,TYPE_CNTRL,...,TOT_OUT_VIS_CC,GROS_INPAT_REV_CC,GROS_OUTPAT_REV_CC,CONTR_ADJ_CC,OTHR_DEDUCT_CC,CAP_PREM_REV_CC,NET_PAT_REV_CC,QA_FEES,QA_SUPPL_PAY,MNGD_CARE_QA_PAY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,106580996,ADVENTIST HEALTH AND RIDEOUT,20234,10/01/2023,12/31/2023,Open,Yuba,02 - Golden Empire,227,Non Profit Corp.,...,0,0,0,0,0,0,0,0,0,0
2,106150788,ADVENTIST HEALTH BAKERSFIELD,20234,10/01/2023,12/31/2023,Open,Kern,09 - Central,617,Non Profit Corp.,...,0,0,0,0,0,0,0,0,0,0
3,106171049,ADVENTIST HEALTH CLEARLAKE,20234,10/01/2023,12/31/2023,Open,Lake,01 - Northern California,115,Non Profit Corp.,...,0,0,0,0,0,0,0,0,0,0
4,106150706,ADVENTIST HEALTH DELANO,20234,10/01/2023,12/31/2023,Open,Kern,09 - Central,617,Non Profit Corp.,...,0,0,0,0,0,0,0,0,0,0
5,106190323,ADVENTIST HEALTH GLENDALE,20234,10/01/2023,12/31/2023,Open,Los Angeles,11 - Los Angeles,909,Church,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,106190857,WEST COVINA MEDICAL CENTER,20221,2022-01-01 0:00:00,2022-03-31 0:00:00,Open,Los Angeles,11 - Los Angeles,915,Investor - Corp.,...,0,0,0,0,0,0,0,0,0,0
435,106190859,WEST HILLS HOSPITAL AND MEDICAL CENTER,20221,2022-01-01 0:00:00,2022-03-31 0:00:00,Open,Los Angeles,11 - Los Angeles,905,Investor - Corp.,...,0,0,0,0,0,0,0,0,0,0
436,106190883,WHITTIER HOSPITAL MEDICAL CENTER,20221,2022-01-01 0:00:00,2022-03-31 0:00:00,Open,Los Angeles,11 - Los Angeles,919,Investor - Corp.,...,0,0,0,0,0,0,0,0,0,0
437,106571086,WOODLAND MEMORIAL HOSPITAL,20221,2022-01-01 0:00:00,2022-03-31 0:00:00,Open,Yolo,02 - Golden Empire,313,Non Profit Corp.,...,0,0,0,0,0,0,0,0,0,0


### Explore

In [178]:
# List the columns that contain at least one missing value.
health.columns[health.isnull().any(axis=0)]

Index(['TEACH_RURL'], dtype='object')

In [179]:
# Distinct values (including NaN) found in the only column with missing data.
pd.unique(health['TEACH_RURL'])

array([nan, 'Rural', 'Teaching'], dtype=object)

In [180]:
# Non-null count and dtype summary for TEACH_RURL.
health.loc[:, 'TEACH_RURL'].info()

<class 'pandas.core.series.Series'>
Index: 3506 entries, 1 to 438
Series name: TEACH_RURL
Non-Null Count  Dtype 
--------------  ----- 
752 non-null    object
dtypes: object(1)
memory usage: 54.8+ KB


In [181]:
# Summary statistics of the prediction target.
health.loc[:, 'NET_TOT'].describe()

Unnamed: 0,NET_TOT
count,3506.0
mean,91321890.0
std,157294600.0
min,-411601.0
25%,10787470.0
50%,38985060.0
75%,118034100.0
max,1868357000.0


## Data Splitting

In [182]:
# NET_TOT is the regression target; every other column is a candidate feature.
y = health['NET_TOT']
X = health.drop(columns='NET_TOT')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
)

## Pre-processing

### Feature Engineering

Take out the year and the date columns, keeping only the quarter (the last digit of `YEAR_QTR`).

In [183]:
# training ds
# YEAR_QTR is stored as YYYYQ (e.g. 20234), so the last digit is the quarter.
# A vectorised modulo gives the same values as the row-by-row .apply(lambda x: x % 10).
X_train['QTR'] = X_train['YEAR_QTR'] % 10

In [184]:
# testing ds
# YEAR_QTR is stored as YYYYQ (e.g. 20234), so the last digit is the quarter.
# A vectorised modulo gives the same values as the row-by-row .apply(lambda x: x % 10).
X_test['QTR'] = X_test['YEAR_QTR'] % 10

Convert the ZIP code to a numeric column, keeping only the part before any hyphen.

In [185]:
# Keep the part of ZIP_CODE before any '-' and store it as a number.
X_train['ZIP'] = pd.to_numeric(
    X_train['ZIP_CODE'].str.split('-').str[0]
)

In [186]:
# Keep the part of ZIP_CODE before any '-' and store it as a number.
X_test['ZIP'] = pd.to_numeric(
    X_test['ZIP_CODE'].str.split('-').str[0]
)

drop columns: FAC_NAME, YEAR_QTR, BEG_DATE, END_DATE, PHONE, ADDRESS, CEO, TEACH_RURL, CITY, ZIP_CODE, COUNTY_NAME (`index` is already the row index, so it is not dropped here)

In [187]:
# Remove the columns that will not be used as model features.
# YEAR_QTR and ZIP_CODE are superseded by the derived QTR and ZIP columns;
# TEACH_RURL is the only column with missing values.
# Rebinding the result (instead of inplace=True) avoids mutating the frame in place,
# and `columns=` already implies the axis, so the redundant `axis=1` is omitted.
X_train = X_train.drop(
    columns=[
        'FAC_NAME', 'YEAR_QTR', 'BEG_DATE', 'END_DATE',
        'PHONE', 'ADDRESS', 'CEO', 'TEACH_RURL',
        'CITY', 'ZIP_CODE', 'COUNTY_NAME',
    ]
)

In [188]:
# Remove the columns that will not be used as model features.
# YEAR_QTR and ZIP_CODE are superseded by the derived QTR and ZIP columns;
# TEACH_RURL is the only column with missing values.
# Rebinding the result (instead of inplace=True) avoids mutating the frame in place,
# and `columns=` already implies the axis, so the redundant `axis=1` is omitted.
X_test = X_test.drop(
    columns=[
        'FAC_NAME', 'YEAR_QTR', 'BEG_DATE', 'END_DATE',
        'PHONE', 'ADDRESS', 'CEO', 'TEACH_RURL',
        'CITY', 'ZIP_CODE', 'COUNTY_NAME',
    ]
)

### One-Hot Encoding

In [189]:
# training ds
# One-hot encode the categorical columns; drop_first removes the first level of
# each so the remaining dummies are not perfectly collinear.
X_train = pd.get_dummies(
    X_train,
    columns=['OP_STATUS', 'HSA', 'TYPE_CNTRL', 'TYPE_HOSP'],
    drop_first=True,
)

In [190]:
# testing ds
# Encoding the test set on its own can yield a different set (or order) of dummy
# columns than the training set: a category may be absent from one split, and
# drop_first would then drop a different baseline level. Encode every level here
# and align to the already-encoded training columns instead: levels the model
# never saw (including the training baseline) are discarded, and training
# columns missing from the test split are added as all-zero.
# Requires the training-set encoding cell to have run first.
X_test = pd.get_dummies(
    X_test,
    columns=['OP_STATUS', 'HSA', 'TYPE_CNTRL', 'TYPE_HOSP'],
).reindex(columns=X_train.columns, fill_value=0)

## Random Forest

In [191]:
# Fit a random forest on the training features.
# The forest is stochastic (bootstrap samples, feature sub-sampling), so a fixed
# random_state is needed for the scores below to be reproducible on re-run;
# the same seed as the train/test split is used.
# Assigning the result of .fit() (which returns the estimator) keeps the cell
# from echoing the estimator repr.
rf = RandomForestRegressor(random_state=6).fit(X_train, y_train)

In [192]:
# Score the fitted forest on both splits.
# The predictions are computed once and scored directly with r2_score; the
# original called rf.score(...), which re-ran predict on the same data and left
# the stored predictions unused. For a regressor, .score is the R^2 of its
# predictions, so the printed values are unchanged.
y_train_pred = rf.predict(X_train)
print("Prediction accuracy (R-square) on training data", r2_score(y_train, y_train_pred))

y_test_pred = rf.predict(X_test)
print("R-square on testing data", r2_score(y_test, y_test_pred))

Prediction accuracy (R-square) on training data 0.9984117008603982
R-square on testing data 0.9706748030294473


In [193]:
# 10-fold cross-validation of the forest on the training split, scored by R^2.
# NOTE(review): the same facility appears in several quarters, so rows from one
# hospital can land in both the fitting and validation folds — confirm whether a
# grouped split is wanted.
cv_scores = cross_val_score(
    rf,
    X_train,
    y_train,
    scoring='r2',
    cv=10,
)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.98598555 0.99362332 0.97517888 0.98770956 0.98220739 0.99089861
 0.99085981 0.977081   0.99451474 0.99292109]
Mean cross-validation score: 0.9870979947620088
