# Research Question: What Drives Hospital Performance? Analysing the Role of Patient Experience and Predicting Hospital Success in the CMS HVBP Score

In [2]:
# Data manipulation and visualization libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical modeling libraries
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Data Exploration

<p> I prefer to explore each dataset one by one because I think it gives me more control over the data. </p>

In [3]:
# Load the HVBP clinical-care table from its raw GitHub URL.
# NOTE(review): the preview shows the points/score columns as text such as
# "1 out of 10", so they will need parsing before any numeric use.
clinical_care_url = 'https://raw.githubusercontent.com/osoliman/Chapter6_HTM737/refs/heads/main/hvbp_clinical_care_11_07_2017.csv'
df_clinical_care = pd.read_csv(clinical_care_url)

# Preview the first rows (default: five) as the cell's rich output.
df_clinical_care.head()

Unnamed: 0,Provider Number,Hospital Name,Address,City,State,ZIP Code,County Name,MORT-30-AMI Achievement Threshold,MORT-30-AMI Benchmark,MORT-30-AMI Baseline Rate,...,MORT-30-HF Achievement Points,MORT-30-HF Improvement Points,MORT-30-HF Measure Score,MORT-30-PN Achievement Threshold,MORT-30-PN Benchmark,MORT-30-PN Baseline Rate,MORT-30-PN Performance Rate,MORT-30-PN Achievement Points,MORT-30-PN Improvement Points,MORT-30-PN Measure Score
0,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,Houston,0.850916,0.873053,0.850186,...,1 out of 10,0 out of 9,1 out of 10,0.88286,0.9079,0.877181,0.893777,4 out of 10,5 out of 9,5 out of 10
1,10005,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,Marshall,0.850916,0.873053,0.836946,...,0 out of 10,0 out of 9,0 out of 10,0.88286,0.9079,0.85852,0.851953,0 out of 10,0 out of 9,0 out of 10
2,10006,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,Lauderdale,0.850916,0.873053,0.818278,...,0 out of 10,0 out of 9,0 out of 10,0.88286,0.9079,0.848835,0.872522,0 out of 10,4 out of 9,4 out of 10
3,10007,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,AL,36467,Covington,0.850916,0.873053,0.84393,...,0 out of 10,0 out of 9,0 out of 10,0.88286,0.9079,0.836245,0.849107,0 out of 10,1 out of 9,1 out of 10
4,10011,ST VINCENT'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,AL,35235,Jefferson,0.850916,0.873053,0.83408,...,0 out of 10,0 out of 9,0 out of 10,0.88286,0.9079,0.823954,0.883768,1 out of 10,7 out of 9,7 out of 10


In [None]:
# Inspect each column's dtype and non-null count for the clinical-care table.
# Several rate/points columns are reported as `object` rather than numeric,
# so they presumably need conversion before modelling.
df_clinical_care.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2808 entries, 0 to 2807
Data columns (total 28 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Provider Number                    2808 non-null   int64  
 1   Hospital Name                      2808 non-null   object 
 2   Address                            2808 non-null   object 
 3   City                               2808 non-null   object 
 4   State                              2808 non-null   object 
 5   ZIP Code                           2808 non-null   int64  
 6   County Name                        2807 non-null   object 
 7   MORT-30-AMI Achievement Threshold  2808 non-null   float64
 8   MORT-30-AMI Benchmark              2808 non-null   float64
 9   MORT-30-AMI Baseline Rate          2808 non-null   object 
 10  MORT-30-AMI Performance Rate       2808 non-null   object 
 11  MORT-30-AMI Achievement Points     2808 non-null   objec

In [5]:
# Load the HVBP efficiency (MSPB-1) table from its raw GitHub URL.
# NOTE(review): ZIP_Code is parsed as an integer, so leading zeros are lost
# (the preview shows 3909 for York, ME). Confirm whether ZIP codes are needed
# downstream before relying on this column.
efficiency_url = 'https://raw.githubusercontent.com/osoliman/Chapter6_HTM737/refs/heads/main/hvbp_efficiency_11_07_2017.csv'
df_efficiency = pd.read_csv(efficiency_url)

# Preview the first rows (default: five) as the cell's rich output.
df_efficiency.head()

Unnamed: 0,Provider_Number,Hospital_Name,Address,City,State,ZIP_Code,County_Name,MSPB-1 Achievement Threshold,MSPB-1 Benchmark,MSPB-1 Baseline Rate,MSPB-1 Performance Rate,MSPB-1 Achievement Points,MSPB-1 Improvement Points,MSPB-1 Measure Score
0,110215,PIEDMONT FAYETTE HOSPITAL,1255 HIGHWAY 54 WEST,FAYETTEVILLE,GA,30214,Fayette,0.985777,0.832678,0.950527,0.962057,2 out of 10,0 out of 9,2 out of 10
1,230236,METRO HEALTH HOSPITAL,"5900 BYRON CENTER AVENUE, SW",WYOMING,MI,49519,Kent,0.985777,0.832678,0.921788,0.939953,3 out of 10,0 out of 9,3 out of 10
2,520019,MINISTRY SAINT MARYS HOSPITAL,2251 NORTH SHORE DR,RHINELANDER,WI,54501,Oneida,0.985777,0.832678,0.843599,0.83279,9 out of 10,9 out of 9,9 out of 10
3,340141,NEW HANOVER REGIONAL MEDICAL CENTER,2131 S 17TH ST BOX 9000,WILMINGTON,NC,28402,New Hanover,0.985777,0.832678,0.966919,0.967548,2 out of 10,0 out of 9,2 out of 10
4,200020,YORK HOSPITAL,15 HOSPITAL DRIVE,YORK,ME,3909,York,0.985777,0.832678,0.966664,0.970079,1 out of 10,0 out of 9,1 out of 10


In [6]:
# Inspect each column's dtype and non-null count for the efficiency table.
# NOTE(review): the identifier columns here use underscores (e.g. `Provider_Number`),
# whereas the clinical-care table uses spaces (`Provider Number`); keep this in
# mind if the tables are joined on these keys.
df_efficiency.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2808 entries, 0 to 2807
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Provider_Number               2808 non-null   int64  
 1   Hospital_Name                 2808 non-null   object 
 2   Address                       2808 non-null   object 
 3   City                          2808 non-null   object 
 4   State                         2808 non-null   object 
 5   ZIP_Code                      2808 non-null   int64  
 6   County_Name                   2807 non-null   object 
 7   MSPB-1 Achievement Threshold  2808 non-null   float64
 8   MSPB-1 Benchmark              2808 non-null   float64
 9   MSPB-1 Baseline Rate          2808 non-null   object 
 10  MSPB-1 Performance Rate       2808 non-null   float64
 11  MSPB-1 Achievement Points     2808 non-null   object 
 12  MSPB-1 Improvement Points     2808 non-null   object 
 13  MSP

In [7]:
# Load the HVBP HCAHPS (patient-experience survey) table from its raw GitHub URL.
# NOTE(review): as the preview shows, the points/dimension-score columns are
# text such as "2 out of 10", while the HCAHPS base and consistency scores
# appear as plain numbers.
hcahps_url = 'https://raw.githubusercontent.com/osoliman/Chapter6_HTM737/refs/heads/main/hvbp_hcahps_11_07_2017.csv'
df_hcahps = pd.read_csv(hcahps_url)

# Preview the first rows (default: five) as the cell's rich output.
df_hcahps.head()

Unnamed: 0,Provider Number,Hospital Name,Address,City,State,ZIP Code,County Name,Communication with Nurses Floor,Communication with Nurses Achievement Threshold,Communication with Nurses Benchmark,...,Overall Rating of Hospital Floor,Overall Rating of Hospital Achievement Threshold,Overall Rating of Hospital Benchmark,Overall Rating of Hospital Baseline Rate,Overall Rating of Hospital Performance Rate,Overall Rating of Hospital Achievement Points,Overall Rating of Hospital Improvement Points,Overall Rating of Hospital Dimension Score,HCAHPS Base Score,HCAHPS Consistency Score
0,240040,UNIVERSITY MEDICAL CENTER-MESABI/ MESABA CLINICS,750 EAST 34TH ST,HIBBING,MN,55746,Saint Louis,55.27,78.52,86.68,...,37.67,70.23,84.58,69.2,72.81,2 out of 10,2 out of 9,2 out of 10,29,19
1,450055,ROLLING PLAINS MEMORIAL HOSPITAL,200 E ARIZONA,SWEETWATER,TX,79556,Nolan,55.27,78.52,86.68,...,37.67,70.23,84.58,73.47,79.06,6 out of 10,5 out of 9,6 out of 10,49,20
2,50367,NORTHBAY MEDICAL CENTER,1200 B GALE WILSON BLVD,FAIRFIELD,CA,94533,Solano,55.27,78.52,86.68,...,37.67,70.23,84.58,69.92,71.6,1 out of 10,1 out of 9,1 out of 10,10,13
3,360096,EAST LIVERPOOL CITY HOSPITAL,425 WEST 5TH STREET,EAST LIVERPOOL,OH,43920,Columbiana,55.27,78.52,86.68,...,37.67,70.23,84.58,60.7,70.32,1 out of 10,4 out of 9,4 out of 10,29,16
4,230130,"BEAUMONT HOSPITAL, ROYAL OAK",3601 W THIRTEEN MILE RD,ROYAL OAK,MI,48073,Oakland,55.27,78.52,86.68,...,37.67,70.23,84.58,72.39,73.51,3 out of 10,0 out of 9,3 out of 10,7,14


In [8]:
# Inspect each column's dtype and non-null count for the HCAHPS table
# (the widest of the tables loaded so far, per the column total in the output).
df_hcahps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2808 entries, 0 to 2807
Data columns (total 73 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   Provider Number                                                          2808 non-null   int64  
 1   Hospital Name                                                            2808 non-null   object 
 2   Address                                                                  2808 non-null   object 
 3   City                                                                     2808 non-null   object 
 4   State                                                                    2808 non-null   object 
 5   ZIP Code                                                                 2808 non-null   int64  
 6   County Name                                                             

In [9]:
# Load the HVBP safety table from its raw GitHub URL.
# NOTE(review): the preview contains the placeholder string "Not Available"
# in several measure columns; these are read as ordinary text, not as NaN.
safety_url = 'https://raw.githubusercontent.com/osoliman/Chapter6_HTM737/refs/heads/main/hvbp_safety_11_07_2017.csv'
df_safety = pd.read_csv(safety_url)

# Preview the first rows (default: five) as the cell's rich output.
df_safety.head()

Unnamed: 0,Provider Number,Hospital Name,Address,City,State,ZIP Code,County Name,PSI-90 Achievement Threshold,PSI-90 Benchmark,PSI-90 Baseline Rate,...,HAI-6 Achievement Points,HAI-6 Improvement Points,HAI-6 Measure Score,PC-01 Achievement Threshold,PC-01 Benchmark,PC-01 Baseline Rate,PC-01 Performance Rate,PC-01 Achievement Points,PC-01 Improvement Points,PC-01 Measure Score
0,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,Houston,0.964542,0.709498,1.009846,...,6 out of 10,7 out of 9,7 out of 10,0.020408,0.0,0.000000,0.000000,10 out of 10,0 out of 9,10 out of 10
1,10005,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,Marshall,0.964542,0.709498,0.79833,...,9 out of 10,8 out of 9,9 out of 10,0.020408,0.0,0.295858,0.021739,0 out of 10,9 out of 9,9 out of 10
2,10006,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,Lauderdale,0.964542,0.709498,1.253606,...,5 out of 10,3 out of 9,5 out of 10,0.020408,0.0,0.027027,0.000000,10 out of 10,9 out of 9,10 out of 10
3,10007,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,AL,36467,Covington,0.964542,0.709498,1.101822,...,Not Available,Not Available,Not Available,0.020408,0.0,Not Available,Not Available,Not Available,Not Available,Not Available
4,10011,ST VINCENT'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,AL,35235,Jefferson,0.964542,0.709498,0.864237,...,3 out of 10,3 out of 9,3 out of 10,0.020408,0.0,0.058824,0.000000,10 out of 10,9 out of 9,10 out of 10


In [10]:
# Inspect each column's dtype and non-null count for the safety table.
# Non-null counts here do not reveal "Not Available" placeholders, because
# those are stored as strings in `object` columns.
df_safety.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2808 entries, 0 to 2807
Data columns (total 64 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Provider Number               2808 non-null   int64  
 1   Hospital Name                 2808 non-null   object 
 2   Address                       2808 non-null   object 
 3   City                          2808 non-null   object 
 4   State                         2808 non-null   object 
 5   ZIP Code                      2808 non-null   int64  
 6   County Name                   2807 non-null   object 
 7   PSI-90 Achievement Threshold  2808 non-null   float64
 8   PSI-90 Benchmark              2808 non-null   float64
 9   PSI-90 Baseline Rate          2808 non-null   object 
 10  PSI-90 Performance Rate       2808 non-null   object 
 11  PSI-90 Achievement Points     2808 non-null   object 
 12  PSI-90 Improvement Points     2808 non-null   object 
 13  PSI

In [11]:
# Load the HVBP Total Performance Score (TPS) table from its raw GitHub URL.
# NOTE(review): this file spells the column "Zip Code" (not "ZIP Code"), and
# the preview shows "Not Available" in some domain-score columns.
tps_url = 'https://raw.githubusercontent.com/osoliman/Chapter6_HTM737/refs/heads/main/hvbp_tps_11_07_2017.csv'
df_tps = pd.read_csv(tps_url)

# Preview the first rows (default: five) as the cell's rich output.
df_tps.head()

Unnamed: 0,Provider Number,Hospital Name,Address,City,State,Zip Code,County Name,Unweighted Normalized Clinical Care Domain Score,Weighted Normalized Clinical Care Domain Score,Unweighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Weighted Patient and Caregiver Centered Experience of Care/Care Coordination Domain Score,Unweighted Normalized Safety Domain Score,Weighted Safety Domain Score,Unweighted Normalized Efficiency and Cost Reduction Domain Score,Weighted Efficiency and Cost Reduction Domain Score,Total Performance Score
0,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,Houston,43.333333333333,10.833333333333,26.0,6.5,72.857142857143,18.214285714286,0.0,0.0,35.547619047619
1,10005,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,Marshall,16.666666666667,4.166666666667,36.0,9.0,81.428571428571,20.357142857143,0.0,0.0,33.52380952381
2,10006,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,Lauderdale,26.666666666667,6.666666666667,17.0,4.25,47.142857142857,11.785714285714,0.0,0.0,22.702380952381
3,10007,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,AL,36467,Covington,26.666666666667,8.888888888889,56.0,18.666666666667,Not Available,Not Available,0.0,0.0,27.555555555556
4,10011,ST VINCENT'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,AL,35235,Jefferson,46.666666666667,11.666666666667,25.0,6.25,42.857142857143,10.714285714286,0.0,0.0,28.630952380952


In [12]:
# Inspect each column's dtype and non-null count for the TPS table, which
# holds the domain scores and the Total Performance Score per hospital.
df_tps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2808 entries, 0 to 2807
Data columns (total 16 columns):
 #   Column                                                                                       Non-Null Count  Dtype 
---  ------                                                                                       --------------  ----- 
 0   Provider Number                                                                              2808 non-null   int64 
 1   Hospital Name                                                                                2808 non-null   object
 2   Address                                                                                      2808 non-null   object
 3   City                                                                                         2808 non-null   object
 4   State                                                                                        2808 non-null   object
 5   Zip Code                                 

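Because these datasets are almost completely "filled", we don't have to worry much about missing values: the only gap is one missing value in `County Name` in each dataset. Since that is a small problem (one row each), I'll just remove it. Note, however, that some fields contain the placeholder string "Not Available", which `.info()` counts as non-null, so these will still need handling during cleaning.
<br>
Let's think about which columns we need from these data.
<br>
Since this is a complex dataset, we will do the cleaning on the go, because we don't yet know which columns to choose.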