In [None]:
# Install gdown if not already available
!pip install -q gdown

# Replace with your actual file ID
file_id = '1BSR0BK6wRsr7KfZMkowkp4pNa45pQAm1'  # <- Replace this with your file's ID
output_file = 'selected-file_2012_2017_machinereadable.csv'

# Download using gdown
!gdown --id {file_id} -O {output_file}


Downloading...
From (original): https://drive.google.com/uc?id=1BSR0BK6wRsr7KfZMkowkp4pNa45pQAm1
From (redirected): https://drive.google.com/uc?id=1BSR0BK6wRsr7KfZMkowkp4pNa45pQAm1&confirm=t&uuid=071385bd-7ddd-4a6d-8a43-9d9417a7555b
To: /content/selected-file_2012_2017_machinereadable.csv
100% 205M/205M [00:02<00:00, 92.3MB/s]


In [None]:
import pandas as pd

# Read the downloaded CSV. low_memory=False has pandas process the file as a
# whole instead of in internal chunks, avoiding mixed-type column inference.
df = pd.read_csv(filepath_or_buffer=output_file, low_memory=False)

# Print the (rows, columns) shape, then display the first rows as the cell output
print(df.shape)
df.head()


(578350, 35)


Unnamed: 0,FAC_NO,FAC_NAME,BEG_DATE,END_DATE,DAY_PER,DATA_IND,AUDIT_IND,COUNTY,HSA,HFPA,...,MCAR_PRO#,MCAL_PRO#,REG_MCAL#,BED_LIC,BED_AVL,BED_STF,Variable,Amount,WEB-SITE,ORG-NAME
0,106010735,ALAMEDA HOSPITAL,01JUL2011,30JUN2012,366,Audited,Incl. Ind. Audit Adj.,Alameda,5,417,...,05-0211,HSC00211F,ZZR00211F,161,161,161,DAY_ MCAR_TR,12330.0,,
1,106010735,ALAMEDA HOSPITAL,01JUL2011,30JUN2012,366,Audited,Incl. Ind. Audit Adj.,Alameda,5,417,...,05-0211,HSC00211F,ZZR00211F,161,161,161,DAY_MCAR_MC,885.0,,
2,106010735,ALAMEDA HOSPITAL,01JUL2011,30JUN2012,366,Audited,Incl. Ind. Audit Adj.,Alameda,5,417,...,05-0211,HSC00211F,ZZR00211F,161,161,161,DAY_MCAL_TR,13093.0,,
3,106010735,ALAMEDA HOSPITAL,01JUL2011,30JUN2012,366,Audited,Incl. Ind. Audit Adj.,Alameda,5,417,...,05-0211,HSC00211F,ZZR00211F,161,161,161,DAY_ MCAL_ MC,1786.0,,
4,106010735,ALAMEDA HOSPITAL,01JUL2011,30JUN2012,366,Audited,Incl. Ind. Audit Adj.,Alameda,5,417,...,05-0211,HSC00211F,ZZR00211F,161,161,161,DAY_CNTY,0.0,,


In [None]:
# Narrow the frame to the columns used by the rest of the notebook
key_columns = ['FAC_NAME', 'BED_LIC', 'BED_AVL', 'BED_STF', 'COUNTY', 'DAY_PER', 'Variable', 'Amount']
df = df[key_columns]

# Build a per-column null report (count and percentage of rows), keeping only
# the columns that actually contain nulls
total_rows = len(df)
null_summary = (
    df.isnull()
    .sum()
    .to_frame(name='Null Count')
    .assign(**{'Percent Missing': lambda counts: (counts['Null Count'] / total_rows) * 100})
    .loc[lambda counts: counts['Null Count'] > 0]
)
null_summary


Unnamed: 0,Null Count,Percent Missing
COUNTY,430,0.074349


In [None]:
# Remove every row that has a null in any column (only COUNTY has nulls)
df = df.dropna(axis=0, how='any')

# Report the frame's shape after the drop
print(f"Shape after dropping nulls: {df.shape}")


Shape after dropping nulls: (577920, 8)


In [None]:
# Convert beds and Amount to numeric. errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
numeric_cols = ['BED_LIC', 'BED_AVL', 'BED_STF', 'Amount']
rows_before = len(df)
df = df.assign(**{col: pd.to_numeric(df[col], errors='coerce') for col in numeric_cols})

# Drop rows that now contain NaN. The result is reassigned (no inplace=True)
# so the cell builds a new frame rather than mutating the existing one.
df = df.dropna()

# Make the data loss visible instead of discarding rows silently
print(f"Dropped {rows_before - len(df)} rows containing NaN after numeric conversion; shape is now {df.shape}")


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# One LabelEncoder per categorical column, kept under its own name so the
# fitted encoders stay available after this cell
le_fac, le_var, le_county = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Fit each encoder on its source column and store the integer codes in a new
# *_ENC column (new columns are added in the order listed here)
for encoder, source_col, encoded_col in [
    (le_fac, 'FAC_NAME', 'FAC_NAME_ENC'),
    (le_county, 'COUNTY', 'COUNTY_ENC'),
    (le_var, 'Variable', 'VAR_ENC'),
]:
    df[encoded_col] = encoder.fit_transform(df[source_col])

In [None]:
# Columns used as model inputs; y is the raw Amount column
features = ['BED_LIC', 'BED_AVL', 'BED_STF', 'DAY_PER', 'FAC_NAME_ENC', 'COUNTY_ENC', 'VAR_ENC']
X = df.loc[:, features]
y = df.loc[:, 'Amount']

# Standardize the features with StandardScaler.
# NOTE(review): the scaler is fit on every row of X in this cell, i.e. before
# any train/test split has been made.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# Import accuracy_score and classification_report
from sklearn.metrics import accuracy_score, classification_report

### Classification Model ###
# Binary target: 1 when a row's Amount is above the median of the Amount column, else 0
df['High_Demand'] = (df['Amount'] > df['Amount'].median()).astype(int)
y_class = df['High_Demand']

# Split the UNSCALED feature frame X so the scaler below never sees test rows.
# test_size=0.3 holds out 30% of rows; random_state=42 makes the split reproducible.
X_train_raw, X_test_raw, y_train_cls, y_test_cls = train_test_split(
    X, y_class, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. Fitting on all rows would leak test-set statistics into training.
scaler_cls = StandardScaler()
X_train_cls = scaler_cls.fit_transform(X_train_raw)
X_test_cls = scaler_cls.transform(X_test_raw)

# Baseline linear classifier with scikit-learn's default settings
classifier = LogisticRegression()
classifier.fit(X_train_cls, y_train_cls)
y_pred_cls = classifier.predict(X_test_cls)

# Report accuracy and per-class precision/recall/F1 on the held-out rows
print("CLASSIFICATION MODEL:")
print(f"Accuracy: {accuracy_score(y_test_cls, y_pred_cls):.2f}")
print(classification_report(y_test_cls, y_pred_cls))

CLASSIFICATION MODEL:
Accuracy: 0.60
              precision    recall  f1-score   support

           0       0.60      0.60      0.60     85480
           1       0.60      0.60      0.60     84800

    accuracy                           0.60    170280
   macro avg       0.60      0.60      0.60    170280
weighted avg       0.60      0.60      0.60    170280



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with a fixed seed on the training split
# (.fit returns the fitted estimator itself)
rf = RandomForestClassifier(random_state=42).fit(X_train_cls, y_train_cls)

# Predict labels for the held-out rows
y_pred_rf = rf.predict(X_test_cls)

# Report overall accuracy, then the per-class metrics
rf_accuracy = accuracy_score(y_test_cls, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test_cls, y_pred_rf))

Random Forest Accuracy: 0.8353946441155744
              precision    recall  f1-score   support

           0       0.84      0.83      0.83     85480
           1       0.83      0.84      0.84     84800

    accuracy                           0.84    170280
   macro avg       0.84      0.84      0.84    170280
weighted avg       0.84      0.84      0.84    170280



In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit gradient boosting with a fixed seed on the training split
# (.fit returns the fitted estimator itself)
gb_clf = GradientBoostingClassifier(random_state=42).fit(X_train_cls, y_train_cls)

# Predicted labels for the test split
y_pred_gb = gb_clf.predict(X_test_cls)

# Print the header, the overall accuracy, then the per-class report
gb_accuracy = accuracy_score(y_test_cls, y_pred_gb)
print("Gradient Boosting Classifier Results:")
print("Accuracy:", gb_accuracy)
print(classification_report(y_test_cls, y_pred_gb))

Gradient Boosting Classifier Results:
Accuracy: 0.8233027953958186
              precision    recall  f1-score   support

           0       0.85      0.79      0.82     85480
           1       0.80      0.86      0.83     84800

    accuracy                           0.82    170280
   macro avg       0.82      0.82      0.82    170280
weighted avg       0.82      0.82      0.82    170280



### Classification with Outliers: Model Comparison and Objective Alignment

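In this extended classification analysis, we retained all outliers to evaluate whether they hold predictive value in forecasting hospital efficiency categories. The target is the binary `High_Demand` label, set to 1 when a row's `Amount` is above the dataset median and 0 otherwise. Three machine learning models were evaluated: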

#### 1. *Logistic Regression*
- *Accuracy:* 60% (0.60 on the held-out test set)
- Performance was relatively poor, indicating the model struggled to find linear boundaries in this noisy, high-variance dataset.
- May be too simplistic for the classification task, especially with outliers included.

#### 2. *Random Forest Classifier*
- *Accuracy:* 83.54%
- Showed significant improvement over logistic regression.
- Demonstrated balanced performance with macro and weighted averages of *0.84* for both precision and recall.
- Its robustness to noise and outliers proved beneficial.

#### 3. *Gradient Boosting Classifier*
- *Accuracy:* 82.33%
- Closely trailed behind Random Forest but maintained competitive metrics (F1-score: 0.82–0.83).
- More sensitive to noisy data, yet still performed strongly.

### Insights:
- Including outliers resulted in *significant performance gains* compared to the earlier version without outliers, where the best accuracy was only ~60%.
- *Random Forest* emerged as the most effective classifier for this problem.
- These findings directly support our classification objective: *to accurately categorize hospitals based on resource allocation patterns*.
- The enhanced precision and recall values indicate this approach could help in early identification of underperforming facilities and resource bottlenecks.

For analysis without outliers, please refer to the earlier classification notebook titled:  
**602_Clustering_&_Classification_Model_Abhinav&Devika.ipynb**