In [1]:
# Import the required modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Step 1: Read in the dataset about the current customers of the startup.

In [2]:
# Read the usage_stats.csv file from the Resources folder into a Pandas DataFrame
df = pd.read_csv(
    Path("../Project-4/NH_ProviderInfo_Aug2024.csv")
)

# Review the DataFrame
display(df.head())
display(df.tail())

Unnamed: 0,CMS Certification Number (CCN),Provider Name,Provider Address,City/Town,State,ZIP Code,Telephone Number,Provider SSA County Code,County/Parish,Ownership Type,...,Number of Citations from Infection Control Inspections,Number of Fines,Total Amount of Fines in Dollars,Number of Payment Denials,Total Number of Penalties,Location,Latitude,Longitude,Geocoding Footnote,Processing Date
0,15009,"BURNS NURSING HOME, INC.",701 MONROE STREET NW,RUSSELLVILLE,AL,35653,2563324110,290,Franklin,For profit - Corporation,...,,2,24644.14,0,2,"701 MONROE STREET NW,RUSSELLVILLE,AL,35653",34.5149,-87.736,,2024-08-01
1,15010,COOSA VALLEY HEALTHCARE CENTER,260 WEST WALNUT STREET,SYLACAUGA,AL,35150,2562495604,600,Talladega,For profit - Corporation,...,0.0,0,0.0,0,0,"260 WEST WALNUT STREET,SYLACAUGA,AL,35150",33.1637,-86.254,,2024-08-01
2,15012,HIGHLANDS HEALTH AND REHAB,380 WOODS COVE ROAD,SCOTTSBORO,AL,35768,2562183708,350,Jackson,Government - County,...,,0,0.0,0,0,"380 WOODS COVE ROAD,SCOTTSBORO,AL,35768",34.6611,-86.047,,2024-08-01
3,15014,EASTVIEW REHABILITATION & HEALTHCARE CENTER,7755 FOURTH AVENUE SOUTH,BIRMINGHAM,AL,35206,2058330146,360,Jefferson,For profit - Individual,...,0.0,0,0.0,0,0,"7755 FOURTH AVENUE SOUTH,BIRMINGHAM,AL,35206",33.5595,-86.722,,2024-08-01
4,15015,PLANTATION MANOR NURSING HOME,6450 OLD TUSCALOOSA HIGHWAY,MC CALLA,AL,35111,2054776161,360,Jefferson,For profit - Individual,...,,0,0.0,0,0,"6450 OLD TUSCALOOSA HIGHWAY,MC CALLA,AL,35111",33.3221,-87.034,,2024-08-01


Unnamed: 0,CMS Certification Number (CCN),Provider Name,Provider Address,City/Town,State,ZIP Code,Telephone Number,Provider SSA County Code,County/Parish,Ownership Type,...,Number of Citations from Infection Control Inspections,Number of Fines,Total Amount of Fines in Dollars,Number of Payment Denials,Total Number of Penalties,Location,Latitude,Longitude,Geocoding Footnote,Processing Date
14820,745021,LINDALE SPECIALTY CARE CENTER,13905 FM 2710,LINDALE,TX,75771,4302602300,892,Smith,For profit - Limited Liability company,...,,3,367971.5,0,3,"13905 FM 2710,LINDALE,TX,75771",32.5191,-95.398,,2024-08-01
14821,745022,WARE MEMORIAL CARE CENTER,1510 S. VAN BUREN ST.,AMARILLO,TX,79101,8063730471,860,Potter,For profit - Corporation,...,,0,0.0,0,0,"1510 S. VAN BUREN ST.,AMARILLO,TX,79101",35.1987,-101.842,,2024-08-01
14822,745038,TIERRA ESTE NURSING AND REHABILITATION CENTER,14300 PEBBLE HILLS BLVD,EL PASO,TX,79938,9159559998,480,El Paso,For profit - Corporation,...,,7,31595.67,0,7,"14300 PEBBLE HILLS BLVD,EL PASO,TX,79938",31.7822,-106.23,,2024-08-01
14823,745039,MIDTOWNE MEADOWS HEALTH AND REHAB,110 DYLAN WAY,MIDLOTHIAN,TX,76065,1111,470,Ellis,For profit - Limited Liability company,...,,5,20013.32,0,5,"110 DYLAN WAY,MIDLOTHIAN,TX,76065",32.4783,-96.982,22.0,2024-08-01
14824,745040,THE SARAH ROBERTS FRENCH HOME,1315 TEXAS AVE,SAN ANTONIO,TX,78201,2107364238,130,Bexar,Non profit - Corporation,...,,7,35733.13,0,7,"1315 TEXAS AVE,SAN ANTONIO,TX,78201",29.4494,-98.534,,2024-08-01


In [3]:
# Create new dataframe with only necessary columns
df_clean = df[['Overall Rating',
               'Ownership Type',
               'Provider Type',
               'Number of Certified Beds',
               'Average Number of Residents per Day',
               'Adjusted Total Nurse Staffing Hours per Resident per Day',
               'Adjusted RN Staffing Hours per Resident per Day',
               'Adjusted Nurse Aide Staffing Hours per Resident per Day',
               'Adjusted LPN Staffing Hours per Resident per Day',
               'Health Inspection Rating',
               'Number of Citations from Infection Control Inspections',
               'Number of Fines',
               'Total Amount of Fines in Dollars',
               'Total Number of Penalties']]
                    

display(df_clean.head())

Unnamed: 0,Overall Rating,Ownership Type,Provider Type,Number of Certified Beds,Average Number of Residents per Day,Adjusted Total Nurse Staffing Hours per Resident per Day,Adjusted RN Staffing Hours per Resident per Day,Adjusted Nurse Aide Staffing Hours per Resident per Day,Adjusted LPN Staffing Hours per Resident per Day,Health Inspection Rating,Number of Citations from Infection Control Inspections,Number of Fines,Total Amount of Fines in Dollars,Total Number of Penalties
0,2.0,For profit - Corporation,Medicare and Medicaid,57,50.0,4.35288,1.317,2.60798,0.42789,2.0,,2,24644.14,2
1,4.0,For profit - Corporation,Medicare and Medicaid,85,76.9,4.34347,0.89213,2.60828,0.84306,4.0,0.0,0,0.0,0
2,4.0,Government - County,Medicare and Medicaid,50,45.1,4.62367,1.08882,2.93756,0.59729,4.0,,0,0.0,0
3,2.0,For profit - Individual,Medicare and Medicaid,92,76.9,3.23306,0.59689,1.76798,0.86819,3.0,0.0,0,0.0,0
4,2.0,For profit - Individual,Medicare and Medicaid,103,86.6,4.22637,0.50214,2.63787,1.08637,2.0,,0,0.0,0


In [4]:
# Remove NaN Rows
df_cleaned = df_clean.dropna()

In [5]:
# Column Names to Use
column_names = ['Overall Rating',
                'Number of Certified Beds',
                'Average Number of Residents per Day',
                'Adjusted Total Nurse Staffing Hours per Resident per Day',
                'Adjusted RN Staffing Hours per Resident per Day',
                'Adjusted Nurse Aide Staffing Hours per Resident per Day',
                'Adjusted LPN Staffing Hours per Resident per Day',
                'Health Inspection Rating',
                'Number of Citations from Infection Control Inspections',
                'Number of Fines',
                'Total Amount of Fines in Dollars',
                'Total Number of Penalties']

# # Scaling the numeric columns
# data_scaled = StandardScaler().fit_transform(df_cleaned[column_names])

# # Creating a DataFrame with the scaled data
# df_scaled_transformed = pd.DataFrame(data_scaled, columns=column_names)

# # Display sample data
# df_scaled_transformed.head()

In [6]:
df_numerical = df_cleaned[column_names]

df_numerical.head()

Unnamed: 0,Overall Rating,Number of Certified Beds,Average Number of Residents per Day,Adjusted Total Nurse Staffing Hours per Resident per Day,Adjusted RN Staffing Hours per Resident per Day,Adjusted Nurse Aide Staffing Hours per Resident per Day,Adjusted LPN Staffing Hours per Resident per Day,Health Inspection Rating,Number of Citations from Infection Control Inspections,Number of Fines,Total Amount of Fines in Dollars,Total Number of Penalties
1,4.0,85,76.9,4.34347,0.89213,2.60828,0.84306,4.0,0.0,0,0.0,0
3,2.0,92,76.9,3.23306,0.59689,1.76798,0.86819,3.0,0.0,0,0.0,0
9,1.0,121,116.5,3.96928,0.26906,2.84637,0.85385,1.0,0.0,0,0.0,0
12,2.0,154,128.5,3.6145,0.52457,2.30828,0.78165,1.0,0.0,1,10065.25,1
13,3.0,78,75.3,4.32665,0.57203,3.09326,0.66137,3.0,0.0,0,0.0,0


In [7]:
#Transform Catagorical Columns
df_owner = pd.get_dummies(df_cleaned["Ownership Type"], dtype=np.int64)

df_owner.head()

Unnamed: 0,For profit - Corporation,For profit - Individual,For profit - Limited Liability company,For profit - Partnership,Government - City,Government - City/county,Government - County,Government - Federal,Government - Hospital district,Government - State,Non profit - Church related,Non profit - Corporation,Non profit - Other
1,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0
12,1,0,0,0,0,0,0,0,0,0,0,0,0
13,0,0,1,0,0,0,0,0,0,0,0,0,0


In [8]:
#Transform Catagorical Columns
df_provider = pd.get_dummies(df_cleaned["Provider Type"], dtype=np.int64)

df_provider.head()

Unnamed: 0,Medicaid,Medicare,Medicare and Medicaid
1,0,0,1
3,0,0,1
9,0,0,1
12,0,0,1
13,0,0,1


In [9]:
# Concatenate Numerical and Categorical Columns
df_transformed = pd.concat([df_numerical, df_owner, df_provider], axis=1)

# Display Table
df_transformed.head()

Unnamed: 0,Overall Rating,Number of Certified Beds,Average Number of Residents per Day,Adjusted Total Nurse Staffing Hours per Resident per Day,Adjusted RN Staffing Hours per Resident per Day,Adjusted Nurse Aide Staffing Hours per Resident per Day,Adjusted LPN Staffing Hours per Resident per Day,Health Inspection Rating,Number of Citations from Infection Control Inspections,Number of Fines,...,Government - County,Government - Federal,Government - Hospital district,Government - State,Non profit - Church related,Non profit - Corporation,Non profit - Other,Medicaid,Medicare,Medicare and Medicaid
1,4.0,85,76.9,4.34347,0.89213,2.60828,0.84306,4.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1
3,2.0,92,76.9,3.23306,0.59689,1.76798,0.86819,3.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1
9,1.0,121,116.5,3.96928,0.26906,2.84637,0.85385,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1
12,2.0,154,128.5,3.6145,0.52457,2.30828,0.78165,1.0,0.0,1,...,0,0,0,0,0,0,0,0,0,1
13,3.0,78,75.3,4.32665,0.57203,3.09326,0.66137,3.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Remove NaN Rows
df_transformed_drop = df_transformed.dropna()

df_transformed_drop.head()

Unnamed: 0,Overall Rating,Number of Certified Beds,Average Number of Residents per Day,Adjusted Total Nurse Staffing Hours per Resident per Day,Adjusted RN Staffing Hours per Resident per Day,Adjusted Nurse Aide Staffing Hours per Resident per Day,Adjusted LPN Staffing Hours per Resident per Day,Health Inspection Rating,Number of Citations from Infection Control Inspections,Number of Fines,...,Government - County,Government - Federal,Government - Hospital district,Government - State,Non profit - Church related,Non profit - Corporation,Non profit - Other,Medicaid,Medicare,Medicare and Medicaid
1,4.0,85,76.9,4.34347,0.89213,2.60828,0.84306,4.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1
3,2.0,92,76.9,3.23306,0.59689,1.76798,0.86819,3.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1
9,1.0,121,116.5,3.96928,0.26906,2.84637,0.85385,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1
12,2.0,154,128.5,3.6145,0.52457,2.30828,0.78165,1.0,0.0,1,...,0,0,0,0,0,0,0,0,0,1
13,3.0,78,75.3,4.32665,0.57203,3.09326,0.66137,3.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1


## Step 2: Split the data into X and y and then into testing and training sets.

In [11]:
# Split the data into X (features) and y (target)
# The y variable should focus on the target column
y = df_transformed_drop['Overall Rating']

# The X variable should include all features except the target
X = df_transformed_drop.drop(columns=['Overall Rating'])


In [12]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Step 3: Fit a logistic regression classifier.

In [13]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(solver='saga', max_iter=1000)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)



## Step 4: Create the predicted values for the testing and the training data.

In [14]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)


## Step 5: Print a confusion matrix for the training data.

In [15]:
# Import the model for sklearn confusion matrix
from sklearn.metrics import confusion_matrix

# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[1408   33  154    0    0]
 [1023   53  300    0    0]
 [ 657   64  436    0    0]
 [ 409   64  479    0    0]
 [ 234   79  565    0    0]]


## Step 6: Print a confusion matrix for the testing data.

In [16]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[485  11  38   0   0]
 [360  21 108   0   0]
 [238  19 133   0   0]
 [141  21 128   0   0]
 [ 79  22 182   0   0]]


## Step 7: Print the training classification report.

In [17]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

         1.0       0.38      0.88      0.53      1595
         2.0       0.18      0.04      0.06      1376
         3.0       0.23      0.38      0.28      1157
         4.0       0.00      0.00      0.00       952
         5.0       0.00      0.00      0.00       878

    accuracy                           0.32      5958
   macro avg       0.16      0.26      0.17      5958
weighted avg       0.19      0.32      0.21      5958



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Step 8: Print the testing classification report.

In [18]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

         1.0       0.37      0.91      0.53       534
         2.0       0.22      0.04      0.07       489
         3.0       0.23      0.34      0.27       390
         4.0       0.00      0.00      0.00       290
         5.0       0.00      0.00      0.00       283

    accuracy                           0.32      1986
   macro avg       0.16      0.26      0.17      1986
weighted avg       0.20      0.32      0.21      1986



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Step 9: Answer the following question

> **Question**: How does the performance of the training and test dataset compare?

> **Sample Answer**: Looking at the two classification reports for the training and test data, it looks as if model performance declined--albeit slightly--on the test data. This is to be expected: this is how well the model is performing on data that the model hasn't seen before. If we're still getting strong precision and recall on the test dataset, this is a good indication about how well the model is likely to perform in real life.