In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score, classification_report, r2_score
import pandas as pd
from pathlib import Path
import tensorflow as tf

In [2]:
# Upload "fileclean.csv"

data = Path('sample_data/fileclean.csv')

df_clean = pd.read_csv(data)

# Handling missing values
df_clean = df_clean.dropna()

In [3]:
# Encoding categorical variables
label_encoders = {}
categorical_columns = ['Ownership Type', 'Provider Type', 'County/Parish']

for col in categorical_columns:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

In [4]:
# df_clean['Overall Rating'] = df_clean['Overall Rating'].astype(int)
df_clean.head()

Unnamed: 0,Overall Rating,Ownership Type,Provider Type,Number of Certified Beds,Average Number of Residents per Day,Reported Total Nurse Staffing Hours per Resident per Day,Health Inspection Rating,Number of Substantiated Complaints,Number of Fines,Total Amount of Fines in Dollars,County/Parish,Total Number of Penalties
0,2.0,0,2,57,50.0,4.37137,2.0,0,2,24644.14,534,2
1,4.0,0,2,85,76.9,4.25182,4.0,0,0,0.0,1456,0
2,4.0,6,2,50,45.1,4.59373,4.0,0,0,0.0,738,0
3,2.0,1,2,92,76.9,3.01781,3.0,0,0,0.0,743,0
4,2.0,1,2,103,86.6,3.49256,2.0,1,0,0.0,743,0


In [5]:
# Step 1: Rounding the "Overall Rating" to the nearest integer
df_clean['Reported Total Nurse Staffing Hours per Resident per Day'] = df_clean['Reported Total Nurse Staffing Hours per Resident per Day'].round()

# Step 2: Convert the rounded ratings to integer (optional, but often needed)
df_clean['Reported Total Nurse Staffing Hours per Resident per Day'] = df_clean['Reported Total Nurse Staffing Hours per Resident per Day'].astype(int)

# Step 1: Rounding the "Overall Rating" to the nearest integer
df_clean['Average Number of Residents per Day'] = df_clean['Average Number of Residents per Day'].round()

# Step 2: Convert the rounded ratings to integer (optional, but often needed)
df_clean['Average Number of Residents per Day'] = df_clean['Average Number of Residents per Day'].astype(int)


In [6]:
df_clean.head()

Unnamed: 0,Overall Rating,Ownership Type,Provider Type,Number of Certified Beds,Average Number of Residents per Day,Reported Total Nurse Staffing Hours per Resident per Day,Health Inspection Rating,Number of Substantiated Complaints,Number of Fines,Total Amount of Fines in Dollars,County/Parish,Total Number of Penalties
0,2.0,0,2,57,50,4,2.0,0,2,24644.14,534,2
1,4.0,0,2,85,77,4,4.0,0,0,0.0,1456,0
2,4.0,6,2,50,45,5,4.0,0,0,0.0,738,0
3,2.0,1,2,92,77,3,3.0,0,0,0.0,743,0
4,2.0,1,2,103,87,3,2.0,1,0,0.0,743,0


In [7]:
df_clean.dtypes

Unnamed: 0,0
Overall Rating,float64
Ownership Type,int64
Provider Type,int64
Number of Certified Beds,int64
Average Number of Residents per Day,int64
Reported Total Nurse Staffing Hours per Resident per Day,int64
Health Inspection Rating,float64
Number of Substantiated Complaints,int64
Number of Fines,int64
Total Amount of Fines in Dollars,float64


In [8]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
print(df_clean['Overall Rating'].value_counts())

Overall Rating
1.0    3326
2.0    3091
3.0    2794
5.0    2714
4.0    2475
Name: count, dtype: int64


In [9]:
df_clean['Overall Rating'] = df_clean['Overall Rating'].replace([2, 4], 3)

df_clean['Overall Rating'].value_counts()

Unnamed: 0_level_0,count
Overall Rating,Unnamed: 1_level_1
3.0,8360
1.0,3326
5.0,2714


In [10]:
# Defining the features (X) and target (y)
X = df_clean.drop('Overall Rating', axis=1)
y = df_clean['Overall Rating']

In [11]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Scaling numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Training the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

In [14]:
# Making predictions
y_pred = model.predict(X_test_scaled)

In [22]:
# # Evaluating the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# mse, r2

(0.38455361111111114, 0.8155755420130001)

In [30]:
# df_clean.head()

Unnamed: 0,Overall Rating,Ownership Type,Provider Type,Number of Certified Beds,Average Number of Residents per Day,Reported Total Nurse Staffing Hours per Resident per Day,Health Inspection Rating,Number of Substantiated Complaints,Number of Fines,Total Amount of Fines in Dollars,County/Parish,Total Number of Penalties
0,2.0,0,2,57,50.0,4,2.0,0,2,24644.14,534,2
1,4.0,0,2,85,76.9,4,4.0,0,0,0.0,1456,0
2,4.0,6,2,50,45.1,5,4.0,0,0,0.0,738,0
3,2.0,1,2,92,76.9,3,3.0,0,0,0.0,743,0
4,2.0,1,2,103,86.6,3,2.0,1,0,0.0,743,0


In [None]:
# # Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# input_features = 11
# hidden_layer_units_1 =  80
# hidden_layer_units_2 = 30

# nn = tf.keras.models.Sequential()

# # First hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_layer_units_1, activation="relu", input_dim=input_features))

# # Second hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_layer_units_2, activation="relu"))

# # Output layer
# nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# # Check the structure of the model
# nn.summary()

In [None]:
# # Compile the model
# nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# # Train the model
# fit_model = nn.fit(X_train_scaled, y_train, epochs=20)

In [None]:
# # Evaluate the model using the test data
# model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [15]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [16]:
#Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [17]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [18]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2"],
    columns=["Predicted 0", "Predicted 1", "Predicted 2"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [19]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,494,177,0
Actual 1,123,1442,83
Actual 2,0,205,356


Accuracy Score : 0.7958333333333333
Classification Report
              precision    recall  f1-score   support

         1.0       0.80      0.74      0.77       671
         3.0       0.79      0.88      0.83      1648
         5.0       0.81      0.63      0.71       561

    accuracy                           0.80      2880
   macro avg       0.80      0.75      0.77      2880
weighted avg       0.80      0.80      0.79      2880



In [21]:
# Visualize the features by importance
df_clean = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
importances_df.set_index(importances_df[1], inplace=True)
importances_df.drop(columns=1, inplace=True)
importances_df.rename(columns={0: 'Feature Importances'}, inplace=True)
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)

NameError: name 'importances_df' is not defined

In [22]:
# Determine the number of unique values in each column.
print(df_clean.nunique())

0    11
1    11
dtype: int64


In [23]:
print(df_clean['Average Number of Residents per Day'].value_counts())
print(df_clean['Average Number of Residents per Day'].describe())

KeyError: 'Average Number of Residents per Day'

In [None]:
print(df_clean['Number of Certified Beds'].value_counts())
print(df_clean['Number of Certified Beds'].describe())

Number of Certified Beds
120    1106
60      612
99      394
100     367
90      313
       ... 
455       1
369       1
334       1
406       1
307       1
Name: count, Length: 385, dtype: int64
count    14400.000000
mean       107.311181
std         58.926652
min          4.000000
25%         66.000000
50%        100.000000
75%        128.000000
max        843.000000
Name: Number of Certified Beds, dtype: float64


In [None]:
print(df_clean['Reported Total Nurse Staffing Hours per Resident per Day'].value_counts())
print(df_clean['Reported Total Nurse Staffing Hours per Resident per Day'].describe())

Reported Total Nurse Staffing Hours per Resident per Day
3.67755    3
3.54534    3
3.73422    3
3.35894    3
3.29255    3
          ..
7.06658    1
7.25160    1
3.07186    1
3.68329    1
5.87710    1
Name: count, Length: 13945, dtype: int64
count    14400.000000
mean         3.797659
std          0.945666
min          0.025110
25%          3.247630
50%          3.624990
75%          4.130933
max         15.663700
Name: Reported Total Nurse Staffing Hours per Resident per Day, dtype: float64
