In [None]:
# NLP Model: Naive Bayes Classification

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load Dataset
df = pd.read_csv("California 2017-2022 Dataset.csv", encoding="latin1")

# Clean and Standardize Rating Column
# Work on a copy so the raw frame `df` stays untouched and this cell can be
# re-run without reloading the CSV.
df_clean = df.copy()
df_clean.columns = df_clean.columns.str.strip()
df_clean["Hospital Ratings"] = df_clean["Hospital Ratings"].str.strip()

# Map Rating Categories to Numeric Values
rating_map = {
    "Worse": 0,
    "As Expected": 1,
    "Better": 2
}
# Series.map leaves NaN for any label that is not a key of rating_map, which
# also forces the column to float until those rows are removed below.
df_clean["rating_num"] = df_clean["Hospital Ratings"].map(rating_map)

print("Original Rating Labels:", df_clean["Hospital Ratings"].unique())
print("Numeric Rating Values:", df_clean["rating_num"].unique())

# Surface labels that exist in the data but are missing from rating_map
# (e.g. a casing or spelling variant); otherwise they would be dropped
# silently together with the genuinely missing ratings.
unmapped_labels = df_clean.loc[
    df_clean["rating_num"].isna() & df_clean["Hospital Ratings"].notna(),
    "Hospital Ratings",
].unique()
if len(unmapped_labels) > 0:
    print("Unmapped Rating Labels (rows dropped):", unmapped_labels)

# Remove rows where the rating or the text feature is missing, then store the
# class codes as integers now that no NaN remains (labels 0/1/2, not 0.0/1.0/2.0).
df_clean = (
    df_clean
    .dropna(subset=["rating_num", "Type of Report"])
    .astype({"rating_num": "int64"})
)

# Define Features X and Target Y
# "Type of Report" is the text column fed to the bag-of-words model.
X = df_clean["Type of Report"]
y = df_clean["rating_num"]

# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on y so both partitions keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Bag-of-words encoding with English stop words removed. The vocabulary is
# learned from the training text only and then applied to both partitions,
# so nothing from the test set leaks into the features.
vectorizer = CountVectorizer(stop_words="english")
vectorizer.fit(X_train)
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train Multinomial Naive Bayes Classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)

# Predict on Test Set
y_pred = nb_model.predict(X_test_bow)

# Evaluate Model Performance
# NOTE(review): on the recorded run the accuracy (0.9128) is exactly the share
# of "As Expected" rows in the test set (3558 / 3898) and the other two classes
# have recall 0.00, i.e. the model predicts the majority class for every row.
# Read the per-class rows and the macro average, not the headline accuracy.
print("\nNaive Bayes Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(
    classification_report(
        y_test,
        y_pred,
        # Report every rating class by name, in rating_map order, even when
        # a class is never predicted.
        labels=list(rating_map.values()),
        target_names=list(rating_map.keys()),
        # Classes with no predicted samples have undefined precision; report
        # 0 explicitly instead of emitting UndefinedMetricWarning per metric.
        zero_division=0,
    )
)


Original Rating Labels: [nan 'As Expected' 'Better' 'Worse']
Numeric Rating Values: [nan  1.  2.  0.]

Naive Bayes Model Accuracy: 0.9127757824525398

Classification Report:

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       165
         1.0       0.91      1.00      0.95      3558
         2.0       0.00      0.00      0.00       175

    accuracy                           0.91      3898
   macro avg       0.30      0.33      0.32      3898
weighted avg       0.83      0.91      0.87      3898



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
