In [2]:
import os

# Create a directory for the Kaggle API key
!mkdir -p ~/.kaggle

# This will prompt you to upload your kaggle.json file
from google.colab import files
files.upload()

# Move the uploaded file to the correct directory
!mv kaggle.json ~/.kaggle/

# Set permissions for the kaggle.json file
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [13]:
%%writefile requirements.txt
streamlit
tensorflow
numpy
beautifulsoup4

Writing requirements.txt


In [15]:
%%writefile app.py
import streamlit as st
import tensorflow as tf
import numpy as np
import pickle
import re
from bs4 import BeautifulSoup
from tensorflow.keras.preprocessing.sequence import pad_sequences # Import pad_sequences

# Load the trained model
try:
    model = tf.keras.models.load_model('sentiment_model.keras')
    # Re-compile the model after loading
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
except Exception as e:
    st.error(f"Error loading model: {e}")
    st.stop()

# Load the tokenizer
try:
    with open('tokenizer.pkl', 'rb') as f:
        tokenizer = pickle.load(f)
except FileNotFoundError:
    st.error("Error: tokenizer.pkl not found. Please ensure it's in the same directory as the app.")
    st.stop()
except Exception as e:
    st.error(f"Error loading tokenizer: {e}")
    st.stop()

# Load the maxlen
try:
    with open('maxlen.txt', 'r') as f:
        maxlen = int(f.read())
except FileNotFoundError:
    st.error("Error: maxlen.txt not found. Please ensure it's in the same directory as the app.")
    st.stop()
except Exception as e:
    st.error(f"Error loading maxlen: {e}")
    st.stop()


# Store the class names
class_names = ['Negative', 'Positive']

# Re-define the exact same text-cleaning function
def clean_text(text):
    # a) uses BeautifulSoup to remove any HTML tags
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    # b) converts all text to lowercase
    text = text.lower()
    # c) removes all non-alphabetic characters (but keeps spaces)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Streamlit app title
st.title("Hospital Review Sentiment Analyzer")

# Text area for user input
review_text = st.text_area("Enter a hospital review to analyze:")

# Predict button
if st.button("Predict"):
    if review_text:
        # Get the text, clean it, and put it in a list
        cleaned_text = clean_text(review_text)
        text_list = [cleaned_text]

        # Use the loaded tokenizer to convert this list to a sequence
        sequence = tokenizer.texts_to_sequences(text_list)

        # Use pad_sequences to pad the sequence to maxlen
        padded_sequence = pad_sequences(sequence, maxlen=maxlen, padding='post', truncating='post')

        # Get the model.predict() probability
        prediction = model.predict(padded_sequence)[0][0]

        # Determine the sentiment class
        predicted_class_index = 1 if prediction > 0.5 else 0
        predicted_sentiment = class_names[predicted_class_index]

        # Display the result
        if predicted_sentiment == 'Positive':
            st.success(f"Predicted Sentiment: {predicted_sentiment} (Confidence: {prediction:.2f})")
        else:
            st.error(f"Predicted Sentiment: {predicted_sentiment} (Confidence: {1 - prediction:.2f})")
    else:
        st.warning("Please enter a review to analyze.")

Overwriting app.py


In [11]:
# Save the model
model.save('sentiment_model.keras')
print("Model saved to sentiment_model.keras")

Model saved to sentiment_model.keras


In [9]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_padded, y_train, epochs=10, validation_data=(X_test_padded, y_test))

Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 83ms/step - accuracy: 0.6592 - loss: 0.6345 - val_accuracy: 0.7100 - val_loss: 0.6029
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 114ms/step - accuracy: 0.7581 - loss: 0.5555 - val_accuracy: 0.7100 - val_loss: 0.6011
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 93ms/step - accuracy: 0.7548 - loss: 0.5626 - val_accuracy: 0.7100 - val_loss: 0.6019
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 79ms/step - accuracy: 0.7226 - loss: 0.5914 - val_accuracy: 0.7100 - val_loss: 0.6053
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 70ms/step - accuracy: 0.7377 - loss: 0.5772 - val_accuracy: 0.7100 - val_loss: 0.6029
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 71ms/step - accuracy: 0.7381 - loss: 0.5772 - val_accuracy: 0.7100 - val_loss: 0.6015
Epoch 7/10
[1m25/25[0m [32m━━━

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Print the model summary
model.summary()

In [6]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np

# Define vocab_size and maxlen
vocab_size = 5000
maxlen = 150

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Tokenizer object
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")

# Fit the Tokenizer on the training reviews
tokenizer.fit_on_texts(X_train)

# Convert training and testing reviews to number sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad both sets of sequences
X_train_padded = pad_sequences(X_train_sequences, maxlen=maxlen, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=maxlen, padding='post', truncating='post')

# Save the Tokenizer
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Save the Max Length
with open('maxlen.txt', 'w') as f:
    f.write(str(maxlen))

print("Data tokenized and padded successfully.")
print("Tokenizer saved to tokenizer.pkl")
print("Max length saved to maxlen.txt")

Data tokenized and padded successfully.
Tokenizer saved to tokenizer.pkl
Max length saved to maxlen.txt


In [5]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import os

# Assuming the dataset was downloaded to this path as shown in the previous cell's output
dataset_path = '/root/.cache/kagglehub/datasets/junaid6731/hospital-reviews-dataset/versions/1'
csv_file_path = os.path.join(dataset_path, 'hospital.csv')

# Load the hospital.csv file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Drop any rows where the 'Feedback' or 'Sentiment Labels' columns are missing.
df.dropna(subset=['Feedback', 'Sentiment Label'], inplace=True)

# Define a text-cleaning function
def clean_text(text):
    # a) uses BeautifulSoup to remove any HTML tags
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    # b) converts all text to lowercase
    text = text.lower()
    # c) removes all non-alphabetic characters (but keeps spaces)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Apply this cleaning function to the 'Feedback' column
df['cleaned_feedback'] = df['Feedback'].apply(clean_text)

# Create our final X (the cleaned 'Feedback' column) and y (the 'Sentiment Labels' column).
X = df['cleaned_feedback']
y = df['Sentiment Label']

# Print the value counts for y to see the class distribution.
print("Value counts for Sentiment Labels:")
print(y.value_counts())

Value counts for Sentiment Labels:
Sentiment Label
1    728
0    268
Name: count, dtype: int64


In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("junaid6731/hospital-reviews-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/junaid6731/hospital-reviews-dataset?dataset_version_number=1...


100%|██████████| 39.6k/39.6k [00:00<00:00, 36.5MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/junaid6731/hospital-reviews-dataset/versions/1





In [1]:
!pip install tensorflow keras pandas numpy kaggle scikit-learn beautifulsoup4 pickle5

Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pickle5
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for pickle5 (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for pickle5[0m[31m
[0m[?25h  Running setup.py clean for pickle5
Failed to build pickle5
[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (pickle5)[0m[31m
[0m