In [1]:
import pandas as pd

# Load a dataset
df = pd.read_csv(r"C:\Project_Guvi\MDE93\healthcare_reviews.csv")

#  first few rows of the dataset
print("First Five row of Dataset")
print(df.head())

# Check for missing values
print("\nMissing Values")
print(df.isnull().sum())

# Shape of dataset
print("\nShape of Dataset:",df.shape)

#info
print("\n information of Dataset")
print(df.info())


First Five row of Dataset
                                         Review_Text  Rating
0        I have mixed feelings about my experience.        4
1  The staff was caring and attentive. I couldn't...       5
2        I have mixed feelings about my experience.        5
3        I have mixed feelings about my experience.        5
4  The healthcare provider was excellent. I had a...       3

Missing Values
Review_Text    100
Rating           0
dtype: int64

Shape of Dataset: (1000, 2)

 information of Dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review_Text  900 non-null    object
 1   Rating       1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB
None


In [3]:
# drop missing value in review_text column
df = df.dropna(subset=['Review_Text'])

# Define a function to categorize ratings into sentiment labels
def sentiment_label(rating):
    if rating >= 4:
        return '2' #Positive
    elif rating == 3:
        return '1' #neutral
    else:
        return '0' # negative

# Apply the function to create a new column 'Sentiment'
df['Sentiment'] = df['Rating'].apply(sentiment_label)


In [5]:
df['Sentiment'].value_counts()

Sentiment
2    388
0    365
1    147
Name: count, dtype: int64

In [7]:
df.to_csv(r"C:\Project_Guvi\MDE93\healthcare_with_sentimentlabel.csv",index =False)

In [7]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean and preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the Review_Text column
df['Cleaned_Review_Text'] = df['Review_Text'].apply(preprocess_text)

# Inspect the cleaned text
print(df[['Review_Text', 'Cleaned_Review_Text']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuvar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuvar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yuvar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                         Review_Text  \
0        I have mixed feelings about my experience.    
1  The staff was caring and attentive. I couldn't...   
2        I have mixed feelings about my experience.    
3        I have mixed feelings about my experience.    
4  The healthcare provider was excellent. I had a...   

                              Cleaned_Review_Text  
0                        mixed feeling experience  
1          staff caring attentive couldnt happier  
2                        mixed feeling experience  
3                        mixed feeling experience  
4  healthcare provider excellent great experience  


In [8]:
df.head()

Unnamed: 0,Review_Text,Rating,Sentiment,Cleaned_Review_Text
0,I have mixed feelings about my experience.,4,2,mixed feeling experience
1,The staff was caring and attentive. I couldn't...,5,2,staff caring attentive couldnt happier
2,I have mixed feelings about my experience.,5,2,mixed feeling experience
3,I have mixed feelings about my experience.,5,2,mixed feeling experience
4,The healthcare provider was excellent. I had a...,3,1,healthcare provider excellent great experience


In [9]:
df.to_csv(r"C:\Project_Guvi\MDE93\healthcare_cleaned_dataset.csv",index =False)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=500)

# Apply TF-IDF transformation
X = tfidf.fit_transform(df['Cleaned_Review_Text'])

# Convert to DataFrame for easier inspection
X_tfidf = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())

# Inspect the transformed data
#print(X_tfidf.head())


### Model Building

In [15]:
from sklearn.model_selection import train_test_split

# Define X and y
X = X_tfidf
y = df['Sentiment']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (630, 34)
Testing set size: (270, 34)


In [31]:
# randomforest without smote
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Initialize the Random Forest model with optimized parameters
model = RandomForestClassifier(
    n_estimators=100,            # Reduce number of trees
    max_depth=20,               # Limit the depth of trees
    min_samples_split=10,       # Minimum samples required to split an internal node
    n_jobs=-1,                  # Utilize all available cores
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)
cm = confusion_matrix(y_test,y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")
print(f"Confusion Matrix:\n{cm}")

Accuracy: 0.42592592592592593
Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.45      0.46       122
           1       1.00      0.00      0.00        34
           2       0.40      0.53      0.45       114

    accuracy                           0.43       270
   macro avg       0.62      0.33      0.30       270
weighted avg       0.50      0.43      0.40       270

Confusion Matrix:
[[55  0 67]
 [10  0 24]
 [54  0 60]]


In [17]:
# randomforest with smote
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],           # Number of trees 
    'max_depth': [10, 20, None],              # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],          # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],            # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False],               # Whether bootstrap samples are used when building trees
    'max_features': ['sqrt', 'log2', None]    # Number of features to consider for best split
}

# Initialize the Random Forest model
model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Train the model using GridSearchCV
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best estimator
best_model = grid_search.best_estimator_

# Make predictions on the original test data
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)
cm = confusion_matrix(y_test, y_pred)

# Output results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")
print(f"Confusion Matrix:\n{cm}")


Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 0.3296296296296296
Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.55      0.49       122
           1       0.12      0.35      0.18        34
           2       0.42      0.09      0.14       114

    accuracy                           0.33       270
   macro avg       0.33      0.33      0.27       270
weighted avg       0.39      0.33      0.31       270

Confusion Matrix:
[[67 44 11]
 [19 12  3]
 [64 40 10]]


In [23]:
# logisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'penalty': [ 'l2',  'none'],  # Penalty (regularization) options
    'solver': ['saga'],  # Solver that supports 'l1' and 'elasticnet'
    'max_iter': [100, 200, 500]  # Number of iterations
}

# Initialize the model
model = LogisticRegression()

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)

# Train the model using GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best estimator
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
cm=confusion_matrix(y_test,y_pred)
print(cm)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'C': 1, 'max_iter': 100, 'penalty': 'none', 'solver': 'saga'}
Accuracy: 0.42592592592592593
Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.45      0.46       122
           1       0.00      0.00      0.00        34
           2       0.40      0.53      0.45       114

    accuracy                           0.43       270
   macro avg       0.29      0.33      0.30       270
weighted avg       0.38      0.43      0.40       270

[[55  0 67]
 [10  0 24]
 [54  0 60]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
