In [2]:
pip install spacy




In [3]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install seaborn




In [5]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
df = pd.read_csv("C:\\Users\\ADMIN\\New folder\\healthcare_reviews (1).csv")

In [7]:
df.isnull().sum()

Review_Text    100
Rating           0
dtype: int64

In [8]:
most_frequent_sentence = df['Review_Text'].mode()[0]
df['Review_Text'].fillna(most_frequent_sentence, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Review_Text'].fillna(most_frequent_sentence, inplace=True)


In [9]:
df.isnull().sum()

Review_Text    0
Rating         0
dtype: int64

In [10]:
# Load the English language model in spaCy
nlp = spacy.load('en_core_web_sm')

# Function to process text using spaCy
def process_text(text):
    # Process the text using spaCy
    doc = nlp(text)
    # Remove stop words, punctuation, and convert to lowercase
    # Lemmatize each token and join them back into a sentence
    lemmatized_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.text.lower()]
    return lemmatized_tokens

# Apply text processing to the Sentences column
df['processed_text'] = df['Review_Text'].apply(process_text)

In [11]:
def analyze_sentiment(text):
    
    positive_words = ['good', 'great', 'excellent','happy','satisfied']
    negative_words = ['bad', 'terrible', 'awful','disappointing']
    
    tokens = process_text(text)
    positive_count = sum(1 for word in tokens if word in positive_words)
    negative_count = sum(1 for word in tokens if word in negative_words)
    
    if positive_count > negative_count:
        return 'positive'
    elif negative_count > positive_count:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment analysis to DataFrame
df['sentiment'] = df['Review_Text'].apply(analyze_sentiment)

In [12]:
df['tokenized_text_str'] = df['processed_text'].apply(lambda tokens: ' '.join(tokens))


In [13]:
df

Unnamed: 0,Review_Text,Rating,processed_text,sentiment,tokenized_text_str
0,I have mixed feelings about my experience.,4,"[mixed, feeling, experience]",neutral,mixed feeling experience
1,The staff was caring and attentive. I couldn't...,5,"[staff, care, attentive, happy]",positive,staff care attentive happy
2,I have mixed feelings about my experience.,5,"[mixed, feeling, experience]",neutral,mixed feeling experience
3,I have mixed feelings about my experience.,5,"[mixed, feeling, experience]",neutral,mixed feeling experience
4,The healthcare provider was excellent. I had a...,3,"[healthcare, provider, excellent, great, exper...",positive,healthcare provider excellent great experience
...,...,...,...,...,...
995,My experience was terrible. I would not recomm...,5,"[experience, terrible, recommend, provider]",negative,experience terrible recommend provider
996,The service was disappointing. I won't be comi...,4,"[service, disappointing, will, come]",negative,service disappointing will come
997,"The service was okay, but nothing exceptional.",3,"[service, okay, exceptional]",neutral,service okay exceptional
998,I have mixed feelings about my experience.,5,"[mixed, feeling, experience]",neutral,mixed feeling experience


In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['tokenized_text_str'], df['sentiment'], test_size=0.2, random_state=42)

In [15]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(800,)
(800,)
(200,)
(200,)


In [16]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_bow, y_train)

# Predict on test set
y_pred = nb_classifier.predict(X_test_bow)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [17]:
# Check data type and shape of X_train
print("Data Type of X_train:", type(X_train))
print("Shape of X_train:", X_train.shape)

X_train_array = X_train.values

# Now, X_train_array is a NumPy array
print("Data Type of X_train_array:", type(X_train_array))
print("Shape of X_train_array:", X_train_array.shape)

y_train_array = y_train.values
print(type(y_train_array))


Data Type of X_train: <class 'pandas.core.series.Series'>
Shape of X_train: (800,)
Data Type of X_train_array: <class 'numpy.ndarray'>
Shape of X_train_array: (800,)
<class 'numpy.ndarray'>


In [18]:
# CROSS VALIDATION
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit-transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Initialize the KFold cross-validation splitter
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize a list to store the accuracy scores for each fold
accuracy_scores = []

# Loop through each fold
for train_index, val_index in kf.split(X_train_tfidf):
    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train_array[train_index], y_train_array[val_index]
    
    # Initialize and train the classifier
    classifier = MultinomialNB()
    classifier.fit(X_train_fold, y_train_fold)
    
    # Predict on the validation set
    y_pred_fold = classifier.predict(X_val_fold)
    
    # Calculate accuracy for this fold and append it to the list
    accuracy = accuracy_score(y_val_fold, y_pred_fold)
    accuracy_scores.append(accuracy)

# Calculate the mean and standard deviation of the accuracy scores
mean_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)

print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", std_accuracy)

Mean Accuracy: 1.0
Standard Deviation of Accuracy: 0.0


In [None]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Use the trained classifier to make predictions on the testing data
y_pred_test = classifier.predict(X_test_tfidf)

# Calculate accuracy on the testing data
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Accuracy on Test Data:", accuracy_test)

Accuracy on Test Data: 1.0


In [None]:
report = classification_report(y_test, y_pred_test)

# Print classification report
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        58
     neutral       1.00      1.00      1.00        78
    positive       1.00      1.00      1.00        64

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [None]:
def plots_of_reviews():
    # Plot a histogram of ratings
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df, x='Rating', bins=5, kde=True)
    plt.title('Distribution of Ratings')
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.show()

    # Plot a countplot of sentiment
    plt.figure(figsize=(8, 6))
    sns.countplot(data=df, x='sentiment')
    plt.title('Count of Sentiments')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.show()

    # Plot a barplot of mean ratings by sentiment
    plt.figure(figsize=(8, 6))
    sns.barplot(data=df, x='sentiment', y='Rating', errorbar=None)
    plt.title('Mean Ratings by Sentiment')
    plt.xlabel('Sentiment')
    plt.ylabel('Mean Rating')
    plt.show()


    plt.figure(figsize=(8, 6))
    scatter_plot = sns.scatterplot(data=df, x='tokenized_text_str', y='Rating')
    plt.title('Tokenized Text Length vs. Rating')
    plt.xlabel('Tokenized Text')
    plt.ylabel('Rating')

    # Rotate x-axis labels
    scatter_plot.set_xticklabels(df['tokenized_text_str'], rotation=45, ha='right')

    plt.show()