# Main Code

## bert-base-uncased

In [1]:
import pandas as pd
import re
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from collections import defaultdict
from transformers import pipeline, BertTokenizer, BertForSequenceClassification
from nltk import RegexpParser
import warnings
warnings.filterwarnings("ignore")  # Ignore all warnings
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\990754\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Read the data into a DataFrame
dt = pd.read_csv(r"C:\Users\990754\Downloads\sentiment-analysis-python\data.csv")
dt.head(4)

Unnamed: 0,Hospital,username,reviews,stars_given,Branch location,Time
0,Popular times,srividya ande,I am very much satisfied by outstanding dental...,5,Telangana 500072,a week ago
1,Popular times,indrani das,My mom is very much satisfied by the dental tr...,5,Telangana 500072,a week ago
2,Popular times,Eswaramma Eswari,My daughter is six months into her Invisalign ...,5,Telangana 500072,2 weeks ago
3,Popular times,Mahesh Sainathuni,If someone is looking for a dental care or tre...,5,Telangana 500072,9 months ago


## Data Preprocessing

In [6]:
# finding the columns having null values
dt.isnull().sum()

Hospital           0
username           0
reviews            0
stars_given        0
Branch location    0
Time               0
dtype: int64

In [7]:
# Drop rows where 'reviews' column is null
dt = dt.dropna(subset=['reviews'])

In [8]:
# Drop rows where 'Hospital' column has the value "Web results"
dt = dt[dt['Hospital'] != 'Web results']

In [9]:
dt.tail(4)

Unnamed: 0,Hospital,username,reviews,stars_given,Branch location,Time
19193,Pediatric Outpatient Department - Rainbow Chil...,Santosh Thirumani,good service,5,Telangana 500074,10 months ago
19194,Pediatric Outpatient Department - Rainbow Chil...,venkatkistareddy gaveni,Good,5,Telangana 500074,10 months ago
19195,Pediatric Outpatient Department - Rainbow Chil...,Mahesh kumar,Good,5,Telangana 500074,10 months ago
19196,Pediatric Outpatient Department - Rainbow Chil...,Charan cherry,K goodIndhu,4,Telangana 500074,10 months ago


## Model Creation

In [10]:
# Load pre-trained sentiment analysis model
sentiment_analysis = pipeline("sentiment-analysis")

# Define the model name
MODEL_NAME = "bert-base-uncased"

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Load pre-trained model for sequence classification
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Define function to extract sentiment of a review
def get_review_sentiment(review):
    # Truncate the review to the maximum input length supported by the model (512 tokens)
    truncated_review = review[:512]
    # Perform sentiment analysis
    result = sentiment_analysis(truncated_review)
    return result[0]['label']

In [13]:

def extract_aspects(sentence, sentiment):
    tokenized_sentence = word_tokenize(sentence)
    tagged_words = pos_tag(tokenized_sentence)
    
    # Define a regular expression pattern for detecting noun phrases with associated modifiers
    grammar = r"""
        NP: {<DT|JJ.*|NN.*>+}      # Chunk sequences of determiner, adjective, and noun
        VB: {<VB.*>}                # Chunk sequences of verbs
        RB: {<RB.*>}                # Chunk sequences of adverbs
    """
    parser = RegexpParser(grammar)
    parsed_sentence = parser.parse(tagged_words)
    
    nouns = defaultdict(list)
    adjectives = defaultdict(list)
    verbs = defaultdict(list)
    adverbs = defaultdict(list)
    
    for subtree in parsed_sentence.subtrees():
        if subtree.label() == 'NP':
            noun = ""
            adjectives_list = []
            for word, pos in subtree.leaves():
                if pos.startswith("NN"):  # Nouns
                    noun += word + " "
                elif pos.startswith("JJ"):  # Adjectives
                    adjectives_list.append(word)
            noun = noun.strip()
            if adjectives_list:
                adjectives[noun] = adjectives_list
            else:
                nouns[noun] = []
        elif subtree.label() == 'VB':  # Verbs
            verb = " ".join(word for word, pos in subtree.leaves())
            verbs[verb] = []
        elif subtree.label() == 'RB':  # Adverbs
            adverb = " ".join(word for word, pos in subtree.leaves())
            adverbs[adverb] = []
    
    return nouns, adjectives, verbs, adverbs

In [None]:
#function to process each review
def process_review(review):
    if not review or not review.strip():  # Check if review is None or contains only whitespace
        return "UNKNOWN", defaultdict(list), defaultdict(list)
    
    sentiment = get_review_sentiment(review)
    parts = re.split(r'[,.]', review)
    sustain_part = defaultdict(list)
    improve_part = defaultdict(list)
    
    for part in parts:
        if not part or not part.strip():  # Skip empty parts or parts containing only whitespace
            continue
        
        part_sentiment = get_review_sentiment(part)
        nouns, adjectives, verbs, adverbs = extract_aspects(part, part_sentiment)
        
        for noun, adj_list in adjectives.items():
            if noun.strip() and adj_list:  # Check if both noun and adjectives are non-empty
                if sentiment == "POSITIVE" and part_sentiment == "POSITIVE":
                    sustain_part[noun].extend(adj_list)
                elif sentiment == "NEGATIVE" and part_sentiment == "NEGATIVE":
                    improve_part[noun].extend(adj_list)
        
        for noun, verb_list in verbs.items():
            if noun.strip() and verb_list:  # Check if both noun and verbs are non-empty
                if sentiment == "POSITIVE" and part_sentiment == "POSITIVE":
                    sustain_part[noun].extend(verb_list)
                elif sentiment == "NEGATIVE" and part_sentiment == "NEGATIVE":
                    improve_part[noun].extend(verb_list)
        
        for noun, adv_list in adverbs.items():
            if noun.strip() and adv_list:  # Check if both noun and adverbs are non-empty
                if sentiment == "POSITIVE" and part_sentiment == "POSITIVE":
                    sustain_part[noun].extend(adv_list)
                elif sentiment == "NEGATIVE" and part_sentiment == "NEGATIVE":
                    improve_part[noun].extend(adv_list)
    
    # Convert defaultdicts to dictionaries
    sustain_part = dict(sustain_part)
    improve_part = dict(improve_part)
    
    return sentiment, sustain_part, improve_part

In [None]:
# Apply processing to each review
sentiments, sustains, improvements = zip(*dt['reviews'].apply(process_review))

# Add sentiment, sustain, and improvement parts to DataFrame
dt['sentiment_analysis'] = sentiments
dt['sustain'] = sustains
dt['improvement'] = improvements
dt

## Creating Word Cloud column

In [16]:
# Function to extract nouns from a sentence
def extract_nouns(sentence):
    tokenized_sentence = word_tokenize(sentence)
    tagged_words = pos_tag(tokenized_sentence)
    
    named_entities = []
    for chunk in ne_chunk(tagged_words):
        if hasattr(chunk, 'label') and chunk.label():
            named_entities.append(' '.join(c[0] for c in chunk))
    
    nouns = []
    for word, pos in tagged_words:
        if pos.startswith("NN") and not pos.startswith("NNP") and word not in named_entities:  # Exclude proper nouns and named entities
            nouns.append(word)
    
    return nouns

# Function to process each review
def process_review(review):
    if not review or not review.strip():  # Check if review is None or contains only whitespace
        return ""
    
    nouns = extract_nouns(review)
    
    # Remove plural nouns
    nouns = [word for word in nouns if not wn.synsets(word) or not wn.synsets(word)[0].name().split('.')[0].endswith('s')]
    
    return " ".join(nouns)

# Load your DataFrame with the 'reviews' column
# Assuming your DataFrame is named 'dt'
# Add a new column 'wordCloud' filled with extracted nouns
dt['wordCloud'] = dt['reviews'].apply(process_review)

In [17]:
# Save the DataFrame to a new CSV file if needed
dt.to_csv("processed_data.csv", index=False)