<a href="https://colab.research.google.com/github/Ashish-Sinha07/Ashish-Sinha-programmer-202070/blob/main/Aspect_Based_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only: expose Google Drive under /content/drive so the dataset paths
# used by the loading cell below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ***`Aspect-Based Sentiment Analysis`***

# **1. Loading JSON Data**

In [None]:
import pandas as pd
import json

# Function to read JSON files
def read_json_file(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        file_path: Path of the file to read.

    Returns:
        pandas.DataFrame with one row per non-blank line of the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    records = []
    # Decode explicitly as UTF-8 (the encoding JSON text is specified to use)
    # instead of relying on the platform-dependent default of open().
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) are not
            # JSON documents; skip them rather than crash in json.loads.
            if not line:
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Load each Yelp dataset file from the mounted Drive folder into its own DataFrame.
# Every file is read fully into memory by read_json_file.
business_df = read_json_file('/content/drive/MyDrive/CSV/dataset/dataset/yelp_academic_dataset_business.json')
review_df = read_json_file('/content/drive/MyDrive/CSV/dataset/dataset/yelp_academic_dataset_review.json')
# NOTE(review): user_df, tip_df and checkin_df are not referenced again in the
# visible part of this notebook; if nothing else uses them, dropping these three
# loads would save load time and memory -- confirm before removing.
user_df = read_json_file('/content/drive/MyDrive/CSV/dataset/dataset/yelp_academic_dataset_user.json')
tip_df = read_json_file('/content/drive/MyDrive/CSV/dataset/dataset/yelp_academic_dataset_tip.json')
checkin_df = read_json_file('/content/drive/MyDrive/CSV/dataset/dataset/yelp_academic_dataset_checkin.json')

# Preview the first rows of the two frames used by the rest of the analysis
print(business_df.head())
print(review_df.head())


# **2. Preprocessing the Data**

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Resources needed by clean_text: tokenizer models and the English stopword list.
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource instead
# of the pickled 'punkt' models; fetch both so the cell works on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Function for cleaning text
def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, built once and reused."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def clean_text(text):
    """Normalise one review into a space-joined string of lowercase content words.

    Steps: delete every character that is not an ASCII letter or whitespace,
    lowercase, tokenize with word_tokenize, drop English stopwords, and join
    the remaining tokens with single spaces.

    Note that removed characters are deleted, not replaced by a space, so
    "food/service" becomes the single token "foodservice".

    Args:
        text: Raw review text. Non-string values (e.g. None or NaN from a
            missing field) are treated as empty text.

    Returns:
        The cleaned text as a str; '' when nothing survives or text is not a str.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert text to lowercase
    # Look the stopwords up once per call in a cached set; rebuilding the list
    # for every token (as a per-token stopwords.words() call does) dominates
    # the runtime on a large corpus.
    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Derive the normalised 'clean_text' column from the raw review body, then preview it
review_df['clean_text'] = review_df['text'].map(clean_text)
print(review_df['clean_text'].head())


# **3. Aspect Extraction**

In [None]:
# Aspect label -> trigger keywords. extract_aspects tags a review with an
# aspect label when any of that aspect's keywords occurs in the review text.
aspect_keywords = {
    'food_quality': ['food', 'taste', 'flavor', 'dish', 'meal'],
    'service': ['service', 'waiter', 'staff', 'attitude', 'helpful'],
    'ambiance': ['ambiance', 'atmosphere', 'decor', 'environment'],
    'pricing': ['price', 'cost', 'value', 'expensive', 'cheap'],
    'cleanliness': ['clean', 'hygiene', 'tidy', 'dirty', 'sanitary']
}

# Function to extract aspects based on keywords
def extract_aspects(text, aspect_keywords):
    """Return the aspect labels whose keywords occur in ``text``.

    Matching uses the ``in`` operator, so for a string ``text`` a keyword
    matches anywhere inside a word: 'dish' matches 'dishes' and also
    'dishonest'.

    Args:
        text: Text to scan.
        aspect_keywords: Mapping of aspect label -> iterable of keywords.

    Returns:
        List of matching aspect labels, each at most once, in the iteration
        order of ``aspect_keywords``.
    """
    return [
        aspect_name
        for aspect_name, terms in aspect_keywords.items()
        if any(term in text for term in terms)
    ]

# Tag every cleaned review with the list of aspects it mentions, then preview
review_df['aspects'] = review_df['clean_text'].apply(extract_aspects, args=(aspect_keywords,))
print(review_df[['text', 'aspects']].head())


# **4. Sentiment Classification**

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the lexicon resource that the VADER analyzer needs
nltk.download('vader_lexicon')

# Single shared analyzer instance; classify_sentiment reads this module-level name
sia = SentimentIntensityAnalyzer()

# Function for sentiment classification
def classify_sentiment(text):
    """Label ``text`` as 'positive', 'negative' or 'neutral'.

    The label is derived from the 'compound' score returned by the
    module-level ``sia`` analyzer: scores of 0.05 or more are positive,
    scores of -0.05 or less are negative, everything in between is neutral.
    """
    compound = sia.polarity_scores(text)['compound']
    if compound <= -0.05:
        return 'negative'
    if compound >= 0.05:
        return 'positive'
    return 'neutral'

# Apply sentiment classification to the RAW review text, not to 'clean_text'.
# VADER's scoring rules rely on punctuation, capitalisation and negation words;
# clean_text strips punctuation, lowercases, and removes stopwords (the NLTK
# English list includes negations such as "not" and "no"), so scoring the
# cleaned text would turn "not good" into "good" and flip the label.
review_df['sentiment'] = review_df['text'].apply(classify_sentiment)
print(review_df[['text', 'sentiment']].head())


# **5. Combining Aspect and Sentiment**

In [None]:
# Function to assign sentiment to each extracted aspect
def assign_sentiment_to_aspects(aspects, sentiment):
    """Map every aspect in ``aspects`` to the same ``sentiment`` label.

    Args:
        aspects: Iterable of aspect labels.
        sentiment: Sentiment label to attach to each aspect.

    Returns:
        Dict of aspect label -> ``sentiment``; empty when ``aspects`` is empty.
    """
    return {aspect_name: sentiment for aspect_name in aspects}

# Attach the review-level sentiment to each aspect found in that review
def _row_aspect_sentiments(row):
    """Build the aspect -> sentiment dict for a single review row."""
    return assign_sentiment_to_aspects(row['aspects'], row['sentiment'])

review_df['aspect_sentiments'] = review_df.apply(_row_aspect_sentiments, axis=1)
print(review_df[['text', 'aspect_sentiments']].head())


# **6. Generate Business Insights**

In [None]:
# Reduce the reviews to one row per (business_id, aspect):
#   1. explode('aspects') turns each review's aspect list into one row per aspect;
#   2. each (business_id, aspects) group keeps only its most frequent sentiment
#      label (the first index entry of value_counts()).
# The result is therefore the dominant sentiment per business and aspect, not
# a full distribution of counts.
business_sentiment = review_df.explode('aspects').groupby(['business_id', 'aspects']).agg({
    'sentiment': lambda x: x.value_counts().index[0]
}).reset_index()

# Left-merge so every aggregated row is kept and gains the business name and
# categories from business_df, matched on business_id
business_sentiment = pd.merge(business_sentiment, business_df[['business_id', 'name', 'categories']], on='business_id', how='left')
print(business_sentiment.head())

# Generate business insights
def generate_insights(df):
    """Summarise a per-business aspect/sentiment frame as a nested dict.

    Args:
        df: DataFrame with 'business_id', 'name', 'categories', 'aspects'
            and 'sentiment' columns.

    Returns:
        Dict keyed by business_id, in order of first appearance in ``df``.
        Each value is a dict with:
            'name': name from the business's first row,
            'categories': categories from the business's first row,
            'aspects': {(aspect, sentiment): row count} for that business.
    """
    insights = {}
    # A single groupby pass replaces re-filtering the whole frame once per
    # business, which is quadratic in practice. sort=False yields the groups
    # in order of first appearance, matching the order of unique().
    for business, business_reviews in df.groupby('business_id', sort=False):
        insights[business] = {
            'name': business_reviews['name'].iloc[0],
            'categories': business_reviews['categories'].iloc[0],
            'aspects': business_reviews.groupby('aspects')['sentiment'].value_counts().to_dict()
        }
    return insights

# Generate insights for all businesses
business_insights = generate_insights(business_sentiment)
# Preview only the first few entries: the dict holds one entry per business,
# so printing it whole floods the notebook output.
for preview_id, preview_insight in list(business_insights.items())[:5]:
    print(preview_id, preview_insight)


# **7. Visualization**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Example: Sentiment distribution for a specific business and aspect
def plot_sentiment_distribution(business_id, aspect, df):
    """Draw a count plot of sentiment labels for one business and one aspect.

    Args:
        business_id: Value matched against df['business_id'].
        aspect: Aspect label matched against df['aspects'].
        df: DataFrame with 'business_id', 'aspects' and 'sentiment' columns.
            'aspects' may hold one label per row or a list of labels per row.
    """
    # Narrow to the business first so the explode below only touches its rows.
    business_reviews = df[df['business_id'] == business_id]
    # A list-valued 'aspects' cell never compares equal to a string, so
    # filtering the raw column would select nothing and plot an empty chart.
    # explode() yields one row per label and leaves scalar cells unchanged.
    business_reviews = business_reviews.explode('aspects')
    business_reviews = business_reviews[business_reviews['aspects'] == aspect]
    # explode() repeats index labels; hand the plotting library a clean index.
    business_reviews = business_reviews.reset_index(drop=True)
    sns.countplot(x='sentiment', data=business_reviews)
    plt.title(f'Sentiment Distribution for {aspect} - Business {business_id}')
    plt.show()

# Example: plot the 'food_quality' sentiment counts for one hard-coded business id,
# reading rows from the review-level frame (review_df)
plot_sentiment_distribution('tnhfDv5Il8EaGSXZGiuQGg', 'food_quality', review_df)
