In [13]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Read the review CSV into the working DataFrame
file_path = 'D:/google_reviews_data/join_csv/modified_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Fetch the VADER lexicon (NLTK logs "already up-to-date" when it is present),
# then build the analyzer that get_sentiment uses
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Function to get sentiment scores for a single review
def get_sentiment(text):
    """Return VADER polarity scores for one review.

    Parameters
    ----------
    text : object
        The review body. Normally a ``str``. Missing values (``None``,
        ``NaN``, ``pd.NA``) are treated as "no text"; any other non-string
        scalar is converted with ``str()`` before scoring.

    Returns
    -------
    dict
        For scored text, whatever ``sid.polarity_scores`` returns (callers
        read its ``'compound'`` entry). For missing text, an all-zero dict
        with the keys ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'``, so
        the review ends up with a score of 0 instead of raising.
    """
    if not isinstance(text, str):
        # pandas reads an empty CSV cell as NaN (a float), which is not valid
        # input for the analyzer; report such reviews as carrying no sentiment.
        if pd.isna(text):
            return {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
        # e.g. a review consisting only of digits that was parsed as a number
        text = str(text)
    return sid.polarity_scores(text)

# Run get_sentiment over every review; each 'sentiment' cell stores what it returned
review_scores = df['text'].apply(get_sentiment)
df['sentiment'] = review_scores

# Keep only the 'compound' entry as the numeric sentiment score
df['sentiment_score'] = review_scores.apply(lambda scores: scores['compound'])


def _label_from_score(score):
    """Map a score to 'positive' (> 0), 'negative' (< 0) or 'neutral' (anything else)."""
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'


# Attach the positive / negative / neutral label derived from the score
df['sentiment_label'] = df['sentiment_score'].apply(_label_from_score)

# Columns exported for Power BI (the raw 'sentiment' dict column is left out)
powerbi_columns = [
    'company', 'rating', 'text', 'date', 'city',
    'contributor_id', 'reviews', 'sentiment_score', 'sentiment_label',
]
df_powerbi = df[powerbi_columns]

# Write the selection to disk without the DataFrame index
output_file_path = 'D:/google_reviews_data/join_csv/processed_dataset.csv'
df_powerbi.to_csv(output_file_path, index=False)

print(f"Processed data saved to {output_file_path}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Processed data saved to D:/google_reviews_data/join_csv/processed_dataset.csv


In [14]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Read the review CSV into the working DataFrame.
# NOTE(review): this repeats the load / analyzer setup of the preceding cell,
# so this cell can run on its own.
file_path = 'D:/google_reviews_data/join_csv/modified_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Fetch the VADER lexicon (NLTK logs "already up-to-date" when it is present),
# then build the analyzer that get_sentiment uses
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Function to get sentiment scores for a single review
def get_sentiment(text):
    """Return VADER polarity scores for one review.

    Parameters
    ----------
    text : object
        The review body. Normally a ``str``. Missing values (``None``,
        ``NaN``, ``pd.NA``) are treated as "no text"; any other non-string
        scalar is converted with ``str()`` before scoring.

    Returns
    -------
    dict
        For scored text, whatever ``sid.polarity_scores`` returns (callers
        read its ``'compound'`` entry). For missing text, an all-zero dict
        with the keys ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'``, so
        the review ends up with a score of 0 instead of raising.
    """
    if not isinstance(text, str):
        # pandas reads an empty CSV cell as NaN (a float), which is not valid
        # input for the analyzer; report such reviews as carrying no sentiment.
        if pd.isna(text):
            return {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
        # e.g. a review consisting only of digits that was parsed as a number
        text = str(text)
    return sid.polarity_scores(text)

# Run get_sentiment over every review; each 'sentiment' cell stores what it returned
df['sentiment'] = df['text'].apply(get_sentiment)

# Use the 'compound' entry as the single numeric sentiment score per review
df['sentiment_score'] = df['sentiment'].apply(lambda scores: scores['compound'])

# Label by the sign of the score: > 0 positive, < 0 negative, anything else neutral
df['sentiment_label'] = df['sentiment_score'].apply(
    lambda score: 'positive' if score > 0 else ('negative' if score < 0 else 'neutral')
)

# Per-company aggregates: mean rating, mean sentiment score and number of ratings.
# NOTE(review): 'count' skips rows whose rating is missing, so review_count is
# the number of rated reviews rather than the number of rows.
company_stats = df.groupby('company').agg(
    average_rating=('rating', 'mean'),
    average_sentiment_score=('sentiment_score', 'mean'),
    review_count=('rating', 'count')
).reset_index()

# Legacy column, kept unchanged for existing Power BI reports. It subtracts a
# VADER compound score (bounded to [-1, 1]) from a star rating on a different
# scale, so its value is dominated by the rating itself.
company_stats['rating_sentiment_discrepancy'] = company_stats['average_rating'] - company_stats['average_sentiment_score']

# Scale-aware comparison: map the rating onto [-1, 1] before subtracting, so
# 0 means "stars and text agree", positive means the stars are more favourable
# than the text, and negative means the text is more favourable than the stars.
RATING_MIN, RATING_MAX = 1, 5  # assumes Google's 1-5 star scale - TODO confirm against the dataset
rating_midpoint = (RATING_MIN + RATING_MAX) / 2
rating_half_range = (RATING_MAX - RATING_MIN) / 2
company_stats['average_rating_normalized'] = (
    (company_stats['average_rating'] - rating_midpoint) / rating_half_range
)
company_stats['normalized_rating_sentiment_discrepancy'] = (
    company_stats['average_rating_normalized'] - company_stats['average_sentiment_score']
)

# Save the per-company table to a CSV file for Power BI (no index column)
output_file_path = 'D:/google_reviews_data/join_csv/company_stats.csv'
company_stats.to_csv(output_file_path, index=False)

print(f"Processed data saved to {output_file_path}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Processed data saved to D:/google_reviews_data/join_csv/company_stats.csv
