Topic Modelling for review data

In [None]:
import json
import pandas as pd

# Load the first 1000 Yelp review entries (the file holds one JSON object per line).
records = []
# Explicit UTF-8: without it open() uses the platform's locale encoding, which
# can fail on or garble non-ASCII review text (e.g. cp1252 on Windows).
with open("./data/yelp_academic_dataset_review.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 1000:  # Change this to load more/less
            break
        try:
            record = json.loads(line)
            records.append(record)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep loading.
            print(f"Skipping bad line {i}")

# Convert to DataFrame
df_reviews = pd.DataFrame(records)

# Extract the review text column, skipping rows that have no text
documents = df_reviews['text'].dropna().tolist()

In [None]:
# Preview a handful of reviews instead of dumping every loaded document
# into the cell output.
documents[:5]

In [None]:
# One-time setup (run in a terminal, not in this notebook):
# conda install -c conda-forge wordcloud

In [None]:
# run in terminal
# conda install -c conda-forge spacy
# python -m spacy download en_core_web_sm


In [None]:
import json
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the small English spaCy pipeline with the parser and NER components
# disabled. (Setup: install spaCy, then run
# `python -m spacy download en_core_web_sm`.)
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# --- Load example set of 1000 Yelp reviews ---
records = []
# Explicit UTF-8: without it open() uses the platform's locale encoding, which
# can fail on or garble non-ASCII review text (e.g. cp1252 on Windows).
with open("./data/yelp_academic_dataset_review.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 1000:
            break
        try:
            record = json.loads(line)
            records.append(record)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep loading.
            print(f"Skipping bad line {i}")

df_reviews = pd.DataFrame(records)
# Keep only the review text, skipping rows that have none.
documents = df_reviews['text'].dropna().tolist()

# --- Preprocessing with spaCy ---
def spacy_preprocess(text):
    """Normalise one document into a space-separated string of lemmas.

    Apostrophes are deleted so contractions stay in one piece ("don't" ->
    "dont"). Every other non-letter character is replaced by a space, so words
    separated only by punctuation or digits ("food.Service", "burgers/fries")
    are no longer fused into a single bogus token. The text is then
    lower-cased, run through the module-level ``nlp`` pipeline, and reduced to
    the lemmas of alphabetic, non-stop-word tokens longer than two characters.

    Args:
        text: Raw document text.

    Returns:
        The kept lemmas joined by single spaces (possibly an empty string).
    """
    text = re.sub("['\u2019]", "", text)  # drop apostrophes: contractions stay one word
    text = re.sub(r"[^a-zA-Z\s]", " ", text)  # other non-letters act as word breaks
    doc = nlp(text.lower())
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and token.is_alpha and len(token) > 2
    ]
    return ' '.join(tokens)

# Clean every review; one output string per input document.
preprocessed_docs = list(map(spacy_preprocess, documents))

# --- Vectorize Text ---
# Bag-of-words counts. Terms found in fewer than 10 documents (rare) or in
# more than 90% of them are ignored, as are the built-in English stop words.
vectorizer = CountVectorizer(stop_words='english', min_df=10, max_df=0.9)
doc_term_matrix = vectorizer.fit_transform(preprocessed_docs)

# --- Fit LDA Model ---
# Five topics with a fixed seed; fit() hands back the fitted estimator.
lda = LatentDirichletAllocation(n_components=5, random_state=42).fit(doc_term_matrix)

# --- Show Topics ---
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted words of every topic.

    Words are listed from most to least important within each topic.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of word
            weights per topic).
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` maps
            column indices to words.
        top_n: Number of words to show per topic; 0 prints an empty list.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        # Sort descending and slice from the front: the strongest word comes
        # first, and top_n=0 yields nothing (a `[-top_n:]` slice would return
        # the entire vocabulary for 0).
        top_indices = topic.argsort()[::-1][:top_n]
        print(f"\nTopic #{idx + 1}:")
        print([words[i] for i in top_indices])

# Print the top words of each fitted topic.
print_topics(lda, vectorizer)

# --- Optional: Word Clouds ---
# Look the vocabulary up once, not once per word per topic.
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    # The 15 highest-weighted words of the topic, mapped to their weights.
    top_word_weights = {feature_names[i]: topic[i] for i in topic.argsort()[-15:]}
    plt.figure()
    plt.imshow(WordCloud(background_color='white').fit_words(top_word_weights))
    plt.axis("off")
    plt.title(f"Topic #{idx + 1}")
    plt.show()


Sentiment Analysis on review data

In [None]:
# # Download VADER lexicon (only once)
# import nltk
# nltk.download('vader_lexicon')

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import json

# Load the first 1000 Yelp reviews (the file holds one JSON object per line).
records = []
# Explicit UTF-8: without it open() uses the platform's locale encoding, which
# can fail on or garble non-ASCII review text (e.g. cp1252 on Windows).
with open("./data/yelp_academic_dataset_review.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 1000:
            break
        try:
            record = json.loads(line)
            records.append(record)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep loading.
            print(f"Skipping bad line {i}")

df_reviews = pd.DataFrame(records)

# Drop reviews without text *before* scoring and renumber the rows. The scores
# are joined to the reviews by position below, so both sides must contain
# exactly the same rows in the same order; filtering only the text list would
# shift every score after a missing text onto the wrong review.
df_reviews_with_text = df_reviews.dropna(subset=['text']).reset_index(drop=True)
documents = df_reviews_with_text['text'].tolist()

# Initialize VADER sentiment analyzer (needs the 'vader_lexicon' NLTK resource)
sia = SentimentIntensityAnalyzer()

# Compute sentiment scores: one dict of scores per document
sentiment_scores = [sia.polarity_scores(text) for text in documents]

# Convert scores to DataFrame and combine with the reviews they were computed from
df_sentiments = pd.DataFrame(sentiment_scores)
df_reviews_sentiment = pd.concat([df_reviews_with_text, df_sentiments], axis=1)

# Add a label for sentiment (positive, neutral, negative)
def label_sentiment(score):
    """Turn a compound sentiment score into a coarse label.

    Returns "positive" for scores of 0.05 or more, "negative" for scores of
    -0.05 or less, and "neutral" for anything else.
    """
    if score >= 0.05:
        return "positive"
    if score <= -0.05:
        return "negative"
    return "neutral"

# Label every review from its compound score.
df_reviews_sentiment["sentiment"] = df_reviews_sentiment["compound"].apply(label_sentiment)

# Preview results. As the last expression of the cell the frame is rendered
# as a rich table instead of the plain-text dump produced by print().
df_reviews_sentiment[["text", "compound", "sentiment"]].head(10)


Visualizations for Sentiment Analysis

In [None]:
import seaborn as sns

# --- Visualization 1: Countplot of sentiment labels ---
fig_counts, ax_counts = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=df_reviews_sentiment,
    x="sentiment",
    hue="sentiment",  # colour each bar by its own label
    order=["positive", "neutral", "negative"],
    palette="Set2",
    legend=False,  # the legend would only repeat the x-axis labels
    ax=ax_counts,
)
ax_counts.set_title("Sentiment Distribution")
ax_counts.set_xlabel("Sentiment")
ax_counts.set_ylabel("Number of Reviews")
fig_counts.tight_layout()
plt.show()

# --- Visualization 2: Histogram of compound scores ---
fig_scores, ax_scores = plt.subplots(figsize=(8, 4))
sns.histplot(
    data=df_reviews_sentiment,
    x="compound",
    kde=True,
    bins=30,
    color="skyblue",
    ax=ax_scores,
)
ax_scores.set_title("Distribution of Compound Sentiment Scores")
ax_scores.set_xlabel("Compound Score")
ax_scores.set_ylabel("Frequency")
fig_scores.tight_layout()
plt.show()



Topic Modelling for tip data (instead of review data)

In [None]:
import json
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the small English spaCy pipeline with the parser and NER components
# disabled. (Setup: install spaCy, then run
# `python -m spacy download en_core_web_sm`.)
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# --- Load example set of 1000 Yelp tips ---
records = []
# Explicit UTF-8: without it open() uses the platform's locale encoding, which
# can fail on or garble non-ASCII tip text (e.g. cp1252 on Windows).
with open("./data/yelp_academic_dataset_tip.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 1000:
            break
        try:
            record = json.loads(line)
            records.append(record)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep loading.
            print(f"Skipping bad line {i}")

# NOTE: the names df_reviews / documents are kept from the review cells,
# but from here on they hold tip data.
df_reviews = pd.DataFrame(records)
documents = df_reviews['text'].dropna().tolist()

# --- Preprocessing with spaCy ---
def spacy_preprocess(text):
    """Normalise one document into a space-separated string of lemmas.

    Apostrophes are deleted so contractions stay in one piece ("don't" ->
    "dont"). Every other non-letter character is replaced by a space, so words
    separated only by punctuation or digits ("food.Service", "burgers/fries")
    are no longer fused into a single bogus token. The text is then
    lower-cased, run through the module-level ``nlp`` pipeline, and reduced to
    the lemmas of alphabetic, non-stop-word tokens longer than two characters.

    Args:
        text: Raw document text.

    Returns:
        The kept lemmas joined by single spaces (possibly an empty string).
    """
    text = re.sub("['\u2019]", "", text)  # drop apostrophes: contractions stay one word
    text = re.sub(r"[^a-zA-Z\s]", " ", text)  # other non-letters act as word breaks
    doc = nlp(text.lower())
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and token.is_alpha and len(token) > 2
    ]
    return ' '.join(tokens)

# Clean every tip; one output string per input document.
preprocessed_docs = list(map(spacy_preprocess, documents))

# --- Vectorize Text ---
# Bag-of-words counts. Terms found in fewer than 10 documents (rare) or in
# more than 90% of them are ignored, as are the built-in English stop words.
vectorizer = CountVectorizer(stop_words='english', min_df=10, max_df=0.9)
doc_term_matrix = vectorizer.fit_transform(preprocessed_docs)

# --- Fit LDA Model ---
# Five topics with a fixed seed; fit() hands back the fitted estimator.
lda = LatentDirichletAllocation(n_components=5, random_state=42).fit(doc_term_matrix)

# --- Show Topics ---
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted words of every topic.

    Words are listed from most to least important within each topic.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of word
            weights per topic).
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` maps
            column indices to words.
        top_n: Number of words to show per topic; 0 prints an empty list.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        # Sort descending and slice from the front: the strongest word comes
        # first, and top_n=0 yields nothing (a `[-top_n:]` slice would return
        # the entire vocabulary for 0).
        top_indices = topic.argsort()[::-1][:top_n]
        print(f"\nTopic #{idx + 1}:")
        print([words[i] for i in top_indices])

# Print the top words of each fitted topic.
print_topics(lda, vectorizer)

# --- Optional: Word Clouds ---
# Look the vocabulary up once, not once per word per topic.
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    # The 15 highest-weighted words of the topic, mapped to their weights.
    top_word_weights = {feature_names[i]: topic[i] for i in topic.argsort()[-15:]}
    plt.figure()
    plt.imshow(WordCloud(background_color='white').fit_words(top_word_weights))
    plt.axis("off")
    plt.title(f"Topic #{idx + 1}")
    plt.show()