In [11]:
import nltk
# Fetch the NLTK stopwords corpus at notebook start-up (the recorded output shows
# it reports "already up-to-date" when the corpus is present).
# NOTE(review): presumably needed by helpers.tokenizer — confirm.
nltk.download('stopwords')

from helpers.tokenizer import tokenize, ngrams
from collections import defaultdict
import plotly.express as px
import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:

def ngram_sentiment_stats(df, min_reviews = 50, n = 1):
    """
    Compute per-n-gram review statistics.

    For every n-gram, counts the number of reviews that contain it (an n-gram
    repeated inside one review is counted once) and the share of those reviews
    that are positive. Only n-grams found in at least ``min_reviews`` reviews
    are kept.

    :param df: DataFrame containing reviews (columns 'review' and 'voted_up').
    :param min_reviews: Minimum number of reviews an n-gram must appear in to be kept.
    :param n: Length of n-gram (1: unigram, 2: bigram, etc.).
    :return: DataFrame with columns 'ngram', 'count', 'positivity', sorted by
        'count' descending. When no n-gram reaches ``min_reviews`` the frame is
        empty but still carries those three columns.
    :raises ValueError: If ``df`` is empty.
    """

    if df.empty:
        raise ValueError("DataFrame reviews are empty! Cannot perform calculations.")

    ngram_counts = defaultdict(int)    # reviews containing the n-gram
    ngram_positive = defaultdict(int)  # positive reviews containing the n-gram

    # Iterate over the two needed columns directly; iterrows() would build a
    # Series per row just to read two fields.
    for review, voted_up in zip(df["review"], df["voted_up"]):
        # Missing reviews can arrive as None or NaN. NaN is truthy, so
        # `review or ""` would hand a float to the tokenizer; treat anything
        # that is not a string as an empty review instead.
        text = review if isinstance(review, str) else ""
        tokens = tokenize(text)  # assumed to strip stopwords and punctuation — see helpers.tokenizer
        # A set de-duplicates n-grams so each review contributes at most once.
        for ngram in set(ngrams(tokens, n=n)):
            ngram_counts[ngram] += 1
            if voted_up:
                ngram_positive[ngram] += 1

    records = [
        {
            "ngram": ngram,
            "count": count,
            "positivity": ngram_positive.get(ngram, 0) / count,  # positive rate
        }
        for ngram, count in ngram_counts.items()
        if count >= min_reviews
    ]

    # Passing the columns explicitly keeps the schema stable when `records` is
    # empty; without it sort_values("count") raises KeyError on a column-less frame.
    stats_df = pd.DataFrame(
        records, columns=["ngram", "count", "positivity"]
    ).sort_values("count", ascending=False)
    print(f"Dataset contains {len(stats_df):,} {n}-grams (min {min_reviews} reviews)")
    return stats_df

In [13]:
import plotly.graph_objects as go

def scatter_plot(ngram_stats, n, min_reviews=50, window=50):
    """
    Draw a scatter plot: Positivity vs. Frequency of n-grams.
    - Marker size scales with count.
    - Color according to positivity on the RdYlGn scale (green: high, red: low).
    - Add a trend line (centered moving average of positivity over n-grams
      ordered by count) to show the trend.

    The figure is displayed with ``fig.show()``; nothing is returned, so calling
    this as the last statement of a notebook cell does not render it twice.

    :param ngram_stats: DataFrame from ngram_sentiment_stats
        (columns 'ngram', 'count', 'positivity').
    :param n: Length of n-gram (for the title and axis label).
    :param min_reviews: Threshold shown in the title; pass the same value that
        was given to ngram_sentiment_stats. Defaults to 50.
    :param window: Number of n-grams in the moving-average window. Defaults to 50.
    """

    # Nothing to draw: skip plotting instead of building a figure from an empty frame.
    if ngram_stats.empty:
        print(f"No {n}-grams to plot.")
        return

    fig = px.scatter(
        ngram_stats,
        x="count",
        y="positivity",
        size="count",
        hover_data={
            "ngram": True,
            "count": ":,d",
            "positivity": ":.1%"
        },
        log_x=True,
        labels={
            "count": f"Reviews containing {n}-gram",
            "positivity": "Positivity ratio"
        },
        color="positivity",
        color_continuous_scale="RdYlGn",
        range_color=[0, 1],  # fix the color scale to the full 0-100% range
        size_max=45,
        opacity=0.75,
        template="plotly_dark"
    )

    # Marker styling: minimum size and a thin outline.
    fig.update_traces(
        marker=dict(
            sizemin=6,
            line=dict(width=0.5, color="DarkSlateGray")
        )
    )

    fig.update_yaxes(
        tickformat=".0%",
        range=[0, 1.05],  # small headroom above 100%
        title_font=dict(size=14),
        tickfont=dict(size=12),
        gridcolor="rgba(255,255,255,0.08)"
    )

    fig.update_xaxes(
        title_font=dict(size=14),
        tickfont=dict(size=12),
        gridcolor="rgba(255,255,255,0.08)",
        dtick=1,
        minor=dict(dtick=1, showgrid=True, griddash="dot", gridcolor="rgba(255,255,255,0.04)")
    )

    fig.update_layout(
        coloraxis_showscale=False,  # hide the color bar
        width=1100,
        height=650,
        title={
            # Report the threshold actually used rather than a hard-coded 50.
            "text": f"Positivity vs Frequency of {n}-grams (min {min_reviews} reviews)",
            "y": 0.96,
            "x": 0.5,
            "xanchor": "center",
            "font": dict(size=18)
        },
        margin=dict(l=60, r=40, t=80, b=60)
    )

    # Trend line: moving average of positivity over n-grams ordered by frequency.
    sorted_df = ngram_stats.sort_values("count")
    trend_x = sorted_df["count"]
    trend_y = sorted_df["positivity"].rolling(
        window=window,
        min_periods=min(10, window),  # min_periods may not exceed the window
        center=True
    ).mean()

    fig.add_trace(
        go.Scatter(
            x=trend_x,
            y=trend_y,
            mode="lines",
            line=dict(color="rgba(255, 255, 255, 0.7)", width=2.5, dash="dash"),
            name="Trend (moving avg)",
            hoverinfo="skip",
            showlegend=True
        )
    )

    fig.show()

In [14]:
from helpers import database

# Load the reviews requested with language="english" from the project database helper.
# NOTE(review): later cells pass `reviews` to ngram_sentiment_stats, which reads the
# 'review' and 'voted_up' columns — presumably get_reviews returns such a DataFrame; confirm.
reviews = database.get_reviews(language="english")

In [15]:
# Unigrams: compute stats for words found in at least 50 reviews, then plot them.
n = 1
ngram_stats = ngram_sentiment_stats(reviews, min_reviews=50, n=n)
# NOTE(review): ngram_sentiment_stats already prints the n-gram count, so this repeats the same number.
print(f"Dataset contains {len(ngram_stats)} words")
scatter_plot(ngram_stats, n)

Dataset contains 1,454 1-grams (min 50 reviews)
Dataset contains 1454 words


In [16]:
# Bigrams: compute stats for 2-grams found in at least 50 reviews, then plot them.
n = 2
ngram_stats = ngram_sentiment_stats(reviews, min_reviews=50, n=n)
# NOTE(review): ngram_sentiment_stats already prints the n-gram count, so this repeats the same number.
print(f"Dataset contains {len(ngram_stats)} {n}-grams")
scatter_plot(ngram_stats, n)

Dataset contains 340 2-grams (min 50 reviews)
Dataset contains 340 2-grams


In [17]:
# Trigrams: compute stats for 3-grams found in at least 50 reviews, then plot them.
n = 3
ngram_stats = ngram_sentiment_stats(reviews, min_reviews=50, n=n)
# NOTE(review): ngram_sentiment_stats already prints the n-gram count, so this repeats the same number.
print(f"Dataset contains {len(ngram_stats)} {n}-grams")
scatter_plot(ngram_stats, n)

Dataset contains 23 3-grams (min 50 reviews)
Dataset contains 23 3-grams


In [18]:
# 4-grams: compute stats for 4-grams found in at least 50 reviews, then plot them.
n = 4
ngram_stats = ngram_sentiment_stats(reviews, min_reviews=50, n=n)
# NOTE(review): ngram_sentiment_stats already prints the n-gram count, so this repeats the same number.
print(f"Dataset contains {len(ngram_stats)} {n}-grams")
scatter_plot(ngram_stats, n)

Dataset contains 3 4-grams (min 50 reviews)
Dataset contains 3 4-grams
