In [4]:
# Install necessary libraries in Google Colab
!pip install dash
!pip install dash-bootstrap-components
!pip install praw
!pip install nltk
!pip install plotly
!pip install dash-table
!pip install transformers==4.44.2



In [5]:
# Import Libraries
import dash
import dash_bootstrap_components as dbc
from dash import dcc, html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.graph_objs as go
import praw
from datetime import datetime
from difflib import SequenceMatcher
from transformers import pipeline
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from dash import dash_table
import numpy as np

# Download NLTK data if not already present.
# 'punkt' backs word_tokenize and 'stopwords' backs stopwords.words, both used
# by preprocess_text below.
nltk.download('punkt')
nltk.download('stopwords')

# Initialize Hugging Face sentiment and emotion analysis pipelines.
# No model is named for the sentiment pipeline, so the library falls back to its
# default checkpoint (the recorded cell output reports
# distilbert/distilbert-base-uncased-finetuned-sst-2-english).
# NOTE(review): consider pinning the sentiment model name and revision so results
# do not change when the library default changes.
sentiment_analyzer = pipeline("sentiment-analysis")
# top_k=None presumably makes the pipeline return a score for every emotion label;
# update_graph selects the highest-scoring one per comment -- confirm against the
# transformers documentation.
emotion_analyzer = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)

# Reddit API credentials
# NOTE(review): these secrets are hard-coded in the source. They should be rotated
# and loaded from the environment or a secrets store instead of being committed.
reddit_client_id = "ByCl4gRNip8r8z7HNINVDQ"
reddit_client_secret = "NRExQ6tUk3mcaWcbqJ4OcUgxJOuHGg"
reddit_user_agent = "my_reddit_app:v1.0"

# Initialize the Reddit client (only client id, secret and user agent are passed;
# no username/password is supplied here).
reddit = praw.Reddit(client_id=reddit_client_id,
                     client_secret=reddit_client_secret,
                     user_agent=reddit_user_agent)

# Initialize the Dash app with the Bootstrap stylesheet used by the dbc layout.
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Define the layout of the app
def _build_layout():
    """
    Assembles the page: a heading, the keyword search controls, five graphs
    and a paginated table of analysed comments.
    """
    def graph_row(*graph_ids, width):
        # One row holding an equally sized column per graph id.
        return dbc.Row([dbc.Col(dcc.Graph(id=graph_id), width=width) for graph_id in graph_ids])

    heading_row = dbc.Row([
        dbc.Col(html.H1("Reddit Discussion Analysis"), width={'size': 6, 'offset': 3})
    ])

    keyword_box = dcc.Input(id='keyword-input', type='text', value='Paris Olympics',
                            placeholder='Enter keyword')
    search_button = html.Button('Search', id='search-button', n_clicks=0)
    search_row = dbc.Row([
        dbc.Col(keyword_box, width=6),
        dbc.Col(search_button, width=2)
    ])

    # Column ids must match the DataFrame columns emitted by the callback.
    table_columns = [
        {'name': 'Comment', 'id': 'comment'},
        {'name': 'Sentiment', 'id': 'comment_sentiment_category'},
        {'name': 'Intensity', 'id': 'comment_intensity'},
        {'name': 'Emotion', 'id': 'comment_emotion'}
    ]
    comments_table = dash_table.DataTable(
        id='comments-table',
        columns=table_columns,
        style_cell={'textAlign': 'left'},
        style_table={'overflowX': 'auto'},
        style_data={'whiteSpace': 'normal', 'height': 'auto'},
        page_size=10
    )
    table_row = dbc.Row([dbc.Col(comments_table, width=12)])

    return dbc.Container([
        heading_row,
        search_row,
        graph_row('sentiment-graph', 'pie-chart', width=6),
        graph_row('intensity-graph', width=12),
        graph_row('box-plot', width=12),       # Box plot for sentiment scores
        graph_row('emotion-graph', width=12),  # Emotion frequency graph
        table_row
    ], fluid=True)


app.layout = _build_layout()

# Text Preprocessing Function
def preprocess_text(text):
    """
    Normalizes raw comment text before it is passed to the analyzers.

    Steps, in order: lowercase the text, drop every character that is not an
    ASCII letter or whitespace, tokenize with NLTK's word_tokenize, and remove
    English stopwords.

    Args:
        text: The raw comment string.

    Returns:
        The remaining tokens joined by single spaces. This is an empty string
        when nothing survives the filtering.
    """
    # Load the stopword list once and keep it as a set. Calling
    # stopwords.words('english') inside the filter would rebuild the list for
    # every token and make each membership test a linear scan.
    english_stopwords = set(stopwords.words('english'))

    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    kept_tokens = [token for token in word_tokenize(letters_only)
                   if token not in english_stopwords]
    return ' '.join(kept_tokens)

# Intensity Categorization Function
def categorize_intensity(score):
    """
    Maps a sentiment score to an intensity label.

    Scores of at least 0.7 are 'Strong', scores of at least 0.4 are
    'Moderate', and everything else is 'Mild'.
    """
    # Bounds are listed from highest to lowest; the first one reached wins.
    for lower_bound, label in ((0.7, 'Strong'), (0.4, 'Moderate')):
        if score >= lower_bound:
            return label
    return 'Mild'

# Function to Fetch Reddit Posts Based on Keyword
def fetch_reddit_posts(keyword, subreddit_name="news", min_similarity=0.2):
    """
    Fetches the comments of the first sufficiently relevant Reddit submission.

    Searches the subreddit for `keyword` (newest first, all time) and walks the
    results until a submission title has a SequenceMatcher similarity ratio to
    the keyword strictly greater than `min_similarity`. The comments of that
    one submission are collected and the search stops; later results are not
    inspected.

    Args:
        keyword: Search term entered by the user.
        subreddit_name: Subreddit to search. Defaults to "news".
        min_similarity: Exclusive lower bound on the case-insensitive
            keyword/title similarity ratio. Defaults to 0.2.

    Returns:
        A list of dicts with keys "title" (submission title), "comment"
        (comment body) and "time" (submission creation time in UTC, formatted
        as 'YYYY-MM-DD HH:MM'). The list is empty when no submission matches.
    """
    # Imported locally because the module only imports `datetime` itself.
    from datetime import timezone

    keyword_lower = keyword.lower()
    posts = []

    search_results = reddit.subreddit(subreddit_name).search(keyword, sort="new", time_filter="all")
    for submission in search_results:
        title = submission.title
        ratio = SequenceMatcher(None, keyword_lower, title.lower()).ratio()
        if ratio <= min_similarity:
            continue

        # The timestamp belongs to the submission, so format it once rather
        # than once per comment. datetime.utcfromtimestamp() is deprecated
        # since Python 3.12; the timezone-aware call yields the same string.
        posted_at = datetime.fromtimestamp(
            submission.created_utc, tz=timezone.utc
        ).strftime('%Y-%m-%d %H:%M')

        # Expand up to 50 "load more comments" stubs before flattening the tree.
        submission.comments.replace_more(limit=50)
        posts.extend(
            {"title": title, "comment": comment.body, "time": posted_at}
            for comment in submission.comments.list()
        )

        # Only the first matching submission is analysed.
        break

    return posts

# Callback Function to Update Graphs and Table Based on Keyword Input
def _empty_dashboard():
    """Returns blank placeholders for the five figures plus an empty table."""
    return go.Figure(), go.Figure(), go.Figure(), go.Figure(), go.Figure(), []


def _stacked_sentiment_bars(df, x_column, title, xaxis_title):
    """Builds a stacked bar chart of comment counts per `x_column` value, one trace per sentiment."""
    counts = (
        df.groupby([x_column, 'comment_sentiment_category'])
        .size()
        .reset_index(name='comment_count')
    )
    fig = go.Figure()
    for sentiment in ['Positive', 'Negative']:
        subset = counts[counts['comment_sentiment_category'] == sentiment]
        fig.add_trace(go.Bar(
            x=subset[x_column],
            y=subset['comment_count'],
            name=sentiment
        ))
    fig.update_layout(title=title,
                      xaxis_title=xaxis_title,
                      yaxis_title='Number of Comments',
                      barmode='stack')
    return fig


@app.callback(
    [Output('sentiment-graph', 'figure'),
     Output('pie-chart', 'figure'),
     Output('intensity-graph', 'figure'),
     Output('box-plot', 'figure'),
     Output('emotion-graph', 'figure'),  # Output for emotion graph
     Output('comments-table', 'data')],
    [Input('search-button', 'n_clicks')],
    [dash.dependencies.State('keyword-input', 'value')]
)
def update_graph(n_clicks, keyword):
    """
    Rebuilds every dashboard output for the keyword entered by the user.

    Triggered by the Search button. Fetches Reddit comments for the keyword,
    preprocesses them, runs the sentiment and emotion pipelines, and derives
    the charts and table rows from the results.

    Args:
        n_clicks: Number of times the Search button has been clicked.
        keyword: Current value of the keyword input.

    Returns:
        A 6-tuple: sentiment bar chart, sentiment pie chart, intensity bar
        chart, score box plot, emotion bar chart, and the list of table row
        dicts. Empty figures and an empty list are returned when the button
        has not been clicked, the keyword is empty, or no comments were found.
    """
    # `not n_clicks` also covers None, which `n_clicks > 0` would raise on.
    if not n_clicks or not keyword:
        return _empty_dashboard()

    posts = fetch_reddit_posts(keyword)
    if not posts:
        return _empty_dashboard()

    df = pd.DataFrame(posts)
    df['processed_comment'] = df['comment'].apply(preprocess_text)
    texts = df['processed_comment'].tolist()

    # truncation=True: comments longer than the model's maximum input length
    # would otherwise make the pipeline raise and fail the whole callback.
    sentiment_results = sentiment_analyzer(texts, truncation=True)
    df['comment_sentiment'] = [result['label'] for result in sentiment_results]
    df['comment_score'] = [result['score'] for result in sentiment_results]
    df['comment_sentiment_category'] = df['comment_sentiment'].apply(
        lambda x: 'Positive' if x == 'POSITIVE' else 'Negative')
    df['comment_intensity'] = df['comment_score'].apply(categorize_intensity)

    # Emotion detection: keep the highest-scoring label for each comment.
    emotion_results = emotion_analyzer(texts, truncation=True)
    df['comment_emotion'] = [max(result, key=lambda x: x['score'])['label']
                             for result in emotion_results]

    # Sentiment counts based on post title
    fig_sentiment = _stacked_sentiment_bars(
        df, 'title',
        title='Comment Counts Based on Sentiment',
        xaxis_title='Post Title')

    # Pie chart of sentiment share
    sentiment_share = df['comment_sentiment_category'].value_counts()
    fig_pie = go.Figure(data=[go.Pie(labels=sentiment_share.index,
                                     values=sentiment_share.values,
                                     hole=0.3)])
    fig_pie.update_layout(title='Share of Comments Based on Sentiment')

    # Intensity counts for positive and negative sentiments
    fig_intensity = _stacked_sentiment_bars(
        df, 'comment_intensity',
        title='Intensity of Comments Based on Sentiment',
        xaxis_title='Intensity Level')

    # Box plot of sentiment scores
    fig_box = go.Figure()
    fig_box.add_trace(go.Box(
        y=df['comment_score'],
        name='Sentiment Scores',
        boxmean=True,  # Display the mean in the box plot
        marker_color='lightseagreen'
    ))
    fig_box.update_layout(
        title='Box and Whisker Plot of Sentiment Scores',
        yaxis_title='Sentiment Score',
        xaxis_title='Posts',
        showlegend=False
    )

    # Emotion frequency graph
    emotion_counts = df['comment_emotion'].value_counts()
    fig_emotion = go.Figure(data=[go.Bar(x=emotion_counts.index, y=emotion_counts.values)])
    fig_emotion.update_layout(title='Frequency of Emotions in Comments',
                              xaxis_title='Emotion',
                              yaxis_title='Count')

    # Data for the comments table
    table_columns = ['comment', 'comment_sentiment_category', 'comment_intensity', 'comment_emotion']
    table_data = df[table_columns].to_dict('records')

    return fig_sentiment, fig_pie, fig_intensity, fig_box, fig_emotion, table_data

# Run the app
if __name__ == '__main__':
    # Start the Dash development server with debug mode off.
    # `app.run` replaces `app.run_server`, a deprecated alias that newer Dash
    # releases no longer provide; the install step above does not pin Dash.
    app.run(debug=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.

`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



<IPython.core.display.Javascript object>