## SI507 Final Project: Sentiment Analysis on Subreddit r/Obesity Comments

#### Approximate Difficulty
- Web API you haven’t used before that requires OAuth: 6
    - https://www.reddit.com/dev/api/
- Nutrition, Physical Activity, and Obesity - Behavioral Risk Factor Surveillance System Data: 2
    - [Data Page](https://catalog.data.gov/dataset/nutrition-physical-activity-and-obesity-behavioral-risk-factor-surveillance-system)

- Sentiment analysis --> feed data from tree directly to perform analyses
- Networkx => save trees in form of .json data
- [PRAW 7.7.1 Documentation](https://praw.readthedocs.io/en/stable/getting_started/quick_start.html)
- [Working with Reddit Comment Trees](https://www.reddit.com/r/datasets/s/c34hswA2td)
- [Creating Reply Networks from Reddit Comment Threads](https://jrashford.com/2022/01/21/creating-reply-networks-from-reddit-comment-threads/)

In [1]:
# imports
import copy
import json
import textwrap
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import networkx as nx
import praw

# go to reddit.com/prefs/apps to edit app information
# app name: freedom_or_chemicals
# client_id: VnWlWdqCq0If8VbJAqJRzw
# client_secret: SuhVQ8O5zPcQ9qw0BSwGsi5pQn_aBQ

In [None]:
# Initialize the Reddit instance with OAuth2 credentials to access Reddit API.
# Only a client ID, client secret and user agent are passed (no username or
# password), so this is presumably a read-only client - confirm against PRAW docs.
# NOTE(review): the client ID and secret are hard-coded here; consider loading
# them from environment variables or a praw.ini file, and rotating this secret
# if the notebook has been shared.

reddit = praw.Reddit(
    client_id="VnWlWdqCq0If8VbJAqJRzw",
    client_secret="SuhVQ8O5zPcQ9qw0BSwGsi5pQn_aBQ",
    user_agent="script for obesity research 1.0",
)

### Scrape Data via Reddit API (don't need to run if JSON files exist)

In [None]:
# Fetch the top n posts from a given subreddit

# top_comment_list = []

# # define target subreddit for scraping
# subreddit = reddit.subreddit("Obesity")

# for submission in subreddit.top(limit=100):
#     # append submission info into list
#     submission_info = {"submission_id": submission.id, "title": submission.title}
#     top_comment_list.append(submission_info)

#     # retrieve any comments that may be collapsed
#     submission.comments.replace_more(limit=None)

#     # append comments to list that will be nested
#     comments_for_submission = []
#     for comment in submission.comments.list():
#         # only comment body
#         # other options
#         # .author
#         # .score - upvotes or downvotes
#         # .parent_id
#         comment_info = {
#             "body": comment.body,
#             "timestamp": comment.created_utc,
#             "parent": comment.parent_id,
#             "comment_id": comment.id
#         }
#         comments_for_submission.append(comment_info)
#     # update last position of the list with nested information
#     top_comment_list[-1]["comments"] = comments_for_submission

# # save scraped contents in json file
# with open("top_comment_data.json", "w") as json_file:
#     json.dump(top_comment_list, json_file, indent=2)

*It seems that there are API call limitations, so set the post limit to 100 for now.*

In [None]:
# # Now fetch the newest n posts from a given subreddit
# newest_comment_list = []

# # define target subreddit for scraping
# subreddit = reddit.subreddit("Obesity")

# for submission in subreddit.new(limit=100):
#     # append submission info into list
#     submission_info = {"submission_id": submission.id, "title": submission.title}
#     newest_comment_list.append(submission_info)

#     # retrieve any comments that may be collapsed
#     submission.comments.replace_more(limit=None)

#     # append comments to list that will be nested
#     comments_for_submission = []
#     for comment in submission.comments.list():
#         # only comment body
#         # other options
#         # .author
#         # .score - upvotes or downvotes
#         # .parent_id
#         comment_info = {
#             "body": comment.body,
#             "timestamp": comment.created_utc,
#             "parent": comment.parent_id,
#             "comment_id": comment.id,
#         }
#         comments_for_submission.append(comment_info)
#     # update list item with nested information
#     newest_comment_list[-1]["comments"] = comments_for_submission

# # save scraped contents in json file
# with open("new_comment_data.json", "w") as json_file:
#     json.dump(newest_comment_list, json_file, indent=2)

### Create Trees from JSON Data for Each Post

In [None]:
# DON'T TOUCH FOR NOW
def buildCommentHierarchy(submission):
    '''Return a hierarchy skeleton for a single reddit post.

    Reads the "submission_id", "title" and "comments" keys of `submission`
    (a KeyError is raised if any is missing, or if a comment has no
    "comment_id") and returns a dict of the form
    {"submission_id": ..., "title": ..., "comments": {}}.

    The loop that was meant to fill "comments" by matching each comment's
    parent ID against comment IDs is commented out below, so the returned
    "comments" dict is currently ALWAYS empty.

    NOTE(review): a second function with this same name, taking a list of
    comments instead of a post dict, is defined further down in this file;
    whichever definition was executed last is the one in effect.
    '''
    submission_id = submission["submission_id"]
    title = submission["title"]
    comments = submission["comments"]

    # create a dictionary to store comments by their IDs
    # (only referenced by the disabled loop below; unused by the active code)
    comment_dict = {comment["comment_id"]: comment for comment in comments}

    # create a dictionary to store hierarchies (for top-level comments)
    hierarchy = {"submission_id": submission_id, "title": title, "comments": {}}

    # NOTE(review): disabled implementation. As written it tests
    # `parent_id in hierarchy["comments"]` with the raw parent value, while the
    # keys stored there are bare comment IDs; the "t1_" prefix is only stripped
    # in the else-branch, so that membership test looks like it can never match
    # for prefixed parent IDs - verify before re-enabling.
    # # Iterate over comments to build the hierarchy
    # for comment in comments:
    #     comment_id = comment["comment_id"]
    #     parent_id = comment.get("parent", None)

    #     # If the comment has no parent or its parent is the post itself, it's a top-level comment
    #     if parent_id is None or parent_id == "t3_" + submission_id:
    #         hierarchy["comments"][comment_id] = {"data": comment, "replies": {}}
    #     else:
    #         # If the parent is already in the hierarchy, add the comment as a reply
    #         if parent_id in hierarchy["comments"]:
    #             parent_comment = hierarchy["comments"][parent_id]
    #             # Check if 'replies' key exists, if not, create it
    #             if "replies" not in parent_comment:
    #                 parent_comment["replies"] = {}
    #             parent_comment["replies"][comment_id] = {"data": comment, "replies": {}}
    #         else:
    #             # If the parent is not in the hierarchy, add both parent and comment
    #             parent_id_without_prefix = parent_id.replace("t1_", "")
    #             parent_comment = comment_dict.get(parent_id_without_prefix, None)
    #             if parent_comment is not None:
    #                 hierarchy["comments"][parent_id_without_prefix] = {
    #                     "data": parent_comment,
    #                     "replies": {comment_id: {"data": comment, "replies": {}}},
    #                 }

    return hierarchy

In [2]:
def buildCommentHierarchy(comments):
    '''Arrange a flat list of comment dicts into a nested reply tree.

    Each comment must carry a 'comment_id' and a 'parent' value. The first
    three characters of 'parent' (a type prefix such as 't1_' or 't3_') are
    dropped before it is matched against the comment IDs in the list.

    A comment whose parent is not one of the listed comments becomes a root
    of the tree; every other comment is appended to its parent's 'comments'
    list, which is created on first use. The input is deep-copied first, so
    the caller's dicts are never modified.

    Returns the list of root comment dicts, in input order.
    '''
    # work on private copies so nesting children does not touch the input
    nodes = copy.deepcopy(comments)

    # index every copied comment by its unique ID
    nodes_by_id = {node['comment_id']: node for node in nodes}

    roots = []
    for node in nodes:
        current = nodes_by_id[node['comment_id']]
        parent_key = node['parent'][3:]  # strip the 3-character type prefix

        if parent_key in nodes_by_id:
            # reply: hang it under its parent
            nodes_by_id[parent_key].setdefault("comments", []).append(current)
        else:
            # parent is not a known comment, so treat it as top-level
            roots.append(current)

    return roots

In [3]:
# Load JSON data from file into `data` (the cells below iterate it as a list
# of post dicts, each with a 'comments' entry).
# NOTE(review): the commented-out scraper cell writes "top_comment_data.json"
# to the working directory, while this reads from "json_data/" - presumably
# the file was moved there by hand; confirm.
with open("json_data/top_comment_data.json", "r") as file:
    data = json.load(file)

In [10]:
# replace the 'comments' part of the data with organized trees
# (each post dict in `data` is modified in place: its 'comments' value is
# overwritten with whatever buildCommentHierarchy returns for it)
for post in data:
    post['comments'] = buildCommentHierarchy(post['comments'])

In [None]:

# Build one entry per post, keeping the post's own fields and storing the
# nested reply tree under 'comments'.
# buildCommentHierarchy(comments) expects the post's list of comments, not the
# whole post dict: iterating a post dict yields its string keys, and indexing
# those with 'comment_id' raises TypeError. Pass p['comments'], as the
# in-place replacement cell does, and copy the post so `data` is not touched.
all_comment_trees = []
for p in data:
    post_with_tree = dict(p)
    post_with_tree['comments'] = buildCommentHierarchy(p['comments'])
    all_comment_trees.append(post_with_tree)

### Plot the Tree Structures with NetworkX

In [None]:
# Keep only the entries that have a 'comments' key holding a non-empty value,
# so posts without any comments are not plotted.
posts_with_comments = [
    entry
    for entry in all_comment_trees
    if 'comments' in entry and entry['comments']
]

In [None]:
def convert_timestamp(timestamp):
    '''Format a POSIX timestamp as a 'YYYY-MM-DD' date string in UTC.

    `timestamp` is a number of seconds since the epoch (int or float).
    The conversion is done with an explicit UTC timezone rather than
    datetime.utcfromtimestamp(), which is deprecated since Python 3.12;
    the resulting date string is the same.
    '''
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d')

def splitTitle(title, max_width=75):
    '''Wrap a long post title onto several lines.

    The title is broken at whitespace so that no line is longer than
    `max_width` characters, and the lines are joined with newlines.
    '''
    return "\n".join(textwrap.wrap(title, width=max_width))

In [None]:
def addReplies(G, parent_id, replies_data, parent_timestamp):
    '''Recursively add the replies in `replies_data` to graph `G` under `parent_id`.

    `replies_data` maps a reply ID to a dict that may hold a 'data' dict
    (with a 'timestamp') and a 'replies' dict of the same shape. Each reply
    becomes a node labelled with its formatted timestamp and gets an edge
    from `parent_id`. When a reply has no timestamp, `parent_timestamp` is
    used for the label; the same `parent_timestamp` value is handed down
    unchanged to every deeper level.
    '''
    for node_id, node in replies_data.items():
        payload = node.get('data', {})
        stamp = payload.get('timestamp', parent_timestamp)
        G.add_node(node_id, label=convert_timestamp(stamp))
        # edge runs from the parent to the reply
        G.add_edge(parent_id, node_id)
        children = node.get('replies', {})
        addReplies(G, node_id, children, parent_timestamp)

In [None]:
def _addCommentListReplies(G, parent_id, replies, fallback_timestamp):
    '''Add list-format replies (and their descendants) to G beneath parent_id.'''
    for reply in replies:
        reply_id = reply['comment_id']
        label = convert_timestamp(reply.get('timestamp', fallback_timestamp))
        G.add_node(reply_id, label=label)
        G.add_edge(parent_id, reply_id)  # Connect parent to replies
        _addCommentListReplies(G, reply_id, reply.get('comments', []), fallback_timestamp)


def plotGraph(submission):
    '''Draw the comment tree of one post as a directed graph.

    `submission` must provide 'submission_id', 'title' and 'comments'.
    Two shapes of 'comments' are accepted:

    - a dict mapping comment ID -> {'data': {...}, 'replies': {...}}
      (the shape addReplies walks), or
    - a list of comment dicts, each with a 'comment_id' and optionally a
      'timestamp' and a nested 'comments' list of replies (the shape
      produced by buildCommentHierarchy(comments)).

    Previously only the dict shape worked; a list raised AttributeError on
    `.items()`.

    Top-level comments are labelled with their ID and linked from the
    submission node; replies are labelled with their formatted timestamp,
    falling back to the top-level comment's timestamp (or 0) when missing.
    The figure is shown with plt.show(); nothing is returned.
    '''
    G = nx.DiGraph()

    submission_id = submission['submission_id']
    title = submission['title']
    comments = submission['comments']

    if isinstance(comments, dict):
        # dict-of-nodes shape: {comment_id: {'data': ..., 'replies': ...}}
        for comment_id, comment_data in comments.items():
            if comment_id != 'data':
                G.add_node(comment_id, label=comment_id)
                G.add_edge(submission_id, comment_id)  # Connect submission to comments
                addReplies(G, comment_id, comment_data.get('replies', {}), comment_data.get('data', {}).get('timestamp', 0))
    else:
        # list shape: [{'comment_id': ..., 'comments': [...]}, ...]
        for comment in comments:
            comment_id = comment['comment_id']
            G.add_node(comment_id, label=comment_id)
            G.add_edge(submission_id, comment_id)  # Connect submission to comments
            _addCommentListReplies(G, comment_id, comment.get('comments', []), comment.get('timestamp', 0))

    pos = nx.spring_layout(G)
    # no edge is given a 'label' attribute above, so this dict is empty and
    # the edge-label call below currently draws nothing
    labels = nx.get_edge_attributes(G, 'label')
    node_labels = nx.get_node_attributes(G, 'label')

    nx.draw(G, pos, with_labels=True, labels=node_labels, font_weight='bold')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)
    plt.title(f"Post ID: {submission_id}\nPost Title: {splitTitle(title)}")
    plt.show()

In [None]:
# Draw one graph per post that has comments; plotGraph shows each figure
# itself, so this produces as many figures as there are posts in the list.
for p in posts_with_comments:
    plotGraph(p)

In [None]:
# Plot a single post (the third entry) on its own.
# NOTE(review): index 2 is hard-coded; this raises IndexError when fewer than
# three posts have comments.
plotGraph(posts_with_comments[2])

In [None]:
# node_colors = ['hotpink' if isinstance(node, str) else 'skyblue' for node in reddit_tree.nodes]