In [1]:
import os
import requests
import pandas as pd
import boto3
import requests
import sagemaker
from sagemaker.huggingface import HuggingFacePredictor

# Weights for Social Signals rank.
# NOTE(review): neither weight is referenced in the cells visible here;
# presumably they are consumed where "social_signals_rank" is computed — confirm.
SUBMISSION_WEIGHT = 0.35
COMMENT_WEIGHT = 0.65

# For submission data.
# NOTE(review): not referenced in the visible cells (get_top_submissions uses
# its own time_filter default and applies no limit) — confirm where these are used.
SUBMISSION_TIME_FILTER = "day"
SUBMISSION_LIMIT = 15

# For comments data: passed by filter_submission_data to
# process_submission_data as comment_sort / comment_limit.
COMMENT_SORT = "top"
COMMENT_LIMIT = 15

# For DB.
# NOTE(review): no database access is visible in these cells — confirm usage.
SCHEMA = "social_signals_dev"
TABLE_NAME = "social_signals_poc"

# Minimum model score for an NER entity or an emotion label to be accepted.
CLASSIFICATION_THRESHOLD = 0.75
# Placeholder written when an entity bucket or a comments summary is missing.
NONE_FILLER = "000"

In [2]:
# Model queried through the Hugging Face hosted inference API for zero-shot
# classification of submission titles.
# NOTE(review): CATEGORIES and HUGGINGFACE_TOKEN are used by the functions in
# this cell but are not defined in any cell visible here — they must exist in
# the kernel before those functions are called.
ZERO_SHOT_MODEL_ID = "facebook/bart-large-mnli"

# AWS region and credentials used to build the SageMaker session.
# os.getenv returns None when a variable is unset.
REGION_NAME = "us-east-1"
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")


def get_sagemaker_response(payload, endpoint_name):
    """Send `payload` to a SageMaker Hugging Face endpoint and return its prediction.

    Args:
        payload: Request body passed unchanged to `predictor.predict`.
        endpoint_name: Name of the deployed SageMaker endpoint to invoke.

    Returns:
        Whatever `predictor.predict` returns, or None when the prediction call
        raises. Callers in this notebook check the result type before using it,
        so failures are treated as best-effort rather than fatal.
    """
    sagemaker_session = sagemaker.Session(
        boto3.session.Session(
            region_name=REGION_NAME,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        )
    )

    predictor = HuggingFacePredictor(
        endpoint_name=endpoint_name, sagemaker_session=sagemaker_session
    )

    try:
        return predictor.predict(payload)
    except Exception as error:
        # Deliberately best-effort: report the failure, including its cause so
        # it can be diagnosed, and let the caller fall back to its defaults.
        print(
            f"Could not get SageMaker prediction for endpoint {endpoint_name} and payload {payload}"
        )
        print(f"Error was {error!r}")
        return None


def get_emotion(text):
    """Run `text` through the emotion SageMaker endpoint.

    Returns the value produced by `get_sagemaker_response` for that endpoint.
    """
    return get_sagemaker_response(
        {"inputs": text, "options": {"wait_for_model": True}},
        "social-signals-emotion-2023-06-03-05-35-02-830",
    )


def get_ner(text):
    """Run `text` through the NER SageMaker endpoint.

    Returns the value produced by `get_sagemaker_response` for that endpoint.
    """
    return get_sagemaker_response(
        {"inputs": text, "options": {"wait_for_model": True}},
        "social-signals-ner-2023-06-04-03-05-56-782",
    )


def get_categories(text):
    """Send `text` with the candidate labels in CATEGORIES to a SageMaker endpoint.

    Returns the value produced by `get_sagemaker_response`.

    NOTE(review): the endpoint name below is the same one `get_ner` uses, which
    looks like a copy-paste leftover for a zero-shot request — confirm the
    intended endpoint. CATEGORIES is also not defined in the cells visible here.
    """
    request_body = {
        "inputs": text,
        "parameters": {"candidate_labels": CATEGORIES},
        "options": {"wait_for_model": True},
    }
    return get_sagemaker_response(
        request_body, "social-signals-ner-2023-06-04-03-05-56-782"
    )


def get_huggingface_zero_shot_classificaiton_response(text, timeout=120):
    """Classify `text` against CATEGORIES with the hosted Hugging Face model.

    Args:
        text: Text to classify.
        timeout: Seconds to wait for the HTTP request before giving up. The
            request sets "wait_for_model", so the default is generous.

    Returns:
        The decoded JSON response on success, or None when the request fails,
        times out, returns a non-OK status, or returns a body that is not JSON.
        Failures are reported with print and never raised, so callers can fall
        back to a default category.
    """
    api_url = f"https://api-inference.huggingface.co/models/{ZERO_SHOT_MODEL_ID}"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}

    payload = {
        "inputs": text,
        "parameters": {"candidate_labels": CATEGORIES},
        "options": {"wait_for_model": True},
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=timeout
        )
    except requests.RequestException as error:
        print(f"Could not get Huggingface OK response for {ZERO_SHOT_MODEL_ID}")
        print(f"Error was {error!r}")
        return None

    if not response.ok:
        print(f"Could not get Huggingface OK response for {ZERO_SHOT_MODEL_ID}")
        # Include status and body: the bare Response repr hides the API's error text.
        print(f"Response was {response.status_code}: {response.text}")
        return None

    try:
        return response.json()
    except ValueError as error:
        # requests raises a ValueError subclass when the body is not valid JSON.
        print(f"Could not get Huggingface OK response for {ZERO_SHOT_MODEL_ID}")
        print(f"Error was {error!r}")
        return None


In [3]:
import openai
# Same placeholder value as the NONE_FILLER defined in the first cell;
# redefined here, so the two definitions must be kept in sync.
NONE_FILLER = "000"

# API key is read from the lowercase "openai_key" environment variable;
# os.getenv yields None when it is unset.
openai.api_key = os.getenv("openai_key")


def get_openai_summary(text):
    """Ask OpenAI for a one-sentence summary of `text`.

    Args:
        text: The text to summarize.

    Returns:
        The summary with surrounding whitespace removed, or NONE_FILLER when
        the API call fails, the response has no usable completion, or the
        completion is blank.

    NOTE(review): `openai.Completion` and "text-davinci-003" belong to the
    legacy completions interface — confirm both are still available for the
    installed SDK and account.
    """
    prompt = f"Summarize the following text in one sentence:\n\n{text}"

    try:
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0.7,
            max_tokens=60,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=1,
        )
        summary = response["choices"][0]["text"]
    except Exception as error:
        # Covers both API failures and a response without the expected
        # choices/text structure; callers treat NONE_FILLER as "no summary".
        print("Could not get OpenAI response")
        print(f"Error was {error!r}")
        return NONE_FILLER

    # Strip the whitespace that surrounds the completion text, and fall back to
    # the placeholder when nothing is left.
    summary = (summary or "").strip()
    return summary if summary else NONE_FILLER

In [4]:
import praw
from praw.models import MoreComments

# Reddit API credentials, read from lowercase environment variables and passed
# to praw.Reddit in get_reddit. os.getenv yields None for any that are unset.
REDDIT_ID = os.getenv("reddit_id")
REDDIT_SECRET = os.getenv("reddit_secret")
REDDIT_USERNAME = os.getenv("reddit_username")
REDDIT_PASSWORD = os.getenv("reddit_password")


def get_reddit():
    """Build a `praw.Reddit` client from the module-level Reddit credentials."""
    return praw.Reddit(
        user_agent="SocialSignals/1.0",
        client_id=REDDIT_ID,
        client_secret=REDDIT_SECRET,
        username=REDDIT_USERNAME,
        password=REDDIT_PASSWORD,
    )


def get_subreddit(subreddit_name):
    """Return the subreddit object for `subreddit_name` from a fresh Reddit client."""
    return get_reddit().subreddit(subreddit_name)


def get_top_submissions(subreddit_name, time_filter="day"):
    """Return the subreddit together with its top submissions.

    time_filter: Can be one of: "all", "day", "hour", "month", "week", or "year"

    Returns a `(subreddit, top_submissions)` tuple, where `top_submissions` is
    the result of `subreddit.top(time_filter=time_filter)`.
    """
    target = get_subreddit(subreddit_name)
    return target, target.top(time_filter=time_filter)


def get_comments(submission, comment_sort="top"):
    """Return the submission's comments with "more comments" stubs removed.

    comment_sort: Can be one of: "confidence", "controversial", "new", "old", "q&a", and "top"
    """
    # The sort order has to be set before the comments are first touched,
    # because calling replace_more() accesses them.
    submission.comment_sort = comment_sort

    comment_forest = submission.comments
    # limit=0 drops entries such as "load more comments" and
    # "continue this thread" instead of resolving them.
    comment_forest.replace_more(limit=0)

    return submission.comments


def get_submission_data(subreddit, submission):
    """Build the record for one submission from its title and Reddit stats.

    The title is sent to the NER endpoint; entities tagged B-ORG, B-PER or
    B-LOC with a score of at least CLASSIFICATION_THRESHOLD are collected into
    the "Organization", "Person" and "Location" fields (NONE_FILLER when a
    bucket is empty). The title is then zero-shot classified into "categories".

    Returns:
        A dict with the submission's identifiers, entity fields, category and
        subreddit/submission counters, or None when no qualifying entity was
        found in the title.
    """
    submission_data = {
        "subreddit_name": subreddit.display_name,
        "submission_id": submission.id,
        "submission_title": submission.title,
    }
    title = submission_data["submission_title"]
    print(f"Submission title: {title}")

    print("Getting entities for the title...")
    ner_output = get_ner(title)

    # NER tag -> output column; insertion order fixes the order of the columns.
    tag_to_column = {"B-ORG": "Organization", "B-PER": "Person", "B-LOC": "Location"}
    words_by_column = {column: [] for column in tag_to_column.values()}
    if isinstance(ner_output, list):
        for item in ner_output:
            column = tag_to_column.get(item["entity"])
            if column is not None and item["score"] >= CLASSIFICATION_THRESHOLD:
                words_by_column[column].append(item["word"])

    if not any(words_by_column.values()):
        print("Expected entities not found. Quitting...")
        return None

    for column, words in words_by_column.items():
        submission_data[column] = ", ".join(words) if words else NONE_FILLER

    print("Getting categories for the title...")
    zero_shot = get_huggingface_zero_shot_classificaiton_response(title)

    # Fall back to "Miscellaneous" unless the best label clears the 0.25 cut-off.
    category = "Miscellaneous"
    if isinstance(zero_shot, dict):
        best_label = zero_shot["labels"][0]
        best_score = zero_shot["scores"][0]
        if best_score >= 0.25:
            category = best_label
    submission_data["categories"] = category

    submission_data["subreddit_subscribers"] = subreddit.subscribers
    submission_data["submission_score"] = submission.score
    submission_data["submission_num_comments"] = submission.num_comments

    return submission_data


def process_submission_data(
    submission_id, submission_title, comment_sort="top", comment_limit=10
):
    """Collect emotion and summary signals for one Reddit submission.

    Args:
        submission_id: Reddit id of the submission to fetch.
        submission_title: Expected title; checked against the fetched submission.
        comment_sort: Sort order applied to the comments (see `get_comments`).
        comment_limit: Stop after this many comments have been kept.

    Returns:
        A dict that always contains:
          - "title_emotion": the top emotion label for the title when its score
            reaches CLASSIFICATION_THRESHOLD, otherwise "neutral" (also used
            when the emotion endpoint returns nothing usable);
          - "comments_emotion": the most frequent non-neutral emotion among the
            kept comments, or "neutral" when none qualified;
          - "comments_summary": an OpenAI summary of the kept comments, or
            NONE_FILLER when no comment was kept.

    Raises:
        AssertionError: if the fetched submission's title differs from
            `submission_title`.
    """
    reddit = get_reddit()
    submission = reddit.submission(submission_id)

    assert submission.title == submission_title, "Miss-match in submission title!"

    # Start from the neutral default so "title_emotion" is present in every
    # record, even when the emotion endpoint fails or returns an empty list.
    submission_data = {"title_emotion": "neutral"}

    print("Getting emotion for the title...")
    title_emotion = get_emotion(submission_title)
    if isinstance(title_emotion, list) and title_emotion:
        top_title_emotion = title_emotion[0]
        title_emotion_prediction = top_title_emotion["label"]
        title_emotion_score = top_title_emotion["score"]
        if title_emotion_score >= CLASSIFICATION_THRESHOLD:
            submission_data["title_emotion"] = title_emotion_prediction

    print("Going over comments...")
    top_level_comments = get_comments(submission=submission, comment_sort=comment_sort)
    comments_emotion_counter, comments = {}, []
    for top_level_comment in top_level_comments:
        if isinstance(top_level_comment, MoreComments):
            continue

        # We don't want stickied comments- mostly from Mods
        if top_level_comment.stickied:
            print("Found stickied comment; skipping...")
            continue

        # We don't want comments from bots (author is falsy for deleted accounts)
        comment_author = top_level_comment.author
        if comment_author and "bot" in comment_author.name.lower():
            print(f"Found comment from bot {comment_author}; skipping...")
            continue

        comment = top_level_comment.body
        comment_emotion = get_emotion(comment)

        if isinstance(comment_emotion, list) and comment_emotion:
            top_comment_emotion = comment_emotion[0]
            comment_emotion_prediction = top_comment_emotion["label"]

            # Neutral is abundant and not interesting: drop the comment entirely
            if comment_emotion_prediction == "neutral":
                continue

            # Low-confidence predictions are dropped as well; the score is
            # rounded to two decimals before the comparison.
            comment_emotion_score = round(top_comment_emotion["score"], 2)
            if comment_emotion_score < CLASSIFICATION_THRESHOLD:
                continue

            comments_emotion_counter[comment_emotion_prediction] = (
                comments_emotion_counter.get(comment_emotion_prediction, 0) + 1
            )

        # Comments whose emotion could not be determined are still kept for the
        # summary; they just do not vote on the dominant emotion.
        comments.append(comment)
        if len(comments) == comment_limit:
            break
    print(f"Found {len(comments)} comments")

    if comments_emotion_counter:
        # On ties, max() keeps the emotion that was counted first.
        submission_data["comments_emotion"] = max(
            comments_emotion_counter, key=comments_emotion_counter.get
        )
    else:
        submission_data["comments_emotion"] = "neutral"

    if comments:
        summary_text = " ".join(comments)
        submission_data["comments_summary"] = get_openai_summary(summary_text)
    else:
        submission_data["comments_summary"] = NONE_FILLER

    return submission_data

In [10]:
from datetime import datetime


def _is_none_filler(value):
    """Return True when `value` is the "no entity" placeholder in any stored form."""
    text = str(value).strip()
    if text == NONE_FILLER:
        return True
    try:
        # A CSV round-trip can turn the placeholder "000" into the number 0,
        # so also accept values that are numerically equal to it.
        return float(text) == float(NONE_FILLER)
    except ValueError:
        return False


def filter_submission_data(
    year, month, day, time, df, submission_ids, entity, category, top_n=3
):
    """Pick up to `top_n` submissions for one entity bucket and category.

    Rows of `df` are kept when their `entity` column holds a real value (not
    the NONE_FILLER placeholder) and their "categories" value equals
    `category`. Each kept row is enriched via `process_submission_data`;
    submissions without a comments summary are skipped.

    Args:
        year, month, day, time: Partition values copied into every record.
        df: DataFrame with at least "submission_id", "submission_title",
            "subreddit_name", "social_signals_rank", "categories" and the
            `entity` column.
        submission_ids: List of already-selected submission ids. It is checked
            to avoid duplicates and is appended to in place, so the same list
            can be shared across calls.
        entity: Name of the entity column to bucket by (e.g. "Location").
        category: Category value the row must match.
        top_n: Maximum number of records to return.

    Returns:
        A list of record dicts, in the row order of `df`.
    """
    print(f"Processing entity {entity} and category {category}")

    # Compare through _is_none_filler rather than `!= NONE_FILLER`: after
    # read_csv the placeholder shows up as 0, which a plain comparison with
    # "000" would let through.
    has_entity = ~df[entity].map(_is_none_filler).astype(bool)
    df = df[has_entity & (df["categories"] == category)]
    print(f"Shape of the df is {df.shape}")

    submission_data_list = []
    for _, row in df.iterrows():
        submission_id = row["submission_id"]
        if submission_id in submission_ids:
            continue

        submission_title = row["submission_title"]
        submission_data = process_submission_data(
            submission_id=submission_id,
            submission_title=submission_title,
            comment_sort=COMMENT_SORT,
            comment_limit=COMMENT_LIMIT,
        )

        # No usable comments means nothing to report for this submission.
        if submission_data["comments_summary"] == NONE_FILLER:
            continue

        submission_ids.append(submission_id)

        submission_data["bucket"] = entity
        submission_data["year"] = year
        submission_data["month"] = month
        submission_data["day"] = day
        submission_data["time"] = time
        submission_data["title"] = submission_title
        submission_data["social_signals_rank"] = row["social_signals_rank"]

        subreddit_name = row["subreddit_name"]
        # NOTE(review): Reddit permalinks normally contain "/comments/" before
        # the submission id — confirm this shorter form resolves.
        submission_data[
            "source"
        ] = f"https://reddit.com/r/{subreddit_name}/{submission_id}"

        submission_data["tags"] = row[entity]

        submission_data_list.append(submission_data)
        if len(submission_data_list) == top_n:
            break
    return submission_data_list

# Partition of the combined Reddit dataset to process; the values are strings
# because they are interpolated directly into the S3 key below.
year = "2023"
month = "06"
day = "03"
time = "220000"

# NOTE(review): reading an s3:// path with pandas presumably relies on an S3
# filesystem backend and AWS credentials being available — confirm.
# NOTE(review): read_csv infers dtypes, and the displayed frame shows the
# entity placeholder as 0 rather than "000" — verify against NONE_FILLER.
input_path = f"s3://social-signals-dev-data/reddit/year={year}/month={month}/day={day}/time={time}/combined.csv"
df = pd.read_csv(input_path)
print(df.head())
print(f"Shape of the combined df is {df.shape}")

# Entity buckets and categories to report on. submission_ids is shared across
# all filter_submission_data calls so a submission is selected at most once.
entities = ["Location"]
submission_ids = []
for entity in entities:
    for category in ["Politics"]:
        # Each call triggers Reddit, SageMaker and OpenAI requests per row,
        # so this loop can run for a long time.
        submission_data_list = filter_submission_data(
                        year,
                        month,
                        day,
                        time,
                        df,
                        submission_ids,
                        entity=entity,
                        category=category,
                    )
        for submission_data in submission_data_list:
            print(submission_data)

  subreddit_name submission_id   
0         movies       13zn4yg  \
1     television       13zrexg   
2       politics       13zae3e   
3         movies       13zlw1c   
4       politics       13zjuxa   

                                    submission_title Organization   Person   
0                  What's your top 3 Stanley Kubrick            0  Stanley  \
1  Who are TV actors/actresses that you think are...            0        0   
2  Federal Judge rules Tennessee drag ban is unco...            0        0   
3             Gen X equivalent of Pacino and DeNiro?            0  Pac, De   
4  Florida congressman shouts ‘f*** Ron DeSantis’...            0      Ron   

    Location     categories  subreddit_subscribers  submission_score   
0          0  Entertainment               30997126                 4  \
1  Hollywood  Entertainment               16893476                 5   
2  Tennessee         Health                8331149             53193   
3          0  Entertainment           

  warn(


KeyboardInterrupt: 