In [1]:
import os
import requests
import pandas as pd

# Weights for Social Signals rank
SUBMISSION_WEIGHT = 0.35
COMMENT_WEIGHT = 0.65

# For submission data
SUBMISSION_TIME_FILTER = "day"
SUBMISSION_LIMIT = 15

# For comments data
COMMENT_SORT = "top"
COMMENT_LIMIT = 15

# For DB
SCHEMA = "social_signals_dev"
TABLE_NAME = "social_signals_poc"

CLASSIFICATION_THRESHOLD = 0.75
NONE_FILLER = "000"

In [2]:
HUGGINGFACE_TOKEN = os.getenv("huggingface_token")

NER_MODEL_ID = "dslim/bert-large-NER"
EMOTION_MODEL_ID = "j-hartmann/emotion-english-distilroberta-base"
ESG_CATEGORIES_MODEL_ID = "yiyanghkust/finbert-esg-9-categories"


def get_huggingface_response(text, model_id):
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}

    payload = {"inputs": text, "options": {"wait_for_model": True}}
    response = requests.post(api_url, headers=headers, json=payload)

    try:
        response = response.json()
    except Exception:
        print(f"Could not get Huggingface response for {model_id}")
        return
    return response


In [3]:
import openai
NONE_FILLER = "000"

openai.api_key = os.getenv("openai_key")


def get_openai_summary(text):
    prompt = f"{text} \n\nTl;dr"
    
    try:
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0.7,
            max_tokens=60,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=1,
        )
    except Exception:
        print("Could not get OpenAI response")
        return NONE_FILLER

    return response["choices"][0]["text"]


In [4]:
import praw

REDDIT_ID = os.getenv("reddit_id")
REDDIT_SECRET = os.getenv("reddit_secret")
REDDIT_USERNAME = os.getenv("reddit_username")
REDDIT_PASSWORD = os.getenv("reddit_password")


def get_reddit():
    reddit = praw.Reddit(
        user_agent="SocialSignals/1.0",
        client_id=REDDIT_ID,
        client_secret=REDDIT_SECRET,
        username=REDDIT_USERNAME,
        password=REDDIT_PASSWORD,
    )
    return reddit


def get_subreddit(subreddit_name):
    reddit = get_reddit()
    subreddit = reddit.subreddit(subreddit_name)

    return subreddit


def get_top_submissions(subreddit_name, time_filter="day", limit=10):
    """
    time_filter: Can be one of: "all", "day", "hour", "month", "week", or "year"
    """
    subreddit = get_subreddit(subreddit_name)
    top_submissions = subreddit.top(time_filter=time_filter, limit=limit)

    return subreddit, top_submissions


def get_comments(submission, comment_sort="top", comment_limit=10):
    """
    comment_sort: Can be one of: "confidence", "controversial", "new", "old", "q&a", and "top"
    """
    # Calling replace_more() access comments, and so must be done after comment_sort is updated
    submission.comment_sort = comment_sort
    submission.comment_limit = comment_limit

    # Remove comments like "load more comments”, and “continue this thread”
    submission.comments.replace_more(limit=0)

    comments = submission.comments
    return comments


def get_submission_data(subreddit, submission):
    submission_data = {}

    subreddit_name = subreddit.display_name
    submission_data["subreddit_name"] = subreddit_name

    submission_id = submission.id
    submission_data["submission_id"] = submission_id

    submission_title = submission.title
    submission_data["submission_title"] = submission_title
    print(f"Submission title: {submission_title}")

    print("Getting entities for the title...")
    huggingface_entities = get_huggingface_response(submission_title, NER_MODEL_ID)
    organization, person, location = [], [], []
    if isinstance(huggingface_entities, list):
        for entity in huggingface_entities:
            if (
                entity["entity_group"] == "ORG"
                and entity["score"] >= CLASSIFICATION_THRESHOLD
            ):
                organization.append(entity["word"])
            if (
                entity["entity_group"] == "PER"
                and entity["score"] >= CLASSIFICATION_THRESHOLD
            ):
                person.append(entity["word"])
            if (
                entity["entity_group"] == "LOC"
                and entity["score"] >= CLASSIFICATION_THRESHOLD
            ):
                location.append(entity["word"])

    if not (organization + person + location):
        print("Expected entities not found. Quitting...")
        return

    if organization:
        submission_data["organization"] = ", ".join(organization)
    else:
        submission_data["organization"] = NONE_FILLER
    if person:
        submission_data["person"] = ", ".join(person)
    else:
        submission_data["person"] = NONE_FILLER
    if location:
        submission_data["location"] = ", ".join(location)
    else:
        submission_data["location"] = NONE_FILLER

    subreddit_subscribers = subreddit.subscribers
    submission_data["subreddit_subscribers"] = subreddit_subscribers

    submission_score = submission.score
    submission_data["submission_score"] = submission_score

    submission_num_comments = submission.num_comments
    submission_data["submission_num_comments"] = submission_num_comments

    return submission_data


def process_submission_data(
    submission_id, submission_title, comment_sort="top", comment_limit=10
):
    reddit = get_reddit()
    submission = reddit.submission(submission_id)

    assert submission.title == submission_title, "Miss-match in submission title!"

    submission_data = {}

    print("Getting emotion for the title...")
    title_emotion = get_huggingface_response(submission_title, EMOTION_MODEL_ID)
    if isinstance(title_emotion, list):
        title_emotion_prediction = title_emotion[0][0]["label"]
        title_emotion_score = title_emotion[0][0]["score"]
        if title_emotion_score >= CLASSIFICATION_THRESHOLD:
            submission_data["title_emotion"] = title_emotion_prediction
        else:
            submission_data["title_emotion"] = "neutral"

    print("Getting ESG categories for the title...")
    title_esg_categories = get_huggingface_response(
        submission_title, ESG_CATEGORIES_MODEL_ID
    )
    if isinstance(title_esg_categories, list):
        title_esg_categories_prediction = title_esg_categories[0][0]["label"]
        title_esg_categories_score = title_esg_categories[0][0]["score"]
        if title_esg_categories_score:
            submission_data["categories"] = title_esg_categories_prediction

    print("Going over comments...")
    top_level_comments = get_comments(
        submission=submission, comment_sort=comment_sort, comment_limit=comment_limit
    )
    comments_emotion_counter, comments = {}, []
    for top_level_comment in top_level_comments:
        comment = top_level_comment.body
        comments.append(comment)

        comment_emotion = get_huggingface_response(comment, EMOTION_MODEL_ID)

        if isinstance(comment_emotion, list):
            comment_emotion_prediction = comment_emotion[0][0]["label"]
            comment_emotion_score = comment_emotion[0][0]["score"]
            comment_emotion_score = round(comment_emotion_score, 2)
            
            comments_emotion_counter[comment_emotion_prediction] = (
                comments_emotion_counter.get(comment_emotion_prediction, 0) + 1
            )
    print(comments)
    print(comments_emotion_counter)        
    if comments_emotion_counter:
        submission_data["comments_emotion"] = max(
            comments_emotion_counter, key=comments_emotion_counter.get
        )
    else:
        submission_data["comments_emotion"] = NONE_FILLER

    print(submission_data["comments_emotion"])
    if comments:
        summary_text = " ".join(comments)
        summary = get_openai_summary(summary_text)

        submission_data["comments_summary"] = summary
    else:
        submission_data["comments_summary"] = NONE_FILLER

    return submission_data


In [None]:
from datetime import datetime


def get_submission_data2(year, month, day, df, submission_ids, entity, top_n=3):
    print(f"Processing entity {entity}")
    df_entity = df[df[entity] != NONE_FILLER]
    print(f"Shape of the df is {df_entity.shape}")

    submission_data_list = []
    count = 0
    for _, row in df_entity.iterrows():
        submission_id = row["submission_id"]
        submission_title = row["submission_title"]
        submission_data = process_submission_data(
            submission_id=submission_id,
            submission_title=submission_title,
            comment_sort=COMMENT_SORT,
            comment_limit=COMMENT_LIMIT,
        )
        comments_summary = submission_data["comments_summary"]
        if comments_summary == NONE_FILLER:
            continue
        if submission_id in submission_ids:
            continue
        submission_ids.append(submission_id)

        submission_data["bucket"] = entity.capitalize()
        submission_data["year"] = year
        submission_data["day"] = day
        submission_data["month"] = month
        submission_data["title"] = submission_title

        subreddit_name = row["subreddit_name"]
        submission_data["source"] = f"reddit.com/r/{subreddit_name}/{submission_id}"

        entities = row[entity]
        submission_data["tags"] = entities

        submission_data_list.append(submission_data)
        count += 1
        if count == top_n:
            break
    return submission_data_list

year = "2023"
month = "05"
day = "24"
time = "100000"

input_path = f"s3://social-signals-dev-data/reddit/year={year}/month={month}/day={day}/time={time}/combined.csv"
df = pd.read_csv(input_path)
print(f"Shape of the combined df is {df.shape}")

entities = ["location"]
submission_ids = []
for entity in entities:        
    submission_data_list = get_submission_data2(year, month, day, df, submission_ids, entity=entity)

    for submission_data in submission_data_list:
        db_df = pd.DataFrame(data=[submission_data])
        print(f"Writing df of shape {db_df.shape} to the DB")


Shape of the combined df is (90, 12)
Processing entity location
Shape of the df is (42, 12)
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...


  warn(


["\nAs a reminder, this subreddit [is for civil discussion.](/r/politics/wiki/index#wiki_be_civil)\n\nIn general, be courteous to others. Debate/discuss/argue the merits of ideas, don't attack people. Personal insults, shill or troll accusations, hate speech, any suggestion or support of harm, violence, or death, and other rule violations can result in a permanent ban. \n\nIf you see comments in violation of our rules, please report them.\n\n For those who have questions regarding any media outlets being posted on this subreddit, please click [here](https://www.reddit.com/r/politics/wiki/approveddomainslist) to review our details as to our approved domains list and outlet criteria.\n \n\n***\n\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/politics) if you have any questions or concerns.*", ">King has previously said the bill will help restore religious liberties “that were lost”\n\nYes. We're going

Could not get OpenAI response
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
['How many directors get a $200 million dollar budget movie that flops as their first movie?\n\nLike M Night has had his share of flops but his first major movie was Wide Awake, it had a $6 million dollar budget and got mixed reviews. His next movie was Sixth Sense which had a $40 million budget and grossed over $670 million. He then had Signs & Unbreakable. If he started with Lady in the Water which was critically panned and barely even made back its production budget I doubt he gets a second movie.\n\nChris Columbus directed Home Alone as his first major movie (two other smaller budget movies before), he also directed the sequel and Ms. Doubtfire before directing the flop of Bicentennial Man. \n\nThe only one I can think of is David Fincher who directed Alien 3. It was a bad movie and underperformed but still made 3x its production budget. His next movie was S

Could not get OpenAI response
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
{'anger': 24, 'disgust': 24, 'sadness': 15, 'neutral': 39, 'joy': 1, 'surprise': 16, 'fear': 7}
neutral


Could not get OpenAI response
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
Could not get Huggingface response for j-hartmann/emotion-english-distilroberta-base
["\nAs a reminder, this subreddit [is for civil discussion.](/r/politics/wiki/index#wiki_be_civil)\n\nIn general, be courteous to others. Debate/discuss/argue the merits of ideas, don't attack people. Personal insults, shill or troll accusations, hate speech, any suggestion or support of harm, violence, or death, and other rule violations can result in a permanent ban. \n\nIf you see comments in violation of our rules, please report them.\n\n For those who have questions regarding any media outlets being posted on this subreddit, please click [here](https://www.reddit.com/r/politics/wiki/approveddomainslist) to review our details as to our approved domains list and outlet criteria.\n \n\n***\n\n\n*I am a bot, and this action was performed automatically. Please [contact the moder

Could not get OpenAI response
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
['Seems like the movement to appeal to the climate conscience of shareholders is stuck at convincing just 20% of shareholders:\n\n> Shell’s shareholders rejected the resolution by 79.8% to 20.2%, according to a preliminary count from the company. A similar Follow This resolution in 2022 also secured 20% support.', '>A Shell spokesperson said: “We agree that society needs to take action on climate change.”\n\nOh, Shell wants the *society* to take action instead \U0001f979.', 'How is this a choice they have? They should not have a choice to make these decisions that are destroying our fucking habitat.', 'But… but they have an ad running about transitioning to renewable energy! I don’t know what to believe anymore. (/s if that wasn’t obvious)', "When corporations go against embargos, they get sanctions.\n\nWhy are they getting a pass if they go against climate targ

Could not get OpenAI response
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
["\nAs a reminder, this subreddit [is for civil discussion.](/r/politics/wiki/index#wiki_be_civil)\n\nIn general, be courteous to others. Debate/discuss/argue the merits of ideas, don't attack people. Personal insults, shill or troll accusations, hate speech, any suggestion or support of harm, violence, or death, and other rule violations can result in a permanent ban. \n\nIf you see comments in violation of our rules, please report them.\n\n For those who have questions regarding any media outlets being posted on this subreddit, please click [here](https://www.reddit.com/r/politics/wiki/approveddomainslist) to review our details as to our approved domains list and outlet criteria.\n \n\n***\n\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/politics) if you have any questions 

Could not get OpenAI response
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
['They are waiting for the 2024 election in the US.', '[deleted]', 'As it should be. Russia must not be given a free pass, or they might try on someone else later. So its in the best interest of world peace that Ukraine’s sovereignty is respected.', 'This war is the single greatest thing to happen to Western defense industries because it’s making everyone take a good, hard look at their defense industry and realizing that war is still a reality that must be faced.', "And I totally accept that. We're very lucky the Ukrainians keep the gates of Europe. They are brave and capable people. \n\nSending them gear is the bare minimum. A lot of the fight is political and financial. We have to help them on all fronts.\n\nThey are paying the highest price. But that fight will be remembered as another 300 moment in history.\n\nSlava Ukraini !", "I sort of understand Russia 

Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
["TDLR: Nvidia doesn't aant to lose that sweet Chinese capital.", 'Translation: don’t secure your long term national interest, it might hurt my profits for awhile', '“The chief executive of Nvidia, the world’s most valuable semiconductor company, has warned that the US tech industry is at risk of “enormous damage” from the escalating battle over chips between Washington and Beijing.\n\nSpeaking to the Financial Times, Jensen Huang said US export controls introduced by the Biden administration to slow Chinese semiconductor manufacturing had left the Silicon Valley group with “our hands tied behind our back” and unable to sell advanced chips in one of the company’s biggest markets.\nAt the same time, he added, Chinese companies were starting to build their own chips to rival Nvidia’s market-leading processors for gaming, graphics and artificial intelligence.\n“If [China] can’t buy from\u2009.\

Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
