In [7]:
import os
import requests
import pandas as pd

# Weights for Social Signals rank
SUBMISSION_WEIGHT = 0.35
COMMENT_WEIGHT = 0.65

# For submission data
SUBMISSION_TIME_FILTER = "day"
SUBMISSION_LIMIT = 15

# For comments data
COMMENT_SORT = "top"
COMMENT_LIMIT = 15

# For DB
SCHEMA = "social_signals_dev"
TABLE_NAME = "social_signals_poc"

CLASSIFICATION_THRESHOLD = 0.75
NONE_FILLER = "000"

In [2]:
HUGGINGFACE_TOKEN = os.getenv("huggingface_token")

NER_MODEL_ID = "dslim/bert-large-NER"
EMOTION_MODEL_ID = "j-hartmann/emotion-english-distilroberta-base"
ESG_CATEGORIES_MODEL_ID = "yiyanghkust/finbert-esg-9-categories"


def get_huggingface_response(text, model_id):
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}

    payload = {"inputs": text, "options": {"wait_for_model": True}}
    response = requests.post(api_url, headers=headers, json=payload)

    try:
        response = response.json()
    except Exception:
        print(f"Could not get Huggingface response for {model_id}")
        return
    return response


In [3]:
import openai
NONE_FILLER = "000"

openai.api_key = os.getenv("openai_key")


def get_openai_summary(text):
    prompt = f"{text} \n\nTl;dr"
    
    try:
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0.7,
            max_tokens=60,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=1,
        )
    except Exception:
        print("Could not get OpenAI response")
        return NONE_FILLER

    return response["choices"][0]["text"]


In [4]:
import praw

REDDIT_ID = os.getenv("reddit_id")
print(REDDIT_ID)
REDDIT_SECRET = os.getenv("reddit_secret")
REDDIT_USERNAME = os.getenv("reddit_username")
REDDIT_PASSWORD = os.getenv("reddit_password")


def get_reddit():
    reddit = praw.Reddit(
        user_agent="SocialSignals/1.0",
        client_id=REDDIT_ID,
        client_secret=REDDIT_SECRET,
        username=REDDIT_USERNAME,
        password=REDDIT_PASSWORD,
    )
    return reddit


def get_subreddit(subreddit_name):
    reddit = get_reddit()
    subreddit = reddit.subreddit(subreddit_name)

    return subreddit


def get_top_submissions(subreddit_name, time_filter="day", limit=10):
    """
    time_filter: Can be one of: "all", "day", "hour", "month", "week", or "year"
    """
    subreddit = get_subreddit(subreddit_name)
    top_submissions = subreddit.top(time_filter=time_filter, limit=limit)

    return subreddit, top_submissions


def get_comments(submission, comment_sort="top", comment_limit=10):
    """
    comment_sort: Can be one of: "confidence", "controversial", "new", "old", "q&a", and "top"
    """
    # Calling replace_more() access comments, and so must be done after comment_sort is updated
    submission.comment_sort = comment_sort
    submission.comment_limit = comment_limit

    # Remove comments like "load more comments”, and “continue this thread”
    submission.comments.replace_more(limit=0)

    comments = submission.comments
    return comments


def get_submission_data(subreddit, submission):
    submission_data = {}

    subreddit_name = subreddit.display_name
    submission_data["subreddit_name"] = subreddit_name

    submission_id = submission.id
    submission_data["submission_id"] = submission_id

    submission_title = submission.title
    submission_data["submission_title"] = submission_title
    print(f"Submission title: {submission_title}")

    print("Getting entities for the title...")
    huggingface_entities = get_huggingface_response(submission_title, NER_MODEL_ID)
    organization, person, location = [], [], []
    if isinstance(huggingface_entities, list):
        for entity in huggingface_entities:
            if (
                entity["entity_group"] == "ORG"
                and entity["score"] >= CLASSIFICATION_THRESHOLD
            ):
                organization.append(entity["word"])
            if (
                entity["entity_group"] == "PER"
                and entity["score"] >= CLASSIFICATION_THRESHOLD
            ):
                person.append(entity["word"])
            if (
                entity["entity_group"] == "LOC"
                and entity["score"] >= CLASSIFICATION_THRESHOLD
            ):
                location.append(entity["word"])

    if not (organization + person + location):
        print("Expected entities not found. Quitting...")
        return

    if organization:
        submission_data["organization"] = ", ".join(organization)
    else:
        submission_data["organization"] = NONE_FILLER
    if person:
        submission_data["person"] = ", ".join(person)
    else:
        submission_data["person"] = NONE_FILLER
    if location:
        submission_data["location"] = ", ".join(location)
    else:
        submission_data["location"] = NONE_FILLER

    subreddit_subscribers = subreddit.subscribers
    submission_data["subreddit_subscribers"] = subreddit_subscribers

    submission_score = submission.score
    submission_data["submission_score"] = submission_score

    submission_num_comments = submission.num_comments
    submission_data["submission_num_comments"] = submission_num_comments

    return submission_data


def process_submission_data(
    submission_id, submission_title, comment_sort="top", comment_limit=10
):
    reddit = get_reddit()
    submission = reddit.submission(submission_id)

    assert submission.title == submission_title, "Miss-match in submission title!"

    submission_data = {}

    print("Getting emotion for the title...")
    title_emotion = get_huggingface_response(submission_title, EMOTION_MODEL_ID)
    if isinstance(title_emotion, list):
        title_emotion_prediction = title_emotion[0][0]["label"]
        title_emotion_score = title_emotion[0][0]["score"]
        if title_emotion_score >= CLASSIFICATION_THRESHOLD:
            submission_data["title_emotion"] = title_emotion_prediction
        else:
            submission_data["title_emotion"] = "neutral"

    print("Getting ESG categories for the title...")
    title_esg_categories = get_huggingface_response(
        submission_title, ESG_CATEGORIES_MODEL_ID
    )
    if isinstance(title_esg_categories, list):
        title_esg_categories_prediction = title_esg_categories[0][0]["label"]
        title_esg_categories_score = title_esg_categories[0][0]["score"]
        if title_esg_categories_score:
            submission_data["categories"] = title_esg_categories_prediction

    print("Going over comments...")
    top_level_comments = get_comments(
        submission=submission, comment_sort=comment_sort, comment_limit=comment_limit
    )
    comments_emotion_counter, comments = {}, []
    for top_level_comment in top_level_comments:
        comment = top_level_comment.body
        comments.append(comment)
        print(f"comment: {comment}")

        comment_emotion = get_huggingface_response(comment, EMOTION_MODEL_ID)

        if isinstance(comment_emotion, list):
            comment_emotion_prediction = comment_emotion[0][0]["label"]
            comment_emotion_score = comment_emotion[0][0]["score"]
            comment_emotion_score = round(comment_emotion_score, 2)
            if comment_emotion_score >= CLASSIFICATION_THRESHOLD:
                comments_emotion_counter[comment_emotion_prediction] = (
                    comments_emotion_counter.get(comment_emotion_prediction, 0) + 1
                )
    if comments_emotion_counter:
        submission_data["comments_emotion"] = max(
            comments_emotion_counter, key=comments_emotion_counter.get
        )
    else:
        submission_data["comments_emotion"] = NONE_FILLER

    if comments:
        summary_text = " ".join(comments)
        summary = get_openai_summary(summary_text)

        submission_data["comments_summary"] = summary
    else:
        submission_data["comments_summary"] = NONE_FILLER

    return submission_data


G1dhM7-gSCPe5qQ7fOFSUQ


In [8]:
from datetime import datetime


def get_submission_data2(year, month, day, df, submission_ids, entity, top_n=3):
    print(f"Processing entity {entity}")
    df_entity = df[df[entity] != NONE_FILLER]
    print(f"Shape of the df is {df_entity.shape}")

    submission_data_list = []
    count = 0
    for _, row in df_entity.iterrows():
        submission_id = row["submission_id"]
        submission_title = row["submission_title"]
        submission_data = process_submission_data(
            submission_id=submission_id,
            submission_title=submission_title,
            comment_sort=COMMENT_SORT,
            comment_limit=COMMENT_LIMIT,
        )
        comments_summary = submission_data["comments_summary"]
        if comments_summary == NONE_FILLER:
            continue
        if submission_id in submission_ids:
            continue
        submission_ids.append(submission_id)

        submission_data["bucket"] = entity.capitalize()
        submission_data["year"] = year
        submission_data["day"] = day
        submission_data["month"] = month
        submission_data["title"] = submission_title

        subreddit_name = row["subreddit_name"]
        submission_data["source"] = f"reddit.com/r/{subreddit_name}/{submission_id}"

        entities = row[entity]
        submission_data["tags"] = entities

        submission_data_list.append(submission_data)
        count += 1
        if count == top_n:
            break
    return submission_data_list

year = "2023"
month = "05"
day = "23"

input_path = f"s3://social-signals-dev-data/reddit/year={year}/month={month}/day={day}/combined.csv"
df = pd.read_csv(input_path)
print(f"Shape of the combined df is {df.shape}")

entities = ["location"]
submission_ids = []
for entity in entities:        
    submission_data_list = get_submission_data2(year, month, day, df, submission_ids, entity=entity)

    for submission_data in submission_data_list:
        db_df = pd.DataFrame(data=[submission_data])
        print(f"Writing df of shape {db_df.shape} to the DB")


Shape of the combined df is (103, 12)
Processing entity organization
Shape of the df is (48, 12)
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...


  warn(


Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
Could not get OpenAI response
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
Writing df of shape (1, 11) to the DB
Writing df of shape (1, 11) to the DB
Writing df of shape (1, 11) to the DB
Processing entity person
Shape of the df is (68, 12)
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
Could not get OpenAI response
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
Could not get OpenAI response
Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...
Could not get OpenAI response
Getting emotion for the title...
Getting 

In [9]:
df

Unnamed: 0,subreddit_name,submission_id,submission_title,organization,person,location,subreddit_subscribers,submission_score,submission_num_comments,submission_rank,comment_rank,social_signals_rank
0,movies,13pugz8,"Hi, I’m Bert Kreischer, stand-up comedian and ...",000,Bert Kreischer,000,30954285,106,579,0.000412,1.000000,0.650144
1,entertainment,13pp69j,Matt Damon Calls Off ‘Oppenheimer’ vs. ‘Barbie...,000,Matt Damon,000,3891380,31397,1317,1.000000,0.007679,0.354992
2,politics,13plzp2,"Burger bar sues Ron DeSantis over ban on ""adul...",000,"Burger, Ron DeSantis",000,8326450,34122,2009,0.507908,0.010779,0.184774
3,politics,13pvmwv,Rick Scott issues travel advisory for ‘sociali...,000,Rick Scott,Florida,8326450,25201,2890,0.375115,0.020995,0.144937
4,politics,13q5bzz,Twitter Is a Far-Right Social Network. It can ...,Twitter,000,000,8326450,22180,2341,0.330146,0.019323,0.128111
...,...,...,...,...,...,...,...,...,...,...,...,...
98,worldnews,13pwd7h,A group of activists said it has mapped more t...,000,000,"Russia, Moscow",31782214,2318,19,0.009028,0.001501,0.004135
99,UpliftingNews,13ppm5p,City of Oakville Canada makes bus rides free f...,City of Oakville Canada,0,000,19039454,378,10,0.002449,0.004843,0.004005
100,sports,13pjz5f,Four held in Spain over Vinicius Jr effigy han...,000,Vinicius Jr,Spain,20640459,69,2,0.000402,0.005307,0.003590
101,UpliftingNews,13pr6zb,How India became a frontrunner in the global r...,000,0,India,19039454,57,1,0.000359,0.003212,0.002213


In [13]:
import pandas as pd
from datetime import datetime

execution_date_str = pd.to_datetime("1490195805", unit='s').strftime(
        "%Y-%m-%d-%H-%M-%S"
)
print(execution_date_str)
given_date = datetime.strptime(execution_date_str, "%Y-%m-%d-%H-%M-%S").date()
print(given_date)
year = given_date.strftime("%Y")
month = given_date.strftime("%m")
day = given_date.strftime("%d")
time = given_date.strftime("%H")
print(year, month, day, time)

2017-03-22-15-17-20
2017-03-22
2017 03 22 00


  execution_date_str = pd.to_datetime("1490195805", unit='s').strftime(
