In [1]:
import os
import praw
import requests
import pandas as pd

# Minimum model score a predicted label must reach (compared with >=) to be accepted.
CLASSIFICATION_THRESHOLD = 0.75

In [2]:
# Reddit API credentials, read from environment variables (each is None when unset).
REDDIT_ID = os.getenv("reddit_id")
REDDIT_SECRET = os.getenv("reddit_secret")
REDDIT_USERNAME = os.getenv("reddit_username")
REDDIT_PASSWORD = os.getenv("reddit_password")

def get_reddit():
    """Build and return a ``praw.Reddit`` client from the module-level Reddit credentials."""
    credentials = {
        "client_id": REDDIT_ID,
        "client_secret": REDDIT_SECRET,
        "username": REDDIT_USERNAME,
        "password": REDDIT_PASSWORD,
    }
    return praw.Reddit(user_agent="SocialSignals/1.0", **credentials)

In [3]:
# Hugging Face Inference API token, read from the environment (None when unset).
HUGGINGFACE_TOKEN = os.getenv("huggingface_token")

# Hugging Face model identifiers that can be passed as `model_id` to get_huggingface_response.
# NOTE(review): NER_MODEL_ID is not referenced in the cells visible here.
NER_MODEL_ID = "dslim/bert-large-NER"
EMOTION_MODEL_ID = "j-hartmann/emotion-english-distilroberta-base"
ESG_CATEGORIES_MODEL_ID = "yiyanghkust/finbert-esg-9-categories"


def get_huggingface_response(text, model_id, timeout=120):
    """
    Send `text` to the Hugging Face Inference API for `model_id` and return the decoded JSON.

    This is a best-effort call: any transport failure (connection error, timeout) or a
    body that is not valid JSON is reported with a printed message and yields ``None``
    instead of raising.

    text: input string sent as the ``inputs`` field.
    model_id: Hugging Face model identifier appended to the inference URL.
    timeout: seconds to wait for the HTTP request before giving up. The request asks
        the API to wait for the model to load, so the default is deliberately generous.

    Returns the decoded JSON body (whatever the API sent back), or ``None`` on failure.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
    payload = {"inputs": text, "options": {"wait_for_model": True}}

    try:
        # The POST is inside the try so network errors follow the same
        # "print and return None" path as undecodable responses.
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=timeout
        )
        return response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding failures raised by response.json().
        print(f"Could not get Huggingface response for {model_id}")
        return None

In [11]:
import openai
# OpenAI API key, read from the environment (None when unset) and set on the openai module.
OPENAI_KEY = os.getenv("openai_key")
openai.api_key = OPENAI_KEY

def get_openai_summary(text):
    """
    Ask the OpenAI completion endpoint for a "Tl;dr" continuation of `text`.

    Returns the text of the first completion choice, or ``None`` (after printing a
    message) when the completion request raises.
    """
    request = dict(
        model="text-davinci-003",
        prompt=f"{text} \n\nTl;dr",
        temperature=0.7,
        max_tokens=60,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=1,
    )

    try:
        completion = openai.Completion.create(**request)
    except Exception:
        print("Could not get OpenAI response")
        return None

    first_choice = completion["choices"][0]
    return first_choice["text"]


In [12]:
# Smoke test: request a summary for a short sample string.
text = "This is nice"
get_openai_summary(text)

':\nThanks!'

In [13]:
! pip show openai

Name: openai
Version: 0.27.6
Summary: Python client library for the OpenAI API
Home-page: https://github.com/openai/openai-python
Author: OpenAI
Author-email: support@openai.com
License: 
Location: /opt/anaconda3/envs/social-signals-env/lib/python3.8/site-packages
Requires: aiohttp, requests, tqdm
Required-by: 


In [6]:
def get_comments(submission, comment_sort="top", comment_limit=10):
    """
    Apply comment ordering and limit to `submission`, then return its comments.

    comment_sort: Can be one of: "confidence", "controversial", "new", "old", "q&a", and "top"
    comment_limit: value assigned to ``submission.comment_limit``
    """
    # Both settings are assigned before the comments are touched below, because
    # replace_more() accesses the comments.
    for attribute, value in (
        ("comment_sort", comment_sort),
        ("comment_limit", comment_limit),
    ):
        setattr(submission, attribute, value)

    # Drop the "load more comments" / "continue this thread" placeholders.
    submission.comments.replace_more(limit=0)

    return submission.comments


def _top_prediction(response):
    """Return ``(label, score)`` of the first prediction in a Hugging Face response, or ``None`` if unusable."""
    if not isinstance(response, list):
        return None
    try:
        first = response[0][0]
        return first["label"], first["score"]
    except (IndexError, KeyError, TypeError):
        # Empty or unexpectedly shaped payload: treat it like a failed call.
        return None


def process_submission_data(
    submission_id, submission_title, comment_sort="top", comment_limit=10
):
    """
    Collect title emotion, ESG category, comment emotion and a comment summary for one submission.

    submission_id: Reddit submission id passed to ``reddit.submission``.
    submission_title: expected title; checked against the fetched submission.
    comment_sort: comment ordering forwarded to ``get_comments``.
    comment_limit: comment limit forwarded to ``get_comments``.

    Returns a dict with:
      - "url": the submission URL
      - "title_emotion": predicted label, or "neutral" when its score is below
        CLASSIFICATION_THRESHOLD (key absent when no usable prediction came back)
      - "categories": predicted ESG label (key absent when no usable prediction came back)
      - "comments_emotion": most frequent above-threshold comment label, or NONE_FILLER
      - "comments_summary": summary of the joined comments, or NONE_FILLER when there are none

    Raises AssertionError when the fetched title differs from `submission_title`.
    """
    reddit = get_reddit()
    submission = reddit.submission(submission_id)

    # PRAW loads a submission lazily: the first attribute read (such as .title) fetches
    # it together with its comments, using the comment_sort / comment_limit set at that
    # moment. The comments are therefore configured and loaded before any other
    # attribute is read, otherwise the requested ordering and limit are ignored.
    top_level_comments = get_comments(
        submission=submission, comment_sort=comment_sort, comment_limit=comment_limit
    )

    assert submission.title == submission_title, "Miss-match in submission title!"

    submission_data = {"url": submission.url}

    print("Getting emotion for the title...")
    title_emotion = _top_prediction(
        get_huggingface_response(submission_title, EMOTION_MODEL_ID)
    )
    if title_emotion is not None:
        label, score = title_emotion
        if score >= CLASSIFICATION_THRESHOLD:
            submission_data["title_emotion"] = label
        else:
            submission_data["title_emotion"] = "neutral"

    print("Getting ESG categories for the title...")
    title_category = _top_prediction(
        get_huggingface_response(submission_title, ESG_CATEGORIES_MODEL_ID)
    )
    if title_category is not None:
        label, score = title_category
        # NOTE(review): any non-zero score is accepted here; CLASSIFICATION_THRESHOLD
        # is not applied to the ESG category - confirm this is intended.
        if score:
            submission_data["categories"] = label

    print("Going over comments...")
    comments_emotion_counter, comments = {}, []
    for top_level_comment in top_level_comments:
        comment = top_level_comment.body
        comments.append(comment)

        prediction = _top_prediction(
            get_huggingface_response(comment, EMOTION_MODEL_ID)
        )
        if prediction is None:
            continue

        label, score = prediction
        # NOTE(review): the score is rounded to two decimals before the comparison,
        # unlike the title path above - confirm this is intended.
        if round(score, 2) >= CLASSIFICATION_THRESHOLD:
            comments_emotion_counter[label] = comments_emotion_counter.get(label, 0) + 1

    if comments_emotion_counter:
        # Most frequent label; ties resolve to the label seen first.
        submission_data["comments_emotion"] = max(
            comments_emotion_counter, key=comments_emotion_counter.get
        )
    else:
        submission_data["comments_emotion"] = NONE_FILLER

    if comments:
        submission_data["comments_summary"] = get_openai_summary(" ".join(comments))
    else:
        submission_data["comments_summary"] = NONE_FILLER

    return submission_data

In [7]:
# Placeholder string standing in for a missing value.
NONE_FILLER = "000"
# Comment ordering and comment limit passed to process_submission_data.
COMMENT_SORT = "top"
COMMENT_LIMIT = 15

def get_submission_data(year, month, day, df, entity, top_n=3):
    """
    Build the record for the first row of `df` whose `entity` column is populated.

    Rows whose `entity` value equals NONE_FILLER are dropped and the remainder is
    limited with ``head(n=top_n)``. Only the first remaining row is processed: the
    function returns as soon as that row's record is built, so `top_n` merely bounds
    the candidate window. Returns ``None`` when no row remains.

    year, month, day: stored as-is in the record.
    df: DataFrame with "submission_id", "submission_title", "subreddit_name" and `entity` columns.
    entity: name of the entity column; also stored as the record's "bucket".
    """
    candidates = df[df[entity] != NONE_FILLER].head(n=top_n)

    first = next(candidates.iterrows(), None)
    if first is None:
        return None
    _, row = first

    submission_id = row["submission_id"]
    title = row["submission_title"]
    record = process_submission_data(
        submission_id=submission_id,
        submission_title=title,
        comment_sort=COMMENT_SORT,
        comment_limit=COMMENT_LIMIT,
    )

    # Keys are added in a fixed order; it determines the column order downstream.
    record.update(
        {
            "bucket": entity,
            "year": year,
            "day": day,
            "month": month,
            "title": title,
        }
    )
    subreddit = row["subreddit_name"]
    record["source"] = f"reddit.com/r/{subreddit}"
    record["tags"] = row[entity]

    return record

In [8]:
# Load the combined Reddit CSV for year=2023/month=05/day=18 from S3 and preview the first rows.
df = pd.read_csv("s3://social-signals-dev-data/reddit/year=2023/month=05/day=18/combined.csv")
df.head()

Unnamed: 0,subreddit_name,submission_id,submission_title,organization,person,location,subreddit_subscribers,submission_score,submission_num_comments,submission_rank,comment_rank,social_signals_rank
0,sports,13m0o5c,Kyle Dubas not returning as Maple Leafs genera...,Maple Leafs,Kyle Dubas,000,20640023,1,8,0.0,1.0,0.65
1,politics,13l6lx9,Disney Pulls Plug on $1 Billion Development in...,Disney P,000,Florida,8324813,42321,2423,1.0,0.005554,0.35361
2,politics,13lqabu,Report: The National Archives Is Set to Hand O...,National Archives,"Trump, Trump",White House,8324813,23755,1226,0.561301,0.004848,0.199606
3,news,13l6ykk,"Disney scraps plans for new Florida campus, ma...",Disney,000,Florida,26168448,59786,4206,0.449402,0.007194,0.161967
4,politics,13lc2wb,Ron DiSaster Loses $1 Billion Disney Project D...,000,"Ron DiSaster, Ron DeSantis",000,8324813,16637,1038,0.393109,0.006198,0.141617


In [22]:
# Date parts for this run; get_submission_data copies them into the resulting record.
year = "2023"
month = "05"
day = "18"
# Build the record for the "organization" entity column and display it.
submission_data_organization = get_submission_data(year, month, day, df, entity="organization")
submission_data_organization

Getting emotion for the title...
Getting ESG categories for the title...
Going over comments...


{'url': 'https://www.sportsnet.ca/nhl/article/kyle-dubas-not-returning-as-maple-leafs-general-manager/',
 'title_emotion': 'sadness',
 'categories': 'Corporate Governance',
 'comments_emotion': 'surprise',
 'comments_summary': ' - The Toronto Maple Leafs have “parted ways” with Kyle Dubas, which was unexpected given his success in improving the roster and finding good value players.',
 'bucket': 'organization',
 'year': '2023',
 'day': '18',
 'month': '05',
 'title': 'Kyle Dubas not returning as Maple Leafs general manager',
 'source': 'reddit.com/r/sports',
 'tags': 'Maple Leafs'}

In [35]:
from sqlalchemy import create_engine
import urllib

# Destination table and schema for the DataFrame.to_sql write in this cell.
TABLE_NAME = "social_signals_poc"
SCHEMA = "social_signals_dev"
# `if_exists` mode for to_sql.
IF_EXISTS = "append"

def get_engine():
    """
    Build the SQLAlchemy connection URL for the MySQL database from environment variables.

    Reads ``db_username``, ``db_password``, ``db_host``, ``db_port`` and ``db_name``.
    The username and password are percent-encoded so characters such as ``@`` or ``:``
    cannot break the URL.

    Returns the URL string ``mysql+mysqlconnector://user:password@host:port/name``.
    Despite the function name this is not an Engine object; pass it to ``create_engine``.

    Raises ValueError naming every required environment variable that is unset.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    from urllib.parse import quote

    fields = ("username", "password", "host", "port", "name")
    settings = {field: os.getenv(f"db_{field}") for field in fields}

    missing = [f"db_{field}" for field in fields if settings[field] is None]
    if missing:
        raise ValueError(
            "Missing database environment variable(s): " + ", ".join(missing)
        )

    username = quote(settings["username"])
    password = quote(settings["password"])
    host = settings["host"]
    port = settings["port"]
    name = settings["name"]

    return f"mysql+mysqlconnector://{username}:{password}@{host}:{port}/{name}"

# One-row DataFrame holding the record produced for the "organization" entity.
db_df = pd.DataFrame(data=[submission_data_organization])
# `engine` holds the connection URL string; `connection` is the object returned by create_engine.
engine = get_engine()
connection = create_engine(engine, pool_pre_ping=True)

# The target table must already contain every column of db_df: the recorded run
# failed with "Unknown column 'title_emotion'".
db_df.to_sql(
    TABLE_NAME,
    connection,
    schema=SCHEMA,
    if_exists=IF_EXISTS,  # use the configured mode instead of a duplicated literal
    index=False,
    method="multi",
)

ProgrammingError: (mysql.connector.errors.ProgrammingError) 1054 (42S22): Unknown column 'title_emotion' in 'field list'
[SQL: INSERT INTO social_signals_dev.social_signals_pos (url, title_emotion, categories, comments_emotion, comments_summary, bucket, year, day, month, title, source, tags) VALUES (%(url_m0)s, %(title_emotion_m0)s, %(categories_m0)s, %(comments_emotion_m0)s, %(comments_summary_m0)s, %(bucket_m0)s, %(year_m0)s, %(day_m0)s, %(month_m0)s, %(title_m0)s, %(source_m0)s, %(tags_m0)s)]
[parameters: {'url_m0': 'https://www.sportsnet.ca/nhl/article/kyle-dubas-not-returning-as-maple-leafs-general-manager/', 'title_emotion_m0': 'sadness', 'categories_m0': 'Corporate Governance', 'comments_emotion_m0': 'surprise', 'comments_summary_m0': ' - The Toronto Maple Leafs have “parted ways” with Kyle Dubas, which was unexpected given his success in improving the roster and finding good value players.', 'bucket_m0': 'organization', 'year_m0': '2023', 'day_m0': '18', 'month_m0': '05', 'title_m0': 'Kyle Dubas not returning as Maple Leafs general manager', 'source_m0': 'reddit.com/r/sports', 'tags_m0': 'Maple Leafs'}]
(Background on this error at: https://sqlalche.me/e/14/f405)