In [1]:
import os
import praw
import requests
import pandas as pd

# Minimum classifier score for a predicted emotion label to be accepted; the
# title and comment emotion scores below are compared against it with `>=`.
CLASSIFICATION_THRESHOLD = 0.75

In [2]:
# Reddit API credentials, read from the environment. os.getenv returns None
# for any variable that is not set, so a missing value is not detected here.
REDDIT_ID = os.getenv("reddit_id")
REDDIT_SECRET = os.getenv("reddit_secret")
REDDIT_USERNAME = os.getenv("reddit_username")
REDDIT_PASSWORD = os.getenv("reddit_password")

def get_reddit():
    """Create a ``praw.Reddit`` client from the module-level credentials.

    Returns:
        The ``praw.Reddit`` instance built with the fixed user agent
        ``"SocialSignals/1.0"`` and the ``REDDIT_*`` values read from the
        environment.
    """
    client_settings = {
        "user_agent": "SocialSignals/1.0",
        "client_id": REDDIT_ID,
        "client_secret": REDDIT_SECRET,
        "username": REDDIT_USERNAME,
        "password": REDDIT_PASSWORD,
    }
    return praw.Reddit(**client_settings)

In [3]:
# Hugging Face API token, read from the environment (None when unset). It is
# sent as a Bearer token by get_huggingface_response().
HUGGINGFACE_TOKEN = os.getenv("huggingface_token")

# Model identifiers that are interpolated into the Inference API URL.
# NER_MODEL_ID is not referenced by any of the visible cells.
NER_MODEL_ID = "dslim/bert-large-NER"
EMOTION_MODEL_ID = "j-hartmann/emotion-english-distilroberta-base"
ESG_CATEGORIES_MODEL_ID = "yiyanghkust/finbert-esg-9-categories"


def get_huggingface_response(text, model_id, timeout=120):
    """Run ``text`` through a model hosted on the Hugging Face Inference API.

    Args:
        text: Input string, sent as the ``inputs`` field of the payload.
        model_id: Model identifier appended to the Inference API URL,
            e.g. ``"dslim/bert-large-NER"``.
        timeout: Seconds to wait for the HTTP call before giving up. The
            request sets ``wait_for_model``, so the default is generous.

    Returns:
        The decoded JSON body. No HTTP status check is performed, so whatever
        JSON the service answers with is returned as-is. ``None`` is returned
        when the request itself fails (connection error, timeout) or the body
        is not valid JSON.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
    payload = {"inputs": text, "options": {"wait_for_model": True}}

    try:
        # The network call lives inside the try block so that transport
        # failures take the same "no response" path as an undecodable body.
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=timeout
        )
        return response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding errors raised by response.json().
        print(f"Could not get Huggingface response for {model_id}")
        return None

In [4]:
import openai
# OpenAI API key, read from the environment (None when "openai_key" is unset)
# and installed on the openai module for the calls made below.
OPENAI_KEY = os.getenv("openai_key")
openai.api_key = OPENAI_KEY
def get_openai_summary(text):
    """Ask the OpenAI completion endpoint for a short "Tl;dr" of ``text``.

    Args:
        text: The text to summarise; ``" \\n\\nTl;dr"`` is appended to form
            the prompt.

    Returns:
        The ``text`` field of the first returned choice, or ``None`` when the
        API call raises.

    NOTE(review): ``openai.Completion`` and ``text-davinci-003`` look like the
    legacy (pre-1.0 SDK) completions interface -- confirm the installed
    ``openai`` version still provides them.
    """
    request = {
        "model": "text-davinci-003",
        "prompt": f"{text} \n\nTl;dr",
        "temperature": 0.7,
        "max_tokens": 60,
        "top_p": 1.0,
        "frequency_penalty": 0.0,
        "presence_penalty": 1,
    }

    try:
        completion = openai.Completion.create(**request)
    except Exception:
        print("Could not get OpenAI response")
        return
    else:
        # Only the API call is guarded; a malformed response still raises.
        return completion["choices"][0]["text"]


In [5]:
# Quick manual check of the summariser on a trivial input.
text = "This is nice"
get_openai_summary(text)

': A nice thing.'

In [6]:
def get_comments(submission, comment_sort="top", comment_limit=10):
    """Configure comment loading on ``submission`` and return its comments.

    Args:
        submission: Object exposing ``comment_sort``, ``comment_limit`` and a
            ``comments`` attribute with a ``replace_more`` method.
        comment_sort: One of "confidence", "controversial", "new", "old",
            "q&a" and "top".
        comment_limit: Value assigned to ``submission.comment_limit``.

    Returns:
        ``submission.comments`` after its "more comments" placeholders have
        been removed.
    """
    # The sort order and limit have to be in place before the comments are
    # touched: replace_more() accesses them.
    submission.comment_sort = comment_sort
    submission.comment_limit = comment_limit

    forest = submission.comments
    # limit=0 drops entries such as "load more comments" and
    # "continue this thread" instead of expanding them.
    forest.replace_more(limit=0)
    return forest


def _top_prediction(response):
    """Return ``(label, score)`` of the best-scoring class, or ``None``.

    The classification responses handled here are indexed as
    ``response[0][i]["label"]`` / ``["score"]``. Anything without that shape
    (``None``, a dict such as an error payload, an empty list, entries without
    the expected keys) yields ``None`` instead of raising.
    """
    if not isinstance(response, list) or not response:
        return None
    candidates = response[0]
    if not isinstance(candidates, list) or not candidates:
        return None
    try:
        # max() does not rely on the service returning the classes sorted.
        best = max(candidates, key=lambda candidate: candidate["score"])
        return best["label"], best["score"]
    except (KeyError, TypeError):
        return None


def process_submission_data(
    submission_id, submission_title, comment_sort="top", comment_limit=10
):
    """Collect emotion, category and comment-summary signals for a submission.

    Args:
        submission_id: Reddit submission id passed to ``reddit.submission``.
        submission_title: Expected title; checked against the fetched one.
        comment_sort: Comment ordering forwarded to ``get_comments``.
        comment_limit: Comment limit forwarded to ``get_comments``.

    Returns:
        dict with the keys
          * ``url`` -- the submission's URL,
          * ``title_emotion`` -- top emotion label for the title, or
            ``"neutral"`` when its score is below ``CLASSIFICATION_THRESHOLD``
            (absent when no usable classifier response was received),
          * ``categories`` -- top ESG category label for the title (absent
            when no usable classifier response was received),
          * ``comments_emotion`` -- the emotion most often predicted with a
            score at or above the threshold across the comments, or
            ``NONE_FILLER``,
          * ``comments_summary`` -- result of ``get_openai_summary`` on the
            space-joined comment bodies, or ``NONE_FILLER`` without comments.

    Raises:
        AssertionError: if the fetched title differs from ``submission_title``.
    """
    reddit = get_reddit()
    submission = reddit.submission(submission_id)

    # Configure and load the comments BEFORE reading any other attribute.
    # PRAW loads a submission lazily and fetches its comment listing in the
    # same request, using the comment_sort / comment_limit set at that moment.
    # Reading `submission.title` first would therefore fetch the comments with
    # the default settings and silently ignore the requested sort and limit.
    top_level_comments = get_comments(
        submission=submission, comment_sort=comment_sort, comment_limit=comment_limit
    )

    assert submission.title == submission_title, "Miss-match in submission title!"

    submission_data = {"url": submission.url}

    print("Getting emotion for the title...")
    title_emotion = _top_prediction(
        get_huggingface_response(submission_title, EMOTION_MODEL_ID)
    )
    if title_emotion is not None:
        label, score = title_emotion
        submission_data["title_emotion"] = (
            label if score >= CLASSIFICATION_THRESHOLD else "neutral"
        )

    print("Getting ESG categories for the title...")
    title_category = _top_prediction(
        get_huggingface_response(submission_title, ESG_CATEGORIES_MODEL_ID)
    )
    if title_category is not None:
        label, score = title_category
        # NOTE(review): only the truthiness of the score is checked here, not
        # CLASSIFICATION_THRESHOLD -- confirm that accepting any non-zero
        # score is intended for categories.
        if score:
            submission_data["categories"] = label

    print("Going over comments...")
    comments_emotion_counter, comments = {}, []
    for top_level_comment in top_level_comments:
        comment = top_level_comment.body
        comments.append(comment)

        prediction = _top_prediction(
            get_huggingface_response(comment, EMOTION_MODEL_ID)
        )
        if prediction is None:
            continue
        label, score = prediction
        # Compare the raw score, as is done for the title: rounding first
        # would let e.g. 0.746 pass a 0.75 threshold.
        if score >= CLASSIFICATION_THRESHOLD:
            comments_emotion_counter[label] = (
                comments_emotion_counter.get(label, 0) + 1
            )

    if comments_emotion_counter:
        submission_data["comments_emotion"] = max(
            comments_emotion_counter, key=comments_emotion_counter.get
        )
    else:
        submission_data["comments_emotion"] = NONE_FILLER

    if comments:
        summary_text = " ".join(comments)
        # get_openai_summary returns None on failure; that value is stored
        # unchanged.
        submission_data["comments_summary"] = get_openai_summary(summary_text)
    else:
        submission_data["comments_summary"] = NONE_FILLER

    return submission_data

In [7]:
# Sentinel string stored where a value is unavailable; rows whose entity
# column equals it are skipped by get_submission_data().
NONE_FILLER = "000"
# Comment ordering and limit that get_submission_data() forwards to
# process_submission_data().
COMMENT_SORT = "top"
COMMENT_LIMIT = 15

def get_submission_data(year, month, day, df, entity, top_n=3):
    """Build the record for the first row of ``df`` that carries ``entity``.

    Rows whose ``entity`` column equals ``NONE_FILLER`` are dropped, the first
    ``top_n`` remaining rows are kept, and the first of those is processed.

    Args:
        year, month, day: Values copied into the record unchanged.
        df: DataFrame with ``submission_id``, ``submission_title``,
            ``subreddit_name`` and ``entity`` columns.
        entity: Column to filter on; also stored as the record's ``bucket``.
        top_n: Number of rows kept after filtering.

    Returns:
        The dict from ``process_submission_data`` extended with ``bucket``,
        ``year``, ``day``, ``month``, ``title``, ``source`` and ``tags``; or
        ``None`` when no row survives the filter.

    NOTE(review): only one record is ever produced, so ``top_n`` matters only
    when it is 0 -- confirm whether a list of up to ``top_n`` records was
    intended.
    """
    tagged_rows = df[df[entity] != NONE_FILLER].head(n=top_n)

    first = next(tagged_rows.iterrows(), None)
    if first is None:
        return None
    _, row = first

    post_id, post_title = row["submission_id"], row["submission_title"]
    record = process_submission_data(
        submission_id=post_id,
        submission_title=post_title,
        comment_sort=COMMENT_SORT,
        comment_limit=COMMENT_LIMIT,
    )
    record.update(
        {
            "bucket": entity,
            "year": year,
            "day": day,
            "month": month,
            "title": post_title,
            "source": f"reddit.com/r/{row['subreddit_name']}",
            "tags": row[entity],
        }
    )
    return record

In [8]:
# Load the combined Reddit export for a single day from S3.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError
# for this key -- confirm the object exists and that S3 access (presumably
# via s3fs and AWS credentials) is configured in this environment.
df = pd.read_csv("s3://social-signals-dev-data/reddit/year=2023/month=05/day=17/combined.csv")
df.head()

FileNotFoundError: social-signals-dev-data/reddit/year=2023/month=05/day=17/combined.csv

In [10]:
# Date partition being processed, kept as zero-padded strings.
year = "2023"
month = "05"
day = "17"
# Keep the real result: it used to be overwritten straight away by a
# placeholder dict ({"url": "None"}), which discarded the processed record.
submission_data_organization = get_submission_data(year, month, day, df, entity="organization")

NameError: name 'df' is not defined

In [20]:
from sqlalchemy import create_engine
# Destination table and schema for the DataFrame.to_sql call in this cell.
TABLE_NAME = "social_signals_pos"
SCHEMA = "social_signals_dev"
# to_sql `if_exists` mode: add rows to an existing table.
IF_EXISTS = "append"

def get_engine():
    """Build the SQLAlchemy connection URL for the MySQL database.

    Despite its name, this returns the URL *string*; the caller passes it to
    ``create_engine``. The parts are read from the ``db_username``,
    ``db_password``, ``db_host``, ``db_port`` and ``db_name`` environment
    variables.

    Returns:
        ``"mysql+mysqlconnector://<user>:<password>@<host>:<port>/<name>"``
        with user and password percent-encoded.

    Raises:
        ValueError: if any of the environment variables is unset.
    """
    from urllib.parse import quote

    keys = ("db_username", "db_password", "db_host", "db_port", "db_name")
    settings = {key: os.getenv(key) for key in keys}

    # An unset variable would otherwise be rendered as the text "None" and
    # only fail later, at connection time, with a misleading error.
    missing = [key for key, value in settings.items() if value is None]
    if missing:
        raise ValueError(
            "Missing database environment variable(s): " + ", ".join(missing)
        )

    # Credentials may contain URL delimiters ("@", "/", ":", "%", ...), which
    # must be percent-encoded or the URL is parsed incorrectly.
    username = quote(settings["db_username"], safe="")
    password = quote(settings["db_password"], safe="")
    host = settings["db_host"]
    port = settings["db_port"]
    name = settings["db_name"]

    return f"mysql+mysqlconnector://{username}:{password}@{host}:{port}/{name}"

# Placeholder row used to exercise the database write path.
# NOTE(review): this replaces whatever an earlier cell stored in
# `submission_data_organization` -- confirm the stub is still wanted.
submission_data_organization ={
    "url": "What"
}
db_df = pd.DataFrame(data=[submission_data_organization])
engine = get_engine()  # connection URL string (contains the password)
connection = create_engine(engine, pool_pre_ping=True)
# Never print the raw URL: it embeds the database password and would be saved
# in the notebook output. repr() of a SQLAlchemy URL masks the password.
print(repr(connection.url))

db_df.to_sql(
    TABLE_NAME,
    connection,
    schema=SCHEMA,
    if_exists=IF_EXISTS,
    index=False,
    method="multi"
)

mysql+mysqlconnector://admin:***@social-signals-dev-database.ciikyb1ahzjh.us-east-1.rds.amazonaws.com:3306/social_signals_dev


-1