In [None]:
import snowflake.snowpark.functions as F
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import Session, DataFrame

from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.metrics import accuracy_score

In [None]:
# Grab the Snowpark session that is currently active; it is used below to run SQL.
session = get_active_session()

In [None]:
-- Preview every column of the raw_reddit_posts_comments table.
SELECT *
FROM raw_reddit_posts_comments

# Goal: keep only the comments that correspond to an existing post, and only the posts that have at least one comment.

In [None]:
# Inner join of comments to posts on post_id: a comment row is kept only when
# its post_id exists in raw_reddit_posts, and a post contributes rows only when
# at least one comment references it. The result carries every comment column
# plus rp.post_created_at.
posts_comments_join_sql = """
    SELECT
        rc.*,
        rp.post_created_at
    FROM raw_reddit_posts_comments rc
    INNER JOIN raw_reddit_posts rp
    ON rc.post_id = rp.post_id 
"""
reddit_posts_comments_joined_df = session.sql(posts_comments_join_sql)

# Bare last expression so the notebook renders the joined frame.
reddit_posts_comments_joined_df

In [None]:
# Edited comments: project the edit timestamp and keep the rows where it is
# earlier than the cutoff below.
# NOTE(review): the year 2882 in the cutoff looks unusual — confirm whether it
# is a deliberate sentinel for "never edited" or a typo.
(
    reddit_posts_comments_joined_df
    .select("comment_edited_at")
    .where(F.col("comment_edited_at") < '2882-08-20 08:00:00')
)

# Feature engineering
## Feature engineering could involve the following:
- tokenizing the words in the comment column
- determining the sentiment of the comment using Snowflake Cortex AI
- using `comment_created_at` and `post_created_at` to predict upvotes, or using the time elapsed between the post being created and the user's comment,
e.g. `comment_created_at - post_created_at`
- another feature we can use is the length of the comment
- whether the comment is a reply to another comment or a top-level comment (i.e. a direct reply to the post)
- a summary of the comment, generated from a prompt using Snowflake Cortex AI

All of this is to predict the upvotes of any given comment.