In [2]:
import pandas as pd

In [17]:
# Reading the dataframes from CSV files
comments_df = pd.read_csv("data/final_comments_dataframe.csv")
posts_df = pd.read_csv("data/final_posts_dataframe.csv")

# Selecting the relevant columns and renaming them for consistency
comments_df = comments_df[["user", "comment", "created_utc", "score"]]
posts_df = posts_df[["author", "text", "created_utc", "score"]]

# Renaming columns in posts_df to match comments_df
posts_df.columns = ["user", "comment", "created_utc", "score"]

# Dropping duplicates
comments_df.drop_duplicates(inplace=True)
posts_df.drop_duplicates(inplace=True)

# Dropping rows with missing values in the comment columns
comments_df.dropna(subset=["comment"], inplace=True)
posts_df.dropna(subset=["comment"], inplace=True)

# Converting 'created_utc' to datetime and extracting year, month, and day
comments_df['created_utc'] = pd.to_datetime(comments_df['created_utc']).dt.date
posts_df['created_utc'] = pd.to_datetime(posts_df['created_utc']).dt.date

# Concatenating the dataframes into one
combined_df = pd.concat([comments_df, posts_df], ignore_index=True)

# Display the combined dataframe
display(combined_df.head())
combined_df.shape

Unnamed: 0,user,comment,created_utc,score
0,gloomygl,hi kenny call cause tight like rather curious ...,2024-05-18,543
1,gloomygl,came,2024-05-13,40
2,gloomygl,theyre good tho,2024-05-17,56
3,gloomygl,pimp butterfly,2024-05-21,371
4,gloomygl,turn boing,2024-05-10,43


(51474, 4)

In [24]:
descriptive_comments_stats = combined_df["comment"].apply(lambda x: len(x.split())).describe(percentiles=[.01, .1, .25, .5, .75, .90, .95, .99])
descriptive_comments_stats

count    51474.000000
mean        14.933034
std         28.082863
min          1.000000
1%           1.000000
10%          2.000000
25%          4.000000
50%          8.000000
75%         16.000000
90%         32.000000
95%         48.000000
99%        105.000000
max       2165.000000
Name: comment, dtype: float64

In [21]:
comments_score = comments_df["score"]
descriptive_score_stats = comments_df.describe(percentiles=[.01, .1, .25, .5, .75, .90, .95, .99])
descriptive_score_stats

Unnamed: 0,score
count,49908.0
mean,60.415945
std,356.143695
min,-320.0
1%,-10.0
10%,1.0
25%,1.0
50%,2.0
75%,11.0
90%,74.0


In [27]:
filtered_combined_df = combined_df[combined_df["comment"].apply(lambda x: len(x.split()) >= 4 ) & comments_df["score"] >= 1]
# Saving the filtered dataframe to a CSV file
filtered_combined_df.to_csv("data/filtered_comments.csv", index=False)

  filtered_combined_df = combined_df[combined_df["comment"].apply(lambda x: len(x.split()) >= 4 ) & comments_df["score"] >= 1]


In [38]:
filtered_combined_df.head()

Unnamed: 0,user,comment,created_utc,score
0,gloomygl,hi kenny call cause tight like rather curious ...,2024-05-18,543
5,gloomygl,might want call apple music culture feel like ...,2024-05-22,133
7,gloomygl,hiding word lmao know youve screaming certifie...,2024-05-05,6
8,gloomygl,fan real loser kendrick dropping track seven d...,2024-05-15,899
11,gloomygl,two sister piece shit even hit one even there ...,2024-05-05,1


In [44]:
from sentiment_analysis import group_comments
grouped_data=group_comments(data_path="data/filtered_comments.csv", data_stop="2024-03-22")

users = grouped_data["user"]

{user:comments for user, comments in zip(grouped_data["user"], grouped_data["comment"])}

{'00rgus': ['think could happen near future yes number wise right longer number one still definitely top biggest genre world'],
 '07bot4life': ['drake mj kendrick prince j cole'],
 '0Stillmatic0': ['holyyy shit since ive jumped surprise feature like'],
 '1058pm': ['real dont think kendrick level drake take gkmc loses half fan base right love though cant wait guy energize'],
 '15L_Poo_': ['verse trash kendrick got ta come harder supposed dis'],
 '22Two_s': ['gen z dont wake going end popular poopy flavored lollipop'],
 '3xoticP3nguin': ['immature genre idk get lol'],
 '404__LostAngeles': ['one favorite album kendrick lamars good kid maad city id highly recommend listening start finish tell cohesive story'],
 '53Hump': ['well eminem sure favor try tech nne strange muzik record label tech eminem great lyricist imo technnes music bit rhythmic gregorian chant vibe whole catalog appeal like rapper plus hand best live performer stage hip hop community last couple decade'],
 '99probs-allbitche

In [37]:
filtered_combined_df["created_utc"].min(), filtered_combined_df["created_utc"].max()

(datetime.date(2024, 3, 22), datetime.date(2024, 5, 29))