In [2]:
import pandas as pd

In [17]:
# Reading the dataframes from CSV files
comments_df = pd.read_csv("final_comments_dataframe.csv")
posts_df = pd.read_csv("final_posts_dataframe.csv")

# Selecting the relevant columns and renaming them for consistency
comments_df = comments_df[["user", "comment", "created_utc", "score"]]
posts_df = posts_df[["author", "text", "created_utc", "score"]]

# Renaming columns in posts_df to match comments_df
posts_df.columns = ["user", "comment", "created_utc", "score"]

# Dropping duplicates
comments_df.drop_duplicates(inplace=True)
posts_df.drop_duplicates(inplace=True)

# Dropping rows with missing values in the comment columns
comments_df.dropna(subset=["comment"], inplace=True)
posts_df.dropna(subset=["comment"], inplace=True)

# Converting 'created_utc' to datetime and extracting year, month, and day
comments_df['created_utc'] = pd.to_datetime(comments_df['created_utc']).dt.date
posts_df['created_utc'] = pd.to_datetime(posts_df['created_utc']).dt.date

# Concatenating the dataframes into one
combined_df = pd.concat([comments_df, posts_df], ignore_index=True)

# Display the combined dataframe
display(combined_df.head())
combined_df.shape

Unnamed: 0,user,comment,created_utc,score
0,gloomygl,hi kenny call cause tight like rather curious ...,2024-05-18,543
1,gloomygl,came,2024-05-13,40
2,gloomygl,theyre good tho,2024-05-17,56
3,gloomygl,pimp butterfly,2024-05-21,371
4,gloomygl,turn boing,2024-05-10,43


(51474, 4)

In [24]:
descriptive_comments_stats = combined_df["comment"].apply(lambda x: len(x.split())).describe(percentiles=[.01, .1, .25, .5, .75, .90, .95, .99])
descriptive_comments_stats

count    51474.000000
mean        14.933034
std         28.082863
min          1.000000
1%           1.000000
10%          2.000000
25%          4.000000
50%          8.000000
75%         16.000000
90%         32.000000
95%         48.000000
99%        105.000000
max       2165.000000
Name: comment, dtype: float64

In [21]:
comments_score = comments_df["score"]
descriptive_score_stats = comments_df.describe(percentiles=[.01, .1, .25, .5, .75, .90, .95, .99])
descriptive_score_stats

Unnamed: 0,score
count,49908.0
mean,60.415945
std,356.143695
min,-320.0
1%,-10.0
10%,1.0
25%,1.0
50%,2.0
75%,11.0
90%,74.0


In [27]:
filtered_combined_df = combined_df[combined_df["comment"].apply(lambda x: len(x.split()) >= 4 ) & comments_df["score"] >= 1]
# Saving the filtered dataframe to a CSV file
filtered_combined_df.to_csv("filtered_comments.csv", index=False)

  filtered_combined_df = combined_df[combined_df["comment"].apply(lambda x: len(x.split()) >= 4 ) & comments_df["score"] >= 1]
