In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Read the master comments table from its CSV export.
master_csv = "final_toxicity_classified_comments.csv"
master_df = pd.read_csv(master_csv)

In [3]:
# Subreddit name -> path of that subreddit's comments CSV.
# Every file follows the same pattern: comments_per_subreddit/<name>_comments.csv
subreddit_files = {
    name: f"comments_per_subreddit/{name}_comments.csv"
    for name in (
        "canada",
        "conservative",
        "democrats",
        "politics",
        "news",
        "worldnews",
    )
}

In [4]:
# Build subreddit lookup table: post_id -> subreddit name.
subreddit_lookup = {}
for subreddit, filepath in subreddit_files.items():
    # Only the post_id column is used, so skip parsing the rest of each file.
    sub_df = pd.read_csv(filepath, usecols=['post_id'])
    # A post_id present in several files keeps the subreddit of the file
    # listed last in subreddit_files, because update() overwrites existing keys.
    subreddit_lookup.update(dict.fromkeys(sub_df['post_id'].unique(), subreddit))

In [5]:
# Look up each row's post_id in the lookup table and store the result as a
# new 'subreddit' column; post_ids absent from the lookup come back as NaN.
post_subreddit = master_df['post_id'].map(subreddit_lookup)
master_df['subreddit'] = post_subreddit

In [6]:
# Spot-check the mapping: distinct (post_id, subreddit) pairs, first five shown.
post_subreddit_pairs = master_df.loc[:, ['post_id', 'subreddit']].drop_duplicates()
post_subreddit_pairs.head()

Unnamed: 0,post_id,subreddit
0,1jod9wq,news
464,1jo04q9,news
1066,1jo7n0i,news
2357,1jog871,canada
2576,1jo46t1,canada


In [14]:
# List every column to confirm 'subreddit' was added.
column_index = master_df.columns
column_index

Index(['user', 'post_id', 'comment', 'upvotes', 'responding_to',
       'responding_to_id', 'comment_id', 'links', 'has_sarcasm', 'has_joke',
       'bert_embed', 'preprocessed_text', 'tfidf_embed', 'Title_x',
       'text_sentiment', 'sentiment_difference', 'entity_sentiments', 'fear',
       'anger', 'anticipation', 'trust', 'surprise', 'positive', 'negative',
       'sadness', 'disgust', 'joy', 'toxic_word_count', 'left_wing',
       'righy_wing', 'Title', 'predictions', 'probabilities',
       'predicted_label', 'subreddit'],
      dtype='object')

In [7]:
# Preview the first five rows of the enriched frame.
preview_rows = master_df.head(n=5)
preview_rows

Unnamed: 0,user,post_id,comment,upvotes,responding_to,responding_to_id,comment_id,links,has_sarcasm,has_joke,...,disgust,joy,toxic_word_count,left_wing,righy_wing,Title,predictions,probabilities,predicted_label,subreddit
0,paleo2002,1jod9wq,"Unless they bump that up to 150 billion, this ...",9385,Aggravating_Money992,1jod9wq,mkqwfb9,[],False,False,...,,,0,0,0,SEC continuing $150 million lawsuit against El...,0,0.06,Right,news
1,supercyberlurker,1jod9wq,It's.... depressing.. that I even have to ask ...,1681,Aggravating_Money992,1jod9wq,mkqw7ar,[],False,False,...,0.5,,0,0,0,SEC continuing $150 million lawsuit against El...,0,0.08,Right,news
2,Deranged_Kitsune,1jod9wq,Sounds like doge will be making another round ...,241,Aggravating_Money992,1jod9wq,mkr2i7p,[],False,False,...,0.0,0.0,0,0,0,SEC continuing $150 million lawsuit against El...,0,0.130741,Right,news
3,flyingthroughspace,1jod9wq,Didn't he just pay himself $43 billion to buy ...,682,Aggravating_Money992,1jod9wq,mkqwxan,[],False,False,...,,0.25,0,0,0,SEC continuing $150 million lawsuit against El...,1,0.66,Right,news
4,gucknbuck,1jod9wq,150 million to a billionaire is the equivalent...,198,Aggravating_Money992,1jod9wq,mkr1epy,[],False,False,...,,,0,0,0,SEC continuing $150 million lawsuit against El...,0,0.01,Right,news


In [15]:
# Distribution of the predicted labels.
# NOTE(review): the recorded output shows a single label ('Right') for all
# 15603 rows -- confirm this is expected before relying on the column.
predicted_labels = master_df['predicted_label']
predicted_labels.value_counts()

predicted_label
Right    15603
Name: count, dtype: int64

In [9]:
# Row and column counts of the master frame.
n_rows, n_cols = master_df.shape
(n_rows, n_cols)

(15603, 35)

In [10]:
# Count rows whose post_id had no entry in the lookup (subreddit is null).
unmapped_mask = master_df['subreddit'].isnull()
unmapped_mask.sum()

np.int64(0)

In [11]:
# Same missing-value count via isna(); pandas treats isna and isnull as
# aliases, so this repeats the isnull() check.
missing_subreddit = master_df['subreddit'].isna()
missing_subreddit.sum()

np.int64(0)

In [12]:
# Number of comments attributed to each subreddit, most frequent first.
subreddit_column = master_df['subreddit']
subreddit_column.value_counts()

subreddit
canada          4665
worldnews       3783
politics        3427
news            3053
democrats        463
conservative     212
Name: count, dtype: int64

In [13]:
# Write the frame (now with the 'subreddit' column) to a new CSV;
# index=False leaves the row index out of the file.
output_csv = "complete_final_toxicity_classifier.csv"
master_df.to_csv(output_csv, index=False)