In [1]:
import os
import time

import numpy as np
import pandas as pd
import praw
from dotenv import load_dotenv

In [2]:
# Read the tab-separated training split into `df` and preview its first rows.
train_tsv_path = '../datasets/all_train.tsv'
df = pd.read_csv(train_tsv_path, sep='\t')
df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label
0,0,0,,,Alexithymia,my walgreens offbrand mucinex was engraved wit...,1551641000.0,i.imgur.com,True,awxhir,https://external-preview.redd.it/WylDbZrnbvZdB...,,2.0,12,mildlyinteresting,My Walgreens offbrand Mucinex was engraved wit...,0.84,1,0,0
1,1,1,155885.0,714550.0,RickSisco,,1443822000.0,,True,cvm5uy4,http://i.imgur.com/yxrkYT8.jpg,3n7fld,,5,psbattle_artwork,,,0,2,4
2,2,2,,,VIDCAs17,this concerned sink with a tiny hat,1534727000.0,i.redd.it,True,98pbid,https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...,,2.0,119,pareidolia,This concerned sink with a tiny hat,0.99,0,2,2
3,3,3,,,prometheus1123,hackers leak emails from uae ambassador to us,1496511000.0,aljazeera.com,True,6f2cy5,https://external-preview.redd.it/6fNhdbc6K1vFA...,,1.0,44,neutralnews,Hackers leak emails from UAE ambassador to US,0.92,1,0,0
4,4,4,282323.0,1228398.0,,,1378792000.0,,True,cc5cbon,http://i.imgur.com/M8KTWMx.jpg,1lz1q0,,3,psbattle_artwork,,,0,2,4


In [3]:
# Subset of rows whose binary label equals 1.
is_label_one = df['2_way_label'].eq(1)
df_filtered = df.loc[is_label_one]


In [4]:
# Keep only these seven columns; everything else loaded from the TSV
# (including the "Unnamed: ..." index columns) is discarded from `df`.
kept_columns = [
    'clean_title',
    'id',
    'author',
    'linked_submission_id',
    'subreddit',
    '2_way_label',
    '6_way_label',
]
df = df[kept_columns]

In [5]:
# Discard rows that have no `id` value.
df = df.dropna(subset=['id'])

In [6]:
# Row count per 6-way class. Left as the cell's final expression so the
# notebook displays it, matching how the 2-way counts are shown further down.
df['6_way_label'].value_counts()

6_way_label
0    400433
4    242100
2    142939
1     42332
5     26584
3     23830
Name: count, dtype: int64


In [7]:
# Load variables from a .env file (python-dotenv) so the lookups below can
# find them in the process environment.
load_dotenv()

# Reddit API credentials; each is None when its environment variable is unset.
client_id, client_secret, user_agent = (
    os.getenv(variable)
    for variable in ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_AGENT")
)

# PRAW client shared by every fetch in this notebook.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

def fetch_submission_content(submission_id):
    """Look up one Reddit submission and return its text and permalink.

    Args:
        submission_id: Submission id; any "t3_" substring is stripped before
            the lookup.

    Returns:
        A ``(content, link)`` tuple, where ``content`` is the title and the
        selftext joined by a blank line and ``link`` is the full reddit.com
        URL of the submission. ``(None, None)`` is returned when anything
        goes wrong; the error is printed rather than raised.
    """
    try:
        bare_id = submission_id.replace("t3_", "")
        post = reddit.submission(id=bare_id)
        # Attribute access stays inside the try so a failed lookup
        # (e.g. a 404) is caught here as well.
        text = f"{post.title}\n\n{post.selftext}"
        url = f"https://www.reddit.com{post.permalink}"
    except Exception as err:
        print(f"Error fetching submission {submission_id}: {err}")
        return None, None
    return text, url

In [8]:
# Create a deep copy of the original df before modification
df_nlp = df.copy(deep=True)
df = df[df["linked_submission_id"].isnull()]
df_nlp.dropna(subset=['clean_title'], inplace=True)

In [9]:
# Wall-clock budget for the crawl in seconds (5400 s = 90 minutes). Once it is
# exceeded the loop stops, so a single run may only cover a prefix of df_nlp.
MAX_FETCH_SECONDS = 5400

# Create the output columns up front so they exist even if every request
# fails; otherwise dropna(subset=['content']) in the next cell raises KeyError.
for column in ("content", "link"):
    if column not in df_nlp.columns:
        df_nlp[column] = None

start_time = time.time()
max_duration = MAX_FETCH_SECONDS  # in seconds

# Only the id is needed per row, so iterate over that column instead of
# building a full row Series with iterrows().
for idx, submission_id in df_nlp["id"].items():
    if time.time() - start_time > max_duration:
        print("Time limit reached.")
        break

    content, link = fetch_submission_content(submission_id)
    if content is None:
        # fetch_submission_content has already printed the underlying error.
        print(f"Skipping submission {submission_id} due to error.")
        continue

    # A non-None content is always a non-empty string (title + blank line +
    # selftext), so no further truthiness check is needed before storing.
    df_nlp.at[idx, "content"] = content
    df_nlp.at[idx, "link"] = link

Error fetching submission cgp0lmq: received 404 HTTP response
Skipping submission cgp0lmq due to error.
Error fetching submission cocpysr: received 404 HTTP response
Skipping submission cocpysr due to error.
Error fetching submission ctdk4x3: received 404 HTTP response
Skipping submission ctdk4x3 due to error.
Error fetching submission c69yqn0: received 404 HTTP response
Skipping submission c69yqn0 due to error.
Error fetching submission c8ga79q: received 404 HTTP response
Skipping submission c8ga79q due to error.
Error fetching submission chb5foi: received 404 HTTP response
Skipping submission chb5foi due to error.
Error fetching submission cftdth3: received 404 HTTP response
Skipping submission cftdth3 due to error.
Error fetching submission ckbnyyo: received 404 HTTP response
Skipping submission ckbnyyo due to error.
Error fetching submission er6eggt: received 404 HTTP response
Skipping submission er6eggt due to error.
Error fetching submission cf5c7r5: received 404 HTTP response
Sk

In [10]:
# Keep only rows for which content was stored, then show how many remain.
df_nlp = df_nlp.dropna(subset=['content'])
df_nlp.shape[0]

6115

In [11]:
# Persist the fetched rows as CSV, leaving the DataFrame index out of the file.
fetched_csv_path = "../datasets/fetched_reddit_content_large.csv"
df_nlp.to_csv(fetched_csv_path, index=False)

In [12]:
# Row count per binary class among the rows still present in df_nlp.
binary_labels = df_nlp['2_way_label']
binary_labels.value_counts()

2_way_label
1    3933
0    2182
Name: count, dtype: int64