### Scraping data using the Reddit API via PRAW (only posts related to the attributes to begin with)

In [11]:
import praw
import os
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime, timedelta, timezone

# Load .env file for credentials
load_dotenv()

# Reddit API authentication. Every credential is read from the environment so
# that no secret is stored in the notebook itself.
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
    username=os.getenv("USERNAMES"),  # Ensure correct key name in your .env file
    password=os.getenv("PASSWORD")
)

# Verify authentication
try:
    print(f"Authenticated as: {reddit.user.me()}")
except Exception as e:
    print(f"Authentication failed: {e}")
    # Re-raise so the cell stops with the original traceback; exit() would end
    # the whole session instead of just this cell.
    raise

# Subreddits to scrape. This cell used to depend on a `subreddits` list left
# behind in the kernel; the names below are the ones listed in this cell's
# recorded output.
subreddits = [
    "smartphones", "Android", "apple", "Samsung",
    "GooglePixel", "motorola", "OnePlus", "Xiaomi",
]

# Oldest creation time to keep. The cutoff also used to come from kernel state.
# NOTE(review): 180 days is inferred from the "last_6_months" file name - confirm.
start_date = datetime.now(timezone.utc) - timedelta(days=180)

# Scraping data: one record per post created on or after start_date
all_posts = []
for subreddit_name in subreddits:
    print(f"Scraping subreddit: {subreddit_name}")
    subreddit = reddit.subreddit(subreddit_name)

    for post in subreddit.new(limit=1000):  # Fetch up to 1000 posts per subreddit
        # created_utc is an epoch timestamp; build a timezone-aware datetime so
        # it can be compared with the (also UTC-aware) start_date.
        post_date = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)
        if post_date >= start_date:
            all_posts.append({
                "subreddit": subreddit_name,
                "title": post.title,
                "selftext": post.selftext,
                "created_utc": post_date.strftime('%Y-%m-%d %H:%M:%S'),
                "score": post.score,
                "num_comments": post.num_comments,
                "url": post.url
            })

# Convert to DataFrame
df = pd.DataFrame(all_posts)

# Target file name for the CSV export. Nothing is written in this cell, so the
# message below reports what was collected rather than claiming a save.
csv_filename = "reddit_posts_last_6_months.csv"
print(f"Collected {len(df)} posts (not yet written to {csv_filename})")



Authenticated as: Ok-Resist-1662
Scraping subreddit: smartphones
Scraping subreddit: Android
Scraping subreddit: apple
Scraping subreddit: Samsung
Scraping subreddit: GooglePixel
Scraping subreddit: motorola
Scraping subreddit: OnePlus
Scraping subreddit: Xiaomi
Data saved to reddit_posts_last_6_months.csv


In [6]:
# Display the (rows, columns) shape of the DataFrame currently bound to `df`
df.shape

(7448, 7)

In [13]:
# Write `df` to the path held in `csv_filename`; index=False leaves the row
# index out of the file.
df.to_csv(csv_filename, index=False)

Authenticated as: Ok-Resist-1662


In [3]:
import praw
import os
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime, timedelta, timezone

# Load .env file for credentials
load_dotenv()

# Reddit API authentication. Every credential is read from the environment so
# that no secret is stored in the notebook itself.
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
    username=os.getenv("USERNAMES"),  # Ensure correct key name in your .env file
    password=os.getenv("PASSWORD")
)

# Verify authentication
try:
    print(f"Authenticated as: {reddit.user.me()}")
except Exception as e:
    print(f"Authentication failed: {e}")
    # Re-raise so the cell stops with the original traceback; exit() would end
    # the whole session instead of just this cell.
    raise


# Define subreddits to scrape
subreddits = ["GooglePixel", "Android", "Smartphones", "TechNews", "Gadgets"]

# Define the time range: the last 6 months. The previous value of 1800 days
# (about five years) did not match this stated window.
start_date = datetime.now(timezone.utc) - timedelta(days=180)

# Case-sensitive substring a post must contain. "Pixel 9" also matches
# "Pixel 9 Pro", so a single test covers both phrases.
KEYWORD = "Pixel 9"

# Scraping data: one record per recent post that mentions the keyword
all_posts = []
for subreddit_name in subreddits:
    print(f"Scraping subreddit: {subreddit_name}")
    subreddit = reddit.subreddit(subreddit_name)

    for post in subreddit.new(limit=1000):  # Fetch up to 1000 posts per subreddit
        # created_utc is an epoch timestamp; build a timezone-aware datetime so
        # it can be compared with the (also UTC-aware) start_date.
        post_date = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)
        if post_date < start_date:
            continue
        # Keep the post when the keyword appears in its title or its body text
        if KEYWORD in post.title or KEYWORD in post.selftext:
            all_posts.append({
                "subreddit": subreddit_name,
                "title": post.title,
                "selftext": post.selftext,
                "created_utc": post_date.strftime('%Y-%m-%d %H:%M:%S'),
                "score": post.score,
                "num_comments": post.num_comments,
                "url": post.url
            })

# Convert to DataFrame. The result is kept in memory only; this cell does not
# write a CSV file.
df = pd.DataFrame(all_posts)

Authenticated as: Ok-Resist-1662
Scraping subreddit: GooglePixel
Scraping subreddit: Android
Scraping subreddit: Smartphones
Scraping subreddit: TechNews
Scraping subreddit: Gadgets


In [6]:
# Display the (rows, columns) shape of the DataFrame currently bound to `df`
df.shape

(241, 7)

In [22]:
# Display the (rows, columns) shape of the DataFrame currently bound to `df`
df.shape

(252, 7)

In [7]:
import praw
import os
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime, timedelta, timezone

# Load .env file for credentials
load_dotenv()

# Reddit API authentication. Every credential is read from the environment so
# that no secret is stored in the notebook itself.
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
    username=os.getenv("USERNAMES"),  # Ensure correct key name in your .env file
    password=os.getenv("PASSWORD")
)

# Verify authentication
try:
    print(f"Authenticated as: {reddit.user.me()}")
except Exception as e:
    print(f"Authentication failed: {e}")
    # Re-raise so the cell stops with the original traceback; exit() would end
    # the whole session instead of just this cell.
    raise

# Define subreddits to scrape
subreddits = ["GooglePixel", "Android", "Smartphones", "TechNews", "Gadgets"]

# Define the time range (last 6 months)
start_date = datetime.now(timezone.utc) - timedelta(days=180)

# Case-sensitive substring a post must contain. "Pixel 9" also matches
# "Pixel 9 Pro", so a single test covers both phrases.
KEYWORD = "Pixel 9"

# Scraping data: one record per recent post that mentions the keyword
all_posts = []
for subreddit_name in subreddits:
    print(f"Scraping subreddit: {subreddit_name}")
    subreddit = reddit.subreddit(subreddit_name)

    for post in subreddit.new(limit=1000):  # Fetch up to 1000 posts per subreddit
        # created_utc is an epoch timestamp; build a timezone-aware datetime so
        # it can be compared with the (also UTC-aware) start_date.
        post_date = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)
        if post_date < start_date:
            continue
        # Keep the post when the keyword appears in its title or its body text
        if KEYWORD in post.title or KEYWORD in post.selftext:
            all_posts.append({
                "post_id": post.id,  # Store post ID for comment retrieval
                "subreddit": subreddit_name,
                "title": post.title,
                "selftext": post.selftext,
                "created_utc": post_date.strftime('%Y-%m-%d %H:%M:%S'),
                "score": post.score,
                "num_comments": post.num_comments,
                "url": post.url
            })

# Convert to DataFrame
df = pd.DataFrame(all_posts)

# Target file name for the CSV export. Nothing is written in this cell, so the
# message below reports what was collected rather than claiming a save.
csv_filename = "reddit_posts_pixel_9_pro.csv"
print(f"Collected {len(df)} posts (not yet written to {csv_filename})")


Authenticated as: Ok-Resist-1662
Scraping subreddit: GooglePixel
Scraping subreddit: Android
Scraping subreddit: Smartphones
Scraping subreddit: TechNews
Scraping subreddit: Gadgets
Data saved to reddit_posts_pixel_9_pro.csv


In [10]:
# Save to CSV: write the DataFrame bound to `df` to a file in the current
# working directory, leaving the row index out, then report the file name.
csv_filename = "reddit_posts_pixel_9_pro.csv"
df.to_csv(csv_filename, index=False)
print(f"Data saved to {csv_filename}")

Data saved to reddit_posts_pixel_9_pro.csv


In [8]:
# Preview the first rows of `df` as a quick sanity check of the columns
df.head()

Unnamed: 0,post_id,subreddit,title,selftext,created_utc,score,num_comments,url
0,1iksqit,GooglePixel,Continued conversation with Google Assistant n...,"Hi, I'm using Pixel 9 Pro and my problem is th...",2025-02-08 17:36:44,1,0,https://www.reddit.com/r/GooglePixel/comments/...
1,1ikrwa8,GooglePixel,Pixel 9 Pro horizontal green line after screen...,A green line shows up close to the bottom of t...,2025-02-08 17:01:37,3,0,https://www.reddit.com/r/GooglePixel/comments/...
2,1ikr1yi,GooglePixel,Help with Google Photos,Helping my 75 years old father-in-law as he wa...,2025-02-08 16:25:56,2,4,https://www.reddit.com/r/GooglePixel/comments/...
3,1ikpwsz,GooglePixel,My thoughts on the Pixel 9 Pro XL,Warning: negative experience ahead. Fanboys be...,2025-02-08 15:36:00,0,39,https://www.reddit.com/r/GooglePixel/comments/...
4,1ikov5c,GooglePixel,"Hi, I upgraded my Galaxy S21 to a Google Pixel...",On the Samsung phone I could download an app c...,2025-02-08 14:49:25,45,11,https://www.reddit.com/r/GooglePixel/comments/...


In [None]:
import praw
import os
import pandas as pd
from dotenv import load_dotenv

# Load .env file for credentials
load_dotenv()

# Reddit API authentication. Every credential is read from the environment so
# that no secret is stored in the notebook itself.
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
    username=os.getenv("USERNAMES"),
    password=os.getenv("PASSWORD")
)

# Load the saved posts CSV. post_id is pinned to str so that an ID consisting
# only of digits is not parsed as a number before it is sent back to the API.
csv_filename = "reddit_posts_pixel_9_pro.csv"
df = pd.read_csv(csv_filename, dtype={"post_id": str})

# Ensure 'selftext' is treated as a string and replace NaN values
df["selftext"] = df["selftext"].fillna("").astype(str)

# Fetch comments and store each as a separate row. A post without comments
# still yields one row, with both comment fields set to None.
expanded_rows = []
for _, row in df.iterrows():
    post_id = row["post_id"]
    try:
        # Post-level fields repeated on every output row of this post
        # ('selftext' is exported under the name 'text').
        post_fields = {
            "post_id": row["post_id"],
            "subreddit": row["subreddit"],
            "title": row["title"],
            "text": row["selftext"],
            "created_utc": row["created_utc"],
            "score": row["score"],
            "num_comments": row["num_comments"],
            "url": row["url"],
        }

        submission = reddit.submission(id=post_id)
        # limit=0 discards the "load more comments" placeholders without
        # fetching them, so only comments from the initial response remain.
        submission.comments.replace_more(limit=0)
        # list() flattens the comment tree: replies are included, not just
        # top-level comments.
        comments = submission.comments.list()

        if not comments:
            expanded_rows.append(
                {**post_fields, "comment_id": None, "comment_text": None}
            )
        for comment in comments:
            expanded_rows.append({
                **post_fields,
                "comment_id": comment.id,  # Store comment ID
                "comment_text": comment.body  # Store the comment
            })
    except Exception as e:
        # Best effort: report the failure and move on to the next post
        print(f"Failed to fetch comments for post {post_id}: {e}")

# Convert to DataFrame
df_expanded = pd.DataFrame(expanded_rows)




Expanded data saved to reddit_posts_expanded_comments.csv


In [24]:
# Save updated data to CSV: write the DataFrame bound to `df_expanded` to a
# file in the current working directory, leaving the row index out, then
# report the file name.
expanded_csv_filename = "reddit_posts_expanded_comments.csv"
df_expanded.to_csv(expanded_csv_filename, index=False)
print(f"Expanded data saved to {expanded_csv_filename}")

Expanded data saved to reddit_posts_expanded_comments.csv


In [25]:
import os
# Show the current working directory; the relative CSV file names used in this
# notebook are resolved against it.
os.getcwd()

'c:\\Users\\hp\\Documents\\GitHub\\LLM-Enhanced-Text-Classification\\Web-scraping-code'