In [2]:
# --------------------------------------------
# 0. Install Required Packages
# --------------------------------------------
!pip install --quiet praw google-api-python-client openai \
    scikit-learn==1.3.2 matplotlib seaborn \
    sentence-transformers transformers gensim nltk wordcloud textblob

!pip install --upgrade pandas

# --------------------------------------------
# 1. Import Libraries & Mount Google Drive
# --------------------------------------------
from google.colab import drive
import os, re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from wordcloud import WordCloud
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import praw
import gensim
from gensim import corpora

# Download the NLTK 'stopwords' and 'punkt' resources (a no-op message is
# printed when they are already present).
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Attach Google Drive to the Colab VM; force_remount=True asks for a
# remount even if the mount point is already in use.
drive.mount('/content/drive', force_remount=True)

# Destination folder for everything this notebook writes; created if it
# does not exist yet.
output_dir = '/content/drive/MyDrive/Colab/DAI_AID/extracted_data/reddit_reviews'
os.makedirs(output_dir, exist_ok=True)

# --------------------------------------------
# 2. Load API Keys
# --------------------------------------------
# Parse the credentials file into a dict. Each useful line has the form
# `name=value`; lines without an '=' (blank lines, stray text) are ignored,
# as are lines whose name is empty or starts with '#'.
reddit_keys = {}
with open('/content/drive/MyDrive/Colab/DAI_AID/keys/reddit_keys.txt') as f:
    for line in f:
        # Split on the first '=' only, so a value may itself contain '='.
        key, separator, value = line.partition('=')
        key = key.strip()
        if not separator or not key or key.startswith('#'):
            continue
        # Strip both sides of the '=': a line such as "client_id = abc"
        # must be stored under "client_id", otherwise the key lookups that
        # build the Reddit client fail with KeyError.
        reddit_keys[key] = value.strip()

# --------------------------------------------
# 3. Search & Collect Comments
# --------------------------------------------
# Build the PRAW client from the three credentials read from the key file.
# A missing entry raises KeyError here, before any network call is made.
_credential_names = ('client_id', 'client_secret', 'user_agent')
_credentials = {name: reddit_keys[name] for name in _credential_names}
reddit = praw.Reddit(check_for_async=False, **_credentials)

# Search the 'all' subreddit and keep at most 10 matching submissions.
search_term = "apple airpods max reviews"
subreddit = reddit.subreddit('all')
search_results = subreddit.search(search_term, limit=10)

# Accumulator for the scraped posts: one dict per submission.
all_data = []
def extract_comment_thread(comment):
    """Convert a comment and its replies into a nested plain dict.

    Args:
        comment: An object exposing ``body``, ``author`` and an iterable
            ``replies`` attribute.

    Returns:
        A dict with keys ``'body'``, ``'author'`` (always a string, via
        ``str()``) and ``'replies'`` (a list of dicts of the same shape).
        Replies that have no ``body`` attribute are left out.
    """
    body = comment.body
    author = str(comment.author)

    nested_replies = []
    for child in comment.replies:
        # Skip anything that is not a real comment (no text to keep).
        if not hasattr(child, "body"):
            continue
        nested_replies.append(extract_comment_thread(child))

    return {'body': body, 'author': author, 'replies': nested_replies}

# Walk the search results and collect every comment thread of each post.
for post in search_results:
    print(f"Fetching: {post.title}")
    comments = []
    try:
        # limit=None: ask PRAW to resolve all "more comments" placeholders
        # before the forest is iterated.
        post.comments.replace_more(limit=None)
        for top_comment in post.comments:
            if not hasattr(top_comment, "body"):
                continue
            comments.append(extract_comment_thread(top_comment))
    except Exception as e:
        # Best effort: report the failure and keep whatever was collected
        # for this post so far, then move on to the next one.
        print(f"Error: {e}")

    post_record = {
        'title': post.title,
        'url': f"https://www.reddit.com{post.permalink}",
        'post_id': post.id,
        'comments': comments
    }
    all_data.append(post_record)

print(f"\nDone. {len(all_data)} Reddit posts scraped.")

# --------------------------------------------
# 4. Clean & Process Comments (Thread Flattening)
# --------------------------------------------
def flatten_comments(comments, post_id, post_title, parent_id=None, depth=0):
    """Flatten nested comment dicts into a flat list of row dicts.

    Rows are produced depth-first: each comment is followed immediately by
    all of its descendants.

    Args:
        comments: Iterable of dicts with optional keys ``'author'``,
            ``'body'`` and ``'replies'`` (a list of dicts of the same
            shape). ``None`` or an empty iterable yields no rows.
        post_id: Identifier of the post, copied onto every row.
        post_title: Title of the post, copied onto every row.
        parent_id: Value stored as ``'parent_comment_id'`` for the comments
            at this level; ``None`` for top-level comments.
        depth: Nesting level of ``comments``. It is only passed along the
            recursion and is not written to the rows.

    Returns:
        A list of dicts with keys ``'post_id'``, ``'post_title'``,
        ``'comment_author'``, ``'comment_body'`` and
        ``'parent_comment_id'``.
    """
    rows = []
    for comment in comments or []:
        body = comment.get('body')
        rows.append({
            'post_id': post_id,
            'post_title': post_title,
            'comment_author': comment.get('author'),
            'comment_body': body,
            'parent_comment_id': parent_id,
        })

        replies = comment.get('replies')
        if isinstance(replies, list) and replies:
            # Children are linked to their parent by the first 50 characters
            # of the parent's text. A parent without text gets an empty
            # marker instead of crashing on None[:50]; note this prefix is
            # not guaranteed to be unique.
            child_parent_id = body[:50] if isinstance(body, str) else ''
            rows.extend(
                flatten_comments(
                    replies,
                    post_id,
                    post_title,
                    parent_id=child_parent_id,
                    depth=depth + 1,
                )
            )
    return rows

# Flatten every scraped post into one list of row dicts.
csv_rows = []
for post in all_data:
    csv_rows.extend(flatten_comments(post['comments'], post['post_id'], post['title']))

# Name the columns explicitly (same names and order the row dicts use) so
# that a scrape with zero comments still yields a frame that has a
# 'comment_body' column instead of failing later with KeyError.
df = pd.DataFrame(
    csv_rows,
    columns=[
        'post_id',
        'post_title',
        'comment_author',
        'comment_body',
        'parent_comment_id',
    ],
)

# --------------------------------------------
# 5. Sentiment Score + Weight
# --------------------------------------------
def preserve_meaningful_clean(text):
    """Lightly clean a comment body while keeping its wording intact.

    Removes URLs (anything starting with ``http`` up to the next
    whitespace) and the ``[removed]`` / ``[deleted]`` placeholders, then
    collapses every run of whitespace to a single space and trims the ends.

    Args:
        text: The raw comment body. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as empty.

    Returns:
        The cleaned string, or ``""`` when nothing meaningful is left.
    """
    # A missing body must not turn into the literal words "None" / "nan",
    # which would then be scored and counted like real text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\[removed\]|\[deleted\]", "", text)
    # Collapse whitespace last so the gaps left by the removals above do
    # not survive as double spaces.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def get_sentiment_score(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity

# Per-comment features: cleaned text, its sentiment polarity, and its
# length in whitespace-separated words.
df['cleaned'] = df['comment_body'].apply(preserve_meaningful_clean)
df['sentiment'] = df['cleaned'].apply(get_sentiment_score)
df['length'] = df['cleaned'].apply(lambda cleaned_text: len(cleaned_text.split()))

# Weight = 60% sentiment + 40% relative length. The length is divided by
# (longest comment + 1), so the ratio is always below 1 and the division
# never hits zero.
# NOTE(review): sentiment is used with its sign, so negative comments
# lower the weight — confirm this is intended.
_relative_length = df['length'] / (df['length'].max() + 1)
df['weight'] = 0.6 * df['sentiment'] + 0.4 * _relative_length

# --------------------------------------------
# 6. Save to Drive
# --------------------------------------------
# Write the full frame (original columns plus the derived 'cleaned',
# 'sentiment', 'length' and 'weight') to a CSV inside output_dir.
output_path = f"{output_dir}/reddit_airpodsmax_threaded_comments.csv"
# index=False: leave the DataFrame's row index out of the file.
df.to_csv(output_path, index=False)
print(f"Threaded comment data saved to CSV at: {output_path}")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Mounted at /content/drive
Fetching: One Month with iPhone 16 Pro as a New Apple User – My Honest Review
Fetching: AirPods Pro 2: Long-Term Review 
Fetching: Apple AirPods Max Review: The Audiophile's Perspective
Fetching: Updated AirPods Max Tuning, as measured on Soundguys B&K 5128
Fetching: Sony XM3 vs XM5 vs AirPods Max vs Bose QC ultra headphones
Fetching: Got scammed with Apple AirPods Max on Facebook marketplace
Fetching: Apple AirPods Max vs Sony WH1000XM5: My Review
Fetching: The most honest airpod max v2 review 
Fetching: Apple airpod max review
Fetching: Apple Max V2 headphones from Mike YM

Done. 10 Reddit posts scraped.
Threaded comment data saved to CSV at: /content/drive/MyDrive/Colab/DAI_AID/extracted_data/reddit_reviews/reddit_airpodsmax_threaded_comments.csv
