<a href="https://colab.research.google.com/github/27vamsi/Reddit-Data-Scrapper/blob/main/reddit_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install praw

import praw
import pandas as pd
import time
import os
from google.colab import files
import json
import getpass

# Function to scrape posts and comments from a subreddit
def scrape_subreddit(subreddit_name, post_limit=100, sleep_time=1.0):
    """
    Scrape the top posts of a subreddit together with all of their comments.

    Prompts interactively for the Reddit API credentials (client ID, client
    secret and username), then walks ``subreddit.top(limit=post_limit)``.
    A failure while fetching the comments of one post is reported and does
    not abort the run.

    Args:
        subreddit_name (str): Name of the subreddit to scrape
        post_limit (int): Number of posts to collect
        sleep_time (float): Delay between posts in seconds; negative values
            are treated as 0

    Returns:
        tuple: ``(posts_df, comments_df)`` pandas DataFrames with one row per
        post and one row per comment respectively
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
    # time.sleep() raises ValueError on negative input, so clamp once here.
    delay = max(0.0, sleep_time)

    # Get Reddit API credentials securely
    print("Enter your Reddit API credentials:")
    client_id = getpass.getpass("Client ID: ")
    client_secret = getpass.getpass("Client Secret: ")
    username = input("Reddit Username: ")

    # Initialize the Reddit API client
    user_agent = f"script:data_collector:v1.0 (by /u/{username})"
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent
    )

    # Access the subreddit
    subreddit = reddit.subreddit(subreddit_name)

    # Lists to store data
    posts_data = []
    comments_data = []

    # Get top posts from the subreddit
    print(f"Collecting posts from r/{subreddit_name}...")
    for count, post in enumerate(subreddit.top(limit=post_limit), start=1):
        if count % 10 == 0:
            print(f"Processed {count} posts")

        # Extract post data
        post_data = {
            "post_id": post.id,
            "title": post.title,
            "score": post.score,
            "url": post.url,
            "created_utc": post.created_utc,
            "num_comments": post.num_comments,
            "is_self": post.is_self,
            "selftext": post.selftext if post.is_self else "",
            "permalink": post.permalink
        }

        # Prefer the preview image; only the lookup errors a malformed
        # preview structure can produce are tolerated.
        image_url = ""
        preview = getattr(post, 'preview', None)
        if preview:
            try:
                image_url = preview['images'][0]['source']['url']
            except (KeyError, IndexError, TypeError):
                image_url = ""

        # Fall back to the post URL when it points directly at an image.
        # The comparison is case-insensitive so ".JPG" links are recognised.
        if not image_url and post.url.lower().endswith(image_extensions):
            image_url = post.url

        post_data["image_url"] = image_url
        posts_data.append(post_data)

        # Get all comments for this post
        try:
            post.comments.replace_more(limit=None)  # Expand all comment trees
            for comment in post.comments.list():
                comments_data.append({
                    "comment_id": comment.id,
                    "post_id": post.id,
                    "parent_id": comment.parent_id,
                    "body": comment.body,
                    "score": comment.score,
                    "created_utc": comment.created_utc,
                    "permalink": comment.permalink
                })
        except Exception as e:
            # Best effort: report the failure (e.g. an HTTP 429) and move on
            # to the next post instead of losing everything collected so far.
            print(f"Error fetching comments for post {post.id}: {e}")

        # Respect Reddit's rate limits with configurable sleep time
        time.sleep(delay)

    # Convert lists to DataFrames
    posts_df = pd.DataFrame(posts_data)
    comments_df = pd.DataFrame(comments_data)

    print(f"Collected {len(posts_data)} posts and {len(comments_data)} comments.")

    return posts_df, comments_df

# Main function to run the scraper
def main():
    """
    Configure and run the Reddit scraper interactively.

    Asks for the subreddit name, the number of posts and the delay between
    requests, runs scrape_subreddit(), writes the posts and comments as CSV
    and JSON into ``reddit_data_<subreddit>/``, attempts a Colab download of
    the four files and prints summary statistics.

    When no posts come back, a message is printed and nothing is written.
    """

    # Get user input for scraping parameters
    subreddit_name = input("Enter the subreddit name (without r/): ").strip()

    # Get post limit with validation
    while True:
        try:
            post_limit = int(input("Enter number of posts to collect: "))
            if post_limit > 0:
                break
            else:
                print("Please enter a positive number.")
        except ValueError:
            print("Please enter a valid number.")

    # Get sleep time (optional, with default)
    sleep_time = 1.0
    sleep_input = input("Enter delay between requests in seconds (default 1.0): ").strip()
    if sleep_input:
        try:
            requested_delay = float(sleep_input)
            # "not >= 0" also rejects NaN, which compares false to everything.
            if not requested_delay >= 0:
                raise ValueError(sleep_input)
            sleep_time = requested_delay
        except ValueError:
            print("Invalid sleep time, using default 1.0 seconds")

    # Scrape the subreddit
    posts_df, comments_df = scrape_subreddit(subreddit_name, post_limit, sleep_time)

    # An empty result has no 'image_url' column and would divide by zero in
    # the statistics below, so stop before writing empty files.
    if posts_df.empty:
        print(f"No posts were collected from r/{subreddit_name}; nothing to save.")
        return

    # Generate timestamp for unique filenames
    timestamp = int(time.time())

    # Create output directory if it doesn't exist
    output_dir = f"reddit_data_{subreddit_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Define file paths
    posts_csv = f"{output_dir}/{subreddit_name}_posts_{timestamp}.csv"
    comments_csv = f"{output_dir}/{subreddit_name}_comments_{timestamp}.csv"
    posts_json = f"{output_dir}/{subreddit_name}_posts_{timestamp}.json"
    comments_json = f"{output_dir}/{subreddit_name}_comments_{timestamp}.json"

    # Save the data
    posts_df.to_csv(posts_csv, index=False)
    comments_df.to_csv(comments_csv, index=False)

    # Also save as JSON for easier processing
    posts_df.to_json(posts_json, orient='records', indent=2)
    comments_df.to_json(comments_json, orient='records', indent=2)

    # Download the files (for Colab). The download is optional, so a failure
    # is reported as "saved locally"; Exception (rather than a bare except)
    # keeps Ctrl-C / SystemExit working.
    try:
        for path in (posts_csv, comments_csv, posts_json, comments_json):
            files.download(path)
        print("Files downloaded successfully!")
    except Exception:
        print(f"Files saved locally in {output_dir}/")

    # Print summary statistics
    print(f"\nData collection complete!")
    print(f"Posts collected: {len(posts_df)}")
    print(f"Comments collected: {len(comments_df)}")
    print(f"Average comments per post: {len(comments_df)/len(posts_df):.2f}")
    print(f"Posts with images: {sum(1 for url in posts_df['image_url'] if url)}")

# Run the interactive scraper only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0
Enter the subreddit name (without r/): fashion
Enter number of posts to collect (max 1000): 100


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Collecting posts from r/fashion...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 10 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 20 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Error fetching comments for post 15nm3td: received 429 HTTP response


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 30 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 40 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 50 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 60 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 70 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Error fetching comments for post 1aeowwg: received 429 HTTP response


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 80 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 90 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processed 100 posts


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collected 100 posts and 47375 comments.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Data collection complete! Files saved and downloaded.


In [None]:
# Install required libraries
!pip install asyncpraw aiohttp aiofiles pillow tqdm nest_asyncio

import asyncpraw
import asyncio
import aiohttp
import aiofiles
import pandas as pd
import os
import time
from tqdm import tqdm
import json
from PIL import Image
from io import BytesIO
import re
from google.colab import files
import nest_asyncio
import getpass

# Enable nested event loops (fix for Colab)
nest_asyncio.apply()

# Configuration class for better organization
class ScraperConfig:
    """
    Tunable settings for the async scraper plus its output directory layout.

    Attributes are plain instance attributes and may be overwritten after
    construction (main_async() does so for the delay and batch size).
    """

    def __init__(self):
        # File extensions that mark a URL as a direct image link.
        self.image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
        # Extension used when neither the content type nor the URL reveals one.
        self.default_extension = '.jpg'
        self.max_file_size = 10 * 1024 * 1024  # 10MB limit
        self.request_delay = 0.5  # Configurable delay between requests
        self.batch_size = 10  # Process in batches
        # Most recent non-empty subreddit name given to get_output_dirs().
        self.subreddit_name = ''

    def get_output_dirs(self, subreddit_name):
        """
        Generate the output directory structure for a subreddit.

        download_image() asks for the directories with an empty name because
        it is not told which subreddit is being scraped. Without a fallback
        that resolves to ``reddit_data_/images``, a directory nobody creates,
        and every image write fails. An empty name therefore reuses the last
        non-empty name this config was asked about.

        Args:
            subreddit_name (str): Subreddit name, or '' to reuse the most
                recent non-empty one

        Returns:
            dict: Paths under the keys 'base', 'images' and 'processed'
        """
        if subreddit_name:
            self.subreddit_name = subreddit_name
        else:
            subreddit_name = self.subreddit_name

        base_dir = f"reddit_data_{subreddit_name}"
        return {
            'base': base_dir,
            'images': f"{base_dir}/images",
            'processed': f"{base_dir}/processed_data"
        }

# Function to sanitize filenames
def sanitize_filename(filename, max_length=100):
    """
    Produce a filesystem-safe variant of a filename.

    Every character that is not a word character, hyphen, underscore, dot or
    space is replaced with an underscore, and the result is then cut down to
    ``max_length`` characters.

    Args:
        filename (str): Original filename
        max_length (int): Maximum filename length

    Returns:
        str: The cleaned, length-limited filename
    """
    cleaned = re.sub(r'[^\w\-_\. ]', '_', filename)
    # Slicing leaves strings that already fit untouched.
    return cleaned[:max_length]

# Async function to download an image from URL
async def download_image(session, url, post_id, config, index=0):
    """
    Download an image, validate it and save it to the images directory.

    The payload is validated with Pillow in memory before anything is
    written, so rejected downloads never leave a file behind. The images
    directory is created on demand.

    Args:
        session: aiohttp session
        url (str): Image URL
        post_id (str): Reddit post ID
        config: ScraperConfig instance
        index (int): Image index for multiple images per post

    Returns:
        dict | None: ``local_path``, ``original_url``, ``file_size`` and
        ``format`` of the saved image, or None when the download was skipped
        or failed (the reason is printed).
    """
    # Content-type fragments and the extension each one maps to, in the order
    # they are checked.
    content_type_extensions = (
        ('jpeg', '.jpg'),
        ('jpg', '.jpg'),
        ('png', '.png'),
        ('gif', '.gif'),
        ('webp', '.webp'),
    )

    try:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"HTTP {response.status} for URL: {url}")
                return None

            # Cheap early rejection when the server announces the size.
            content_length = response.headers.get('content-length')
            if content_length and int(content_length) > config.max_file_size:
                print(f"Skipping large file: {url} ({content_length} bytes)")
                return None

            # Determine file extension from content type or URL
            content_type = response.headers.get('content-type', '').lower()
            ext = None
            for fragment, candidate in content_type_extensions:
                if fragment in content_type:
                    ext = candidate
                    break
            if ext is None:
                lowered_url = url.lower()
                for img_ext in config.image_extensions:
                    if lowered_url.endswith(img_ext):
                        ext = img_ext
                        break
                else:
                    ext = config.default_extension

            data = await response.read()

        # The Content-Length header is optional, so enforce the limit on the
        # bytes actually received as well.
        if len(data) > config.max_file_size:
            print(f"Skipping large file: {url} ({len(data)} bytes)")
            return None

        # Create safe filename
        safe_post_id = sanitize_filename(post_id)
        images_dir = config.get_output_dirs('')['images']
        filename = f"{images_dir}/{safe_post_id}_{index}{ext}"

        # Verify image integrity before touching the disk; the context
        # manager releases the Pillow image handle.
        try:
            with Image.open(BytesIO(data)) as img:
                img.verify()
                image_format = img.format or 'Unknown'
        except Exception as verify_error:
            print(f"Image verification failed for {filename}: {verify_error}")
            return None

        # Save the image; make sure the target directory exists first so the
        # write cannot fail on a missing folder.
        os.makedirs(images_dir, exist_ok=True)
        async with aiofiles.open(filename, 'wb') as f:
            await f.write(data)

        return {
            'local_path': filename,
            'original_url': url,
            'file_size': len(data),
            'format': image_format
        }

    except Exception as e:
        # Best effort: one failed image must not abort the whole scrape.
        print(f"Error downloading image {url}: {e}")
        return None

# Function to extract all image URLs from a post
def extract_image_urls(post):
    """
    Extract all candidate image URLs from a Reddit post.

    Three sources are inspected in order: the preview images, the post URL
    itself (when it ends with a known image extension) and gallery items.
    Each URL is returned once, in the order it was first seen.

    Args:
        post: Reddit post object

    Returns:
        list: List of unique image URLs (possibly empty)
    """
    urls = []

    def add(candidate):
        # Keep first-seen order and skip empty or repeated URLs.
        if candidate and candidate not in urls:
            urls.append(candidate)

    # Check for preview images
    preview = getattr(post, 'preview', None)
    if preview:
        try:
            for image in preview['images']:
                add(image['source']['url'])
        except (KeyError, TypeError):
            # Malformed preview data: keep what was gathered so far.
            pass

    # Check direct image URL
    post_url = getattr(post, 'url', None)
    if post_url and post_url.lower().endswith(ScraperConfig().image_extensions):
        add(post_url)

    # Check for gallery posts
    if getattr(post, 'is_gallery', False):
        gallery_data = getattr(post, 'gallery_data', None)
        media_metadata = getattr(post, 'media_metadata', None)
        if gallery_data and media_metadata:
            try:
                for item in gallery_data['items']:
                    metadata = media_metadata.get(item['media_id'], {})
                    # Items without an 's' -> 'u' entry are skipped.
                    add(metadata.get('s', {}).get('u'))
            except (KeyError, TypeError, AttributeError):
                # Malformed gallery data: keep what was gathered so far.
                pass

    return urls

async def process_post(session, post, config):
    """
    Build the dataset record for one Reddit post.

    Downloads every image found for the post concurrently, and for posts
    that end up with at least one saved image also collects all comments.

    Args:
        session: aiohttp session
        post: Reddit post object
        config: ScraperConfig instance

    Returns:
        dict | None: The post record (metadata, "images", "comments"), or
        None when no image could be downloaded for the post.
    """
    record = {
        "post_id": post.id,
        "title": post.title,
        "score": post.score,
        "url": post.url,
        "created_utc": post.created_utc,
        "num_comments": post.num_comments,
        "permalink": post.permalink,
        "images": [],
        "comments": []
    }

    # Launch all image downloads for this post at once.
    candidate_urls = extract_image_urls(post)
    if candidate_urls:
        downloads = [
            download_image(session, image_url, post.id, config, position)
            for position, image_url in enumerate(candidate_urls)
        ]
        outcomes = await asyncio.gather(*downloads, return_exceptions=True)

        for outcome in outcomes:
            if isinstance(outcome, dict):
                # A dict is the metadata of a successfully saved image.
                record["images"].append(outcome)
            elif isinstance(outcome, Exception):
                print(f"Image download exception: {outcome}")

    # Posts without any saved image are dropped from the dataset.
    if len(record["images"]) == 0:
        return None

    # Collect the full comment tree; failures are reported and the post is
    # kept with whatever comments were gathered up to that point.
    try:
        forest = await post.comments()
        await forest.replace_more(limit=None)
        flattened = await forest.list()

        for entry in flattened:
            record["comments"].append({
                "comment_id": entry.id,
                "parent_id": entry.parent_id,
                "body": entry.body,
                "score": entry.score,
                "created_utc": entry.created_utc
            })
    except Exception as e:
        print(f"Error fetching comments for post {post.id}: {e}")

    # Pause before handing the record back (delay is configurable).
    await asyncio.sleep(config.request_delay)

    return record

async def scrape_subreddit_async(subreddit_name, post_limit, config, credentials):
    """
    Scrape the top posts of a subreddit, their images and their comments.

    Posts are listed first and then processed in batches of
    ``config.batch_size`` concurrent process_post() calls, with a pause of
    twice ``config.request_delay`` between batches. The Reddit client is
    closed even when scraping fails part-way.

    Args:
        subreddit_name (str): Subreddit to scrape
        post_limit (int): Number of posts to collect
        config: ScraperConfig instance
        credentials (dict): Reddit API credentials with the keys
            'client_id', 'client_secret' and 'user_agent'

    Returns:
        list: One record per post for which at least one image was saved
    """

    # Initialize Reddit API client
    reddit = asyncpraw.Reddit(
        client_id=credentials['client_id'],
        client_secret=credentials['client_secret'],
        user_agent=credentials['user_agent']
    )

    dataset = []
    try:
        subreddit = await reddit.subreddit(subreddit_name)

        # Create output directories
        dirs = config.get_output_dirs(subreddit_name)
        for dir_path in dirs.values():
            os.makedirs(dir_path, exist_ok=True)

        # A batch size below 1 would make range() raise and the progress
        # arithmetic divide by zero, so fall back to one post per batch.
        batch_size = max(1, int(config.batch_size))

        async with aiohttp.ClientSession() as session:
            print(f"Collecting posts from r/{subreddit_name}...")

            # Collect posts first; the listing itself honours the limit.
            posts = [post async for post in subreddit.top(limit=post_limit)]

            print(f"Collected {len(posts)} posts. Processing...")

            total_batches = (len(posts) - 1) // batch_size + 1

            # Process posts in batches
            for batch_number, start in enumerate(range(0, len(posts), batch_size), start=1):
                batch = posts[start:start + batch_size]
                batch_tasks = [process_post(session, post, config) for post in batch]

                # Process batch
                batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True)

                for result in batch_results:
                    if isinstance(result, dict):  # Successful processing
                        dataset.append(result)
                    elif isinstance(result, Exception):
                        print(f"Post processing exception: {result}")

                # Progress update
                print(f"Processed batch {batch_number}/{total_batches}")

                # Batch delay (skipped after the final batch)
                if start + batch_size < len(posts):
                    await asyncio.sleep(config.request_delay * 2)
    finally:
        # Release the client's HTTP session even if scraping failed.
        await reddit.close()

    print(f"Successfully processed {len(dataset)} posts with images.")
    return dataset

async def main_async():
    """
    Interactive entry point for the async scraper.

    Collects the subreddit name, post count, Reddit API credentials and
    optional delay / batch size, runs scrape_subreddit_async(), writes the
    dataset as JSON, prints statistics, packs the output directory into a
    ``.tar.gz`` archive and attempts a Colab download of that archive.
    """
    # Imported here so the archive step does not depend on a shell command.
    import tarfile

    # Get user configuration
    subreddit_name = input("Enter the subreddit name (without r/): ").strip()

    while True:
        try:
            post_limit = int(input("Enter number of posts to collect: "))
            if post_limit > 0:
                break
            else:
                print("Please enter a positive number.")
        except ValueError:
            print("Please enter a valid number.")

    # Get Reddit API credentials securely
    print("\nEnter your Reddit API credentials:")
    client_id = getpass.getpass("Client ID: ")
    client_secret = getpass.getpass("Client Secret: ")
    username = input("Reddit Username: ")

    credentials = {
        'client_id': client_id,
        'client_secret': client_secret,
        'user_agent': f"script:async_data_collector:v1.0 (by /u/{username})"
    }

    # Get optional configuration
    config = ScraperConfig()

    delay_input = input(f"Enter delay between requests in seconds (default {config.request_delay}): ").strip()
    if delay_input:
        try:
            requested_delay = float(delay_input)
            # "not >= 0" also rejects NaN, which compares false to everything.
            if not requested_delay >= 0:
                raise ValueError(delay_input)
            config.request_delay = requested_delay
        except ValueError:
            print(f"Invalid delay, using default {config.request_delay} seconds")

    batch_input = input(f"Enter batch size (default {config.batch_size}): ").strip()
    if batch_input:
        try:
            requested_batch = int(batch_input)
            # Zero or negative batch sizes cannot be used to slice the posts.
            if requested_batch <= 0:
                raise ValueError(batch_input)
            config.batch_size = requested_batch
        except ValueError:
            print(f"Invalid batch size, using default {config.batch_size}")

    # Run the scraper
    dataset = await scrape_subreddit_async(subreddit_name, post_limit, config, credentials)

    # Save results
    timestamp = int(time.time())
    dirs = config.get_output_dirs(subreddit_name)
    dataset_filename = f"{dirs['processed']}/{subreddit_name}_dataset_{timestamp}.json"

    with open(dataset_filename, 'w') as f:
        json.dump(dataset, f, indent=2)

    # Calculate statistics
    total_images = sum(len(post["images"]) for post in dataset)
    total_comments = sum(len(post["comments"]) for post in dataset)
    total_size = sum(sum(img.get('file_size', 0) for img in post["images"]) for post in dataset)

    print(f"\n{'='*50}")
    print(f"Data collection complete!")
    print(f"{'='*50}")
    print(f"Posts collected: {len(dataset)}")
    print(f"Images downloaded: {total_images}")
    print(f"Comments collected: {total_comments}")
    print(f"Total download size: {total_size / (1024*1024):.2f} MB")
    print(f"Dataset saved to: {dataset_filename}")
    print(f"Images saved to: {dirs['images']}")

    # Create compressed archive. tarfile is used instead of a shell command
    # because the subreddit name is user input and must never reach a shell;
    # it also lets a failed archive be reported instead of silently ignored.
    archive_name = f"{subreddit_name}_reddit_data_{timestamp}.tar.gz"
    try:
        with tarfile.open(archive_name, "w:gz") as archive:
            archive.add(dirs["base"])
    except (OSError, tarfile.TarError) as archive_error:
        print(f"Failed to create archive {archive_name}: {archive_error}")
        return
    print(f"Archive created: {archive_name}")

    # Download for Colab users. The download is optional; Exception (rather
    # than a bare except) keeps Ctrl-C / SystemExit working.
    try:
        files.download(archive_name)
        print("Archive downloaded successfully!")
    except Exception:
        print("Archive saved locally.")

# Run the async scraper only when this code is executed directly.
# main_async() is driven to completion on the event loop returned by
# asyncio.get_event_loop(); nest_asyncio.apply() was called earlier in this
# cell to allow nested event loops (needed in Colab).
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main_async())

Enter the subreddit name (without r/): fashion
Enter number of posts to collect (max 1000): 10
Collecting posts from r/fashion...
Collected 10 posts. Processing...


  post_data = await process_post(session, post)
  post_data = await process_post(session, post)
100%|██████████| 10/10 [01:05<00:00,  6.51s/it]


Successfully processed 10 posts with images.

Data collection complete!
Collected 10 posts with 67 images and 5928 comments.
Dataset saved to reddit_data/processed_data/fashion_dataset_1745077903.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>