Five Data Sources for the Research Paper
- Reddit
- YouTube Videos
- Google Trends



In [None]:
# YouTube API Data Extraction
import os
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import json
import pandas as pd

# YouTube Data API key. It is read from the environment so the secret never lives
# in the notebook (or its version history). Export YOUTUBE_API_KEY before running;
# any key that was previously committed in this cell should be revoked and rotated.
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to your YouTube Data API v3 key."
    )

# Initialize the YouTube API client
youtube = build('youtube', 'v3', developerKey=api_key)

# List of queries to search for.
# main() runs one YouTube search per entry and also derives the output filenames
# from the entry text (spaces replaced by underscores, lowercased), so editing a
# phrase here also renames the files written for it.
# NOTE(review): "Supplments" below looks like a misspelling of "Supplements"; it is
# left untouched because the string is runtime data — confirm before correcting.
queries = [
    "Rosacea treatment options",
    "Best rosacea cleansers",
    "Best rosacea Moisturizers",
    "Best Rosacea serums",
    "Best Rosacea skin care routines",
    "Supplments for Rosacea",
    "Rosacea skin care routines",
    "How to reduce rosacea redness",
    "Rosacea experience stories",
    "Rosacea triggers and prevention",
    "How to manage rosacea flare-ups",
    "Rosacea and diet",
    "Rosacea laser treatments",
    "Rosacea flushing control",
    "Rosacea natural remedies",
    "How to deal with rosacea in the summer",
    "Rosacea and emotional impact",
    "Rosacea medication reviews",
    "Rosacea and sensitive skin care",
    "How to prevent rosacea from getting worse",
    "Rosacea patient testimonials",
    "Best sunscreen for rosacea"
]

# Function to search for Rosacea-related videos
def search_videos(query, max_results=100, client=None):
    """Search YouTube for videos matching ``query``.

    ``search.list`` only accepts ``maxResults`` values from 0 to 50, so asking for
    more than 50 results in a single request is rejected. This function therefore
    requests pages of at most 50 and follows ``nextPageToken`` until
    ``max_results`` items have been collected or the API has no further pages.

    Args:
        query: Free-text search phrase.
        max_results: Upper bound on the number of search results returned.
        client: Optional YouTube API client exposing ``search().list(...).execute()``.
            Defaults to the notebook-level ``youtube`` client.

    Returns:
        A list of raw search-result items (dicts), at most ``max_results`` long.
    """
    page_size_cap = 50  # documented upper bound for search.list maxResults
    client = youtube if client is None else client

    items = []
    page_token = None
    while len(items) < max_results:
        params = {
            "part": "snippet",
            "q": query,
            "type": "video",
            "maxResults": min(page_size_cap, max_results - len(items)),
        }
        if page_token:
            params["pageToken"] = page_token

        response = client.search().list(**params).execute()
        page_items = response.get('items', [])
        items.extend(page_items)

        page_token = response.get('nextPageToken')
        # Stop when the API has no more pages, or returned an empty page (guards
        # against looping forever on a token that yields nothing).
        if not page_token or not page_items:
            break

    return items[:max_results]

# Function to get additional video details like duration, views, likes
def get_video_details(video_id, client=None):
    """Fetch duration, statistics and snippet metadata for one video.

    A video returned by search can be deleted or made private before this lookup
    runs, in which case ``videos.list`` returns no items. Rather than raising
    ``IndexError`` (which would abort the whole collection run), the same dict is
    returned with every field at its "missing" default.

    Args:
        video_id: YouTube video id.
        client: Optional YouTube API client exposing ``videos().list(...).execute()``.
            Defaults to the notebook-level ``youtube`` client.

    Returns:
        Dict with keys ``duration``, ``view_count``, ``like_count``,
        ``comment_count``, ``tags``, ``category_id``, ``license`` and
        ``default_audio_language``.
    """
    client = youtube if client is None else client
    request = client.videos().list(
        part="contentDetails,statistics,snippet",
        id=video_id
    )
    response = request.execute()

    items = response.get('items') or []
    video = items[0] if items else {}

    # Each part may be absent, so fall back to empty dicts and let .get() supply
    # the per-field defaults below.
    content_details = video.get('contentDetails', {})
    statistics = video.get('statistics', {})
    snippet = video.get('snippet', {})

    return {
        "duration": content_details.get('duration'),
        "view_count": statistics.get('viewCount'),
        "like_count": statistics.get('likeCount'),
        "comment_count": statistics.get('commentCount'),
        "tags": snippet.get('tags', []),
        "category_id": snippet.get('categoryId'),
        # NOTE(review): this reads `licensedContent`, which the API documents as a
        # boolean flag; the licence name itself is `status.license` and would need
        # the "status" part. Kept as-is so stored output does not change — confirm.
        "license": content_details.get('licensedContent', 'Standard YouTube License'),
        "default_audio_language": snippet.get('defaultAudioLanguage', 'Unknown')
    }

# Function to retrieve transcript using video_id
def get_transcript(video_id):
    """Return the transcript of ``video_id`` as one space-joined string.

    Any failure while fetching or assembling the transcript is reported in-band:
    the return value is then a string starting with "Transcript not available: "
    followed by the error text, instead of an exception.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        pieces = []
        for segment in segments:
            pieces.append(segment['text'])
        return " ".join(pieces)
    except Exception as error:
        reason = str(error)
        return f"Transcript not available: {reason}"

# Function to store video details and transcripts
def store_data(video_data, query_name):
    """Persist ``video_data`` twice, as JSON and as an Excel sheet.

    Both files are named after ``query_name`` (``<query_name>.json`` and
    ``<query_name>.xlsx``) and are written relative to the current directory.
    """
    targets = {ext: f"{query_name}.{ext}" for ext in ("json", "xlsx")}

    # JSON copy, pretty-printed with a 4-space indent
    with open(targets["json"], 'w') as handle:
        json.dump(video_data, handle, indent=4)

    # Excel copy, one row per record and no index column
    pd.DataFrame(video_data).to_excel(targets["xlsx"], index=False)

# Main function to search videos, retrieve transcripts, and additional details
def main():
    """Collect and save one dataset per entry in ``queries``.

    For each query: run the search, build one record per hit (search snippet
    fields, transcript, then the detail fields), and hand the records to
    ``store_data`` under a filename derived from the query text.
    """
    # Fields copied verbatim, in this order, from get_video_details() into each record
    detail_fields = (
        "duration",
        "view_count",
        "like_count",
        "comment_count",
        "tags",
        "category_id",
        "license",
        "default_audio_language",
    )

    for query in queries:
        print(f"Processing query: {query}")
        hits = search_videos(query)
        records = []

        for hit in hits:
            vid = hit['id']['videoId']
            snippet = hit['snippet']
            record = {
                "video_id": vid,
                "title": snippet['title'],
                "channel_title": snippet['channelTitle'],
                "description": snippet['description'],
                "published_at": snippet['publishedAt'],
                "transcript": get_transcript(vid),
            }

            # Second API round-trip for duration, statistics, tags, ...
            details = get_video_details(vid)
            for field in detail_fields:
                record[field] = details[field]

            records.append(record)

        # Filename stem: the query with spaces turned into underscores, lowercased
        stem = query.replace(" ", "_").lower()
        store_data(records, stem)

# Run the main function
if __name__ == "__main__":
    main()


In [3]:
# Reddit API for Data
import csv
import os
import time
from datetime import datetime, timedelta, timezone

import praw

# Reddit API credentials are read from the environment so the secrets never live
# in the notebook. Export REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running;
# credentials that were previously committed in this cell should be regenerated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='RosaceaResearchBot/1.0 by u/DaveShevy',
)

def get_rosacea_posts(subreddit_name, end_date, limit=1000, start_date=None):
    """Collect posts from a subreddit that mention "rosacea" in title or body.

    Walks the subreddit's "new" listing, skips posts created after ``end_date``,
    and keeps those whose title or selftext contains "rosacea" (case-insensitive).
    Each kept post also carries up to 10 of its comments via ``get_comments``.

    Args:
        subreddit_name: Subreddit name without the ``r/`` prefix.
        end_date: ``datetime``; posts created after it are skipped.
        limit: Stop once this many matching posts have been collected.
        start_date: Optional ``datetime`` lower bound. When given, iteration
            stops at the first post older than it (the "new" listing is expected
            to be ordered newest-first). ``None`` (the default) applies no
            lower bound.

    Returns:
        List of dicts, one per matching post. ``created_utc`` is formatted in UTC
        as ``YYYY-MM-DD HH:MM:SS``.
    """
    subreddit = reddit.subreddit(subreddit_name)
    # Loop-invariant bounds, computed once instead of per post.
    end_ts = end_date.timestamp()
    start_ts = start_date.timestamp() if start_date is not None else None
    posts = []

    for post in subreddit.new(limit=None):
        if post.created_utc > end_ts:
            continue
        if start_ts is not None and post.created_utc < start_ts:
            break
        if 'rosacea' in post.title.lower() or 'rosacea' in post.selftext.lower():
            posts.append({
                'subreddit': subreddit_name,
                'title': post.title,
                'body': post.selftext,
                'url': f'https://www.reddit.com{post.permalink}',
                'author': str(post.author),
                'score': post.score,
                'upvote_ratio': post.upvote_ratio,
                'num_comments': post.num_comments,
                # Convert with an explicit UTC tz: a bare fromtimestamp() yields
                # local time, which contradicts the field's "utc" label.
                'created_utc': datetime.fromtimestamp(
                    post.created_utc, tz=timezone.utc
                ).strftime('%Y-%m-%d %H:%M:%S'),
                'is_original_content': post.is_original_content,
                'over_18': post.over_18,
                'spoiler': post.spoiler,
                'stickied': post.stickied,
                'comments': get_comments(post, limit=10)
            })
            print(f"Added post: {post.title}")

        if len(posts) >= limit:
            break

        # Small pause between posts to stay gentle on the API.
        time.sleep(0.1)

    return posts

def get_comments(post, limit=10):
    """Return up to ``limit`` comments of ``post`` as plain dicts.

    ``replace_more(limit=0)`` is called first so the flattened comment list holds
    only real comments and no "load more" placeholders.

    Args:
        post: Submission-like object exposing ``comments.replace_more`` and
            ``comments.list``.
        limit: Maximum number of comments taken from the flattened list.

    Returns:
        List of dicts with ``body``, ``author`` (stringified, so a missing author
        becomes ``'None'``), ``score``, ``created_utc`` (formatted in UTC as
        ``YYYY-MM-DD HH:MM:SS``) and ``is_submitter``.
    """
    post.comments.replace_more(limit=0)
    return [{
        'body': comment.body,
        'author': str(comment.author),
        'score': comment.score,
        # Explicit UTC conversion: a bare fromtimestamp() would give local time,
        # contradicting the field's "utc" label.
        'created_utc': datetime.fromtimestamp(
            comment.created_utc, tz=timezone.utc
        ).strftime('%Y-%m-%d %H:%M:%S'),
        'is_submitter': comment.is_submitter
    } for comment in post.comments.list()[:limit]]

def save_to_csv(all_posts, filename):
    """Write posts and their comments to a UTF-8 CSV file.

    The file has one row per (post, comment) pair, with the post columns repeated
    on every row. A post without comments still gets one row, with the six comment
    columns left blank — otherwise such posts would vanish from the export.

    Args:
        all_posts: Mapping of subreddit name to a list of post dicts.
        filename: Path of the CSV file to create (overwritten if it exists).
    """
    post_keys = (
        'subreddit', 'title', 'url', 'body', 'author', 'score', 'upvote_ratio',
        'num_comments', 'created_utc', 'is_original_content', 'over_18',
        'spoiler', 'stickied',
    )
    comment_keys = ('body', 'author', 'score', 'created_utc', 'is_submitter')

    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([
            'Subreddit', 'Post Title', 'Post URL', 'Post Body', 'Post Author', 'Post Score',
            'Upvote Ratio', 'Num Comments', 'Post Created UTC', 'Is Original Content',
            'Over 18', 'Spoiler', 'Stickied', 'Comment Number', 'Comment', 'Comment Author',
            'Comment Score', 'Comment Created UTC', 'Is Submitter'
        ])

        for posts in all_posts.values():
            for post in posts:
                post_cells = [post[key] for key in post_keys]
                comments = post.get('comments') or []

                if not comments:
                    # Blank comment number plus blank comment fields keeps the
                    # row aligned with the header.
                    writer.writerow(post_cells + [''] * (1 + len(comment_keys)))
                    continue

                for number, comment in enumerate(comments, 1):
                    comment_cells = [comment[key] for key in comment_keys]
                    writer.writerow(post_cells + [number] + comment_cells)

def main():
    """Fetch rosacea posts for each target subreddit and export them to one CSV.

    The date window is used for the progress message and the output filename.
    NOTE(review): only the window's end is passed to ``get_rosacea_posts``; the
    2020-01-01 start is not applied as a filter here — confirm that is intended.
    """
    targets = ['Rosacea']
    window_end = datetime.now()
    window_start = datetime(2020, 1, 1)

    collected = {}
    for name in targets:
        collected.setdefault(name, [])

    for name in targets:
        print(f"Fetching posts from r/{name} from {window_start.date()} to {window_end.date()}...")
        fetched = get_rosacea_posts(name, window_end)
        collected[name].extend(fetched)
        print(f"Retrieved {len(fetched)} posts from r/{name}")

    start_tag = window_start.strftime("%Y%m%d")
    end_tag = window_end.strftime("%Y%m%d")
    filename = f'rosacea_posts_{start_tag}_{end_tag}.csv'
    save_to_csv(collected, filename)
    print(f"Data saved to {filename}")

    # Per-subreddit totals
    for name in collected:
        print(f"\nTotal posts from r/{name}: {len(collected[name])}")

if __name__ == "__main__":
    main()

Fetching posts from r/Rosacea from 2020-01-01 to 2024-11-01...
Added post: Tips
Added post: Is it rosacea?? (Type 1)
Added post: ISO silicone-free hydrating moisturizer for dry rosacea skin under $15? 
Added post: Soolantra wash off
Added post: Tips for rosacea and make up?
Added post: Does VBeam laser treatment help with facial and ocular rosacea? 
Added post: Smoking socially and having rosacea. Anybody else in the same boat as me?
Added post: Thank you to this community 
Added post: Difficult managing flare. Need advice
Added post: Is this the beginning of rosacea/sun damage or damaged skin barrier? 
Added post: Triple cream for rosacea? Is this normal???
Added post: Normal for Triple cream for rosacea?
Added post: Ivermectin cream
Added post: Rosacea pustles after 2 months of using Soolantra!!
Added post: Opinions/Advice possible type 1
Added post: Could this be a rosacea flareup after fraxel laser?
Added post: What has helped with your type 1 the most?
Added post: Rosacea flare up