In [29]:
import os
import json
import hashlib
from pathlib import Path
from typing import List, Dict
import pandas as pd
import requests
from dotenv import load_dotenv, find_dotenv
from huggingface_hub import InferenceClient
from youtube_comment_downloader import YoutubeCommentDownloader

# Load environment variables
load_dotenv(find_dotenv())

os.environ['HUGGING_FACE_API_KEY'] = os.getenv("HUGGING_FACE_API_KEY")
os.environ['YOUTUBE_API_KEY'] = os.getenv("YOUTUBE_API_KEY")

# Hugging Face API setup
client = InferenceClient(
    model="mistralai/Mistral-Nemo-Instruct-2407",
    token=os.getenv("HUGGING_FACE_API_KEY")
)

# YouTube API setup
api_key = os.getenv("YOUTUBE_API_KEY")

# Define the YouTubeVideoURL model
class YouTubeVideoURL:
    def __init__(self, video_title: str, video_url: str):
        self.video_title = video_title
        self.video_url = video_url

    def pretty_print(self) -> None:
        print(f"Title: {self.video_title}")
        print(f"URL: {self.video_url}")

# Function to ask LLM to generate search titles
def generate_search_titles(prompt: str) -> List[str]:
    try:
        response = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            stream=False
        )
        raw_titles = response['choices'][0]['message']['content'].split('\n')
        
        # Clean the titles: remove numbers, special characters, and quotes
        titles = [
            title.strip()
            .lstrip("1234567890.- ")  # Remove numbering and leading characters
            .replace('“', '')
            .replace('”', '')
            .replace('"', '')  # Remove any remaining quotes
            for title in raw_titles if title.strip()
        ]
        print(titles)
        return titles
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Function to search for YouTube video links based on search titles
def search_videos(query: str, max_results: int = 15) -> List[YouTubeVideoURL]:
    search_url = f"https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&q={query}&maxResults={max_results}&key={api_key}"
    print(search_url)
    response = requests.get(search_url).json()
    videos = []
    
    for item in response.get('items', []):
        video_title = item['snippet']['title']
        video_id = item['id']['videoId']
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        videos.append(YouTubeVideoURL(video_title, video_url))
    
    return videos

# Function to download comments from YouTube videos
def download_comments_from_url(downloader: YoutubeCommentDownloader, url: str, top_n: int = 100000, min_len: int = 25) -> List[str]:
    comments = downloader.get_comments_from_url(url, sort_by=0)  # sort_by=0 for 'most popular'
    result = []
    count = 0
    for comment in comments:
        text = comment["text"].strip()
        if len(text) < min_len:
            continue
        result.append(text)
        count += 1
        if count >= top_n:
            break
    return result

# Function to save comments to a JSON file, including video title
def save_comments(video_title: str, url: str, comments: List[str], data_dir: str = "data") -> None:
    url_md5 = hashlib.md5(url.encode()).hexdigest()
    filename = f"{data_dir}/{url_md5}.json"
    if Path(filename).is_file():
        print(f"{url} already processed")
        return
    if len(comments) < 5:
        print(f"Not enough comments {len(comments)}")
        return
    print(f"Downloaded {len(comments)} comments from {url}")
    data_to_save = {"title": video_title, "url": url, "comments": comments}
    with open(filename, "w") as writer:
        json.dump(data_to_save, writer, indent=4)

def main():
    # Step 1: Ask the LLM to generate search titles
    search_prompt = """Generate search titles for YouTube videos related to Indian two-wheeler reviews, feedback, and launches.
    eg: Indian Hero Karizma ZMr Review, Bajaj Pulsar Review India, Yamaha R15 launch, etc."""
    search_titles = generate_search_titles(search_prompt)
    
    if not search_titles:
        print("No search titles were generated.")
        return
    
    # Initialize counters
    title_counters = []
    
    # Step 2: Generate relevant YouTube video links based on search titles
    video_links = []
    for title in search_titles:
        videos = search_videos(title, max_results=5)
        video_links.extend(videos)
        title_counters.append({"Search Title": title, "Number of Videos": len(videos)})
    
    if not video_links:
        print("No video links were generated.")
        return
    
    # Initialize comment counters
    comment_counters = []
    
    # Step 3: Download comments from the generated YouTube video links
    downloader = YoutubeCommentDownloader()
    for video in video_links:
        comments = download_comments_from_url(downloader, video.video_url)
        save_comments(video.video_title, video.video_url, comments)
        comment_counters.append({"Video Title": video.video_title, "Number of Comments": len(comments)})
    
    # Convert counters to DataFrames
    title_df = pd.DataFrame(title_counters)
    comment_df = pd.DataFrame(comment_counters)
    
    print("\nSearch Titles and Number of Videos:")
    print(title_df)
    
    print("\nVideos and Number of Comments:")
    print(comment_df)
    
    # Save DataFrames to CSV files
    title_df.to_csv("title_counters.csv", index=False)
    comment_df.to_csv("comment_counters.csv", index=False)

if __name__ == "__main__":
    main()


['**Honda CB350 Himalayan S dreaming Review India**', '**Royal Enfield Meteor 350 vs Classic 350 – Which is Better?**', '**New TVS Apache RTR 200 4V Review – Is it the New King of 200cc?**', '**Suzuki Gixxer 250 launched in India – Your First Look**', '**KTM 20']
https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&q=**Honda CB350 Himalayan S dreaming Review India**&maxResults=5&key=AIzaSyBaJyxakkKsZrfHwB-1mG80BFlV1Q4XTYI
https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&q=**Royal Enfield Meteor 350 vs Classic 350 – Which is Better?**&maxResults=5&key=AIzaSyBaJyxakkKsZrfHwB-1mG80BFlV1Q4XTYI
https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&q=**New TVS Apache RTR 200 4V Review – Is it the New King of 200cc?**&maxResults=5&key=AIzaSyBaJyxakkKsZrfHwB-1mG80BFlV1Q4XTYI
https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&q=**Suzuki Gixxer 250 launched in India – Your First Look**&maxResults=5&key=AIzaSyBaJyxakkKsZrfHwB-1m

In [25]:
query = "Hero bike review"
search_videos(query, max_results= 15) 

https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&q=Hero bike review&maxResults=15&key=AIzaSyBaJyxakkKsZrfHwB-1mG80BFlV1Q4XTYI


[]