# fetch mkbhd data

In [None]:
import pandas as pd
import requests
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi
import csv
import os
from dotenv import load_dotenv

# Pull settings (notably API_KEY) from a local .env file into the environment
load_dotenv()

# Key used to authenticate against the YouTube Data API
api_key = os.environ.get('API_KEY')


# CSV listing the video links to process
csvFile = 'links.csv'  # Path to your CSV file
df = pd.read_csv(csvFile)

# Accumulates one metadata record per processed video
video_data = list()

# Function to save captions to a text file
def save_captions_to_txt(video_id, captions_text):
    """Write a video's caption text to ``dataset/captions/<video_id>_captions.txt``.

    The file starts with a ``Video ID: <video_id>`` header line, then a blank
    line, then the caption text. An existing file is overwritten.

    Args:
        video_id: Identifier used in the header line and in the file name.
        captions_text: Caption text stored after the header.
    """
    captions_dir = 'dataset/captions'
    # open() does not create missing parent folders, so make sure the target
    # directory exists before writing.
    os.makedirs(captions_dir, exist_ok=True)
    with open(f'{captions_dir}/{video_id}_captions.txt', 'w', encoding='utf-8') as f:
        f.write(f'Video ID: {video_id}\n\n')
        f.write(captions_text)
        
# Function to extract video ID from YouTube short link
def getVideoId(url):
    """Extract the 11-character video ID from a YouTube Shorts URL.

    Args:
        url: A link of the form ``[http(s)://][www.]youtube.com/shorts/<id>``.
            Leading and trailing whitespace is ignored.

    Returns:
        The video ID string, or None when ``url`` is not a string (for
        example a NaN read from a blank CSV cell) or is not a Shorts link.
    """
    import re

    # re.match raises TypeError on non-string input (None, float NaN, ...);
    # report those as "no ID" so the caller can treat them as invalid URLs.
    if not isinstance(url, str):
        return None

    pattern = r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/shorts\/([A-Za-z0-9_-]{11})'
    match = re.match(pattern, url.strip())
    return match.group(1) if match else None

# Function to fetch metadata using YouTube Data API
def fetchMetadata(videoId, apiKey):
    """Fetch the snippet, statistics and contentDetails parts for one video.

    Args:
        videoId: ID of the video to look up.
        apiKey: YouTube Data API key sent as the ``key`` query parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    url = 'https://www.googleapis.com/youtube/v3/videos'
    # Passing the query as a dict lets requests URL-encode each value instead
    # of splicing raw strings into the URL.
    params = {
        'part': 'snippet,statistics,contentDetails',
        'id': videoId,
        'key': apiKey,
    }
    # requests waits indefinitely by default; bound the wait so one stalled
    # connection cannot hang the whole run.
    response = requests.get(url, params=params, timeout=30)
    return response.json()

# Function to download video and captions using yt-dlp
def downloadVideoAndCaptions(videoUrl, videoId):
    """Download a video as MP4, then request its English VTT subtitles.

    yt-dlp is run twice against ``videoUrl``: once to fetch the video using
    the output template ``dataset/videos/<videoId>.mp4``, and once with the
    media download skipped to write subtitles using the output template
    ``dataset/captions_vtt/captions_vtt_<videoId>.vtt``.

    Args:
        videoUrl: URL handed to yt-dlp.
        videoId: Video ID used to build the output file names.
    """
    dataset_root = 'dataset'
    os.makedirs(dataset_root, exist_ok=True)

    # Options for the MP4 video pass
    video_opts = {
        'format': 'mp4',
        'outtmpl': f'{dataset_root}/videos/{videoId}.mp4',
    }
    # Options for the subtitle-only pass (English, VTT, no media download)
    subtitle_opts = {
        'writesubtitles': True,
        'subtitlesformat': 'vtt',
        'skip_download': True,
        'outtmpl': f'{dataset_root}/captions_vtt/captions_vtt_{videoId}.vtt',
        'subtitleslangs': ['en'],
    }

    # Video pass first, then the subtitle pass, each in its own YoutubeDL session
    for opts in (video_opts, subtitle_opts):
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([videoUrl])

# Function to get captions for a video using YouTubeTranscriptApi
def get_captions(video_id):
    """Return the transcript for ``video_id``, or None if it cannot be fetched.

    Any exception raised while retrieving the transcript is printed to stdout
    and swallowed, so callers only need to check for None.
    """
    try:
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as err:
        print(f"Error retrieving captions for video {video_id}: {err!s}")
    return None

# Iterate through each link in the CSV and process
# save_captions_to_txt() and the final to_csv() both write under dataset/,
# and neither call is guaranteed to find the folders already present, so
# create them once up front.
os.makedirs('dataset/captions', exist_ok=True)

# Iterate through each link in the CSV and process
for index, row in df.iterrows():
    print(f'short number - >  {index+1}')
    videoUrl = row['shorts_links']
    videoId = getVideoId(videoUrl)

    if not videoId:
        print(f"Invalid URL: {videoUrl}")
        continue

    # The module-level key is named `api_key`; `apiKey` exists only as the
    # parameter name inside fetchMetadata, so using it here is a NameError.
    metadata = fetchMetadata(videoId, api_key)

    items = metadata.get('items')
    if not items:
        # Covers both an API error payload (no 'items') and an empty result
        print(f"Metadata not found for Video ID: {videoId}")
        continue

    # Extract relevant fields from the first (only) returned item
    item = items[0]
    snippet = item['snippet']
    statistics = item['statistics']

    videoTitle = snippet['title']
    videoDescription = snippet['description']
    # Read every counter with a default so a missing field does not abort
    # the run, consistent with how likes and comments are handled.
    viewCount = statistics.get('viewCount', 0)
    likeCount = statistics.get('likeCount', 0)
    commentCount = statistics.get('commentCount', 0)
    captionsAvailable = item['contentDetails'].get('caption', 'false')

    # Fetch captions and flatten the transcript segments into one string
    captions = get_captions(videoId)
    if captions:
        captions_text = ' '.join(item['text'] for item in captions)
    else:
        captions_text = 'No captions available'

    # Save captions to a text file
    save_captions_to_txt(videoId, captions_text)

    # Append the metadata to the list
    video_data.append({
        'Video URL': videoUrl,
        'Video ID': videoId,
        'Title': videoTitle,
        'Description': videoDescription,
        'View Count': viewCount,
        'Like Count': likeCount,
        'Comment Count': commentCount,
        'Captions Available': captionsAvailable,
        'Captions Text': captions_text
    })

    # Download the video file and its subtitle track
    downloadVideoAndCaptions(videoUrl, videoId)

# Create a DataFrame from the metadata list
video_df = pd.DataFrame(video_data)

# Save the DataFrame to a CSV file
output_csv_file = 'dataset/extracted_video_metadata.csv'
video_df.to_csv(output_csv_file, index=False)

print(f"Metadata saved to {output_csv_file}")


short number - >  1
[youtube] Extracting URL: https://www.youtube.com/shorts/Cq1sQmFJBF0
[youtube] Cq1sQmFJBF0: Downloading webpage
[youtube] Cq1sQmFJBF0: Downloading ios player API JSON
[youtube] Cq1sQmFJBF0: Downloading mweb player API JSON
[youtube] Cq1sQmFJBF0: Downloading m3u8 information
[info] Cq1sQmFJBF0: Downloading 1 format(s): 18
[download] Destination: dataset/videos/Cq1sQmFJBF0.mp4
[download] 100% of    3.61MiB in 00:00:01 at 2.16MiB/s   
[youtube] Extracting URL: https://www.youtube.com/shorts/Cq1sQmFJBF0
[youtube] Cq1sQmFJBF0: Downloading webpage
[youtube] Cq1sQmFJBF0: Downloading ios player API JSON
[youtube] Cq1sQmFJBF0: Downloading mweb player API JSON
[youtube] Cq1sQmFJBF0: Downloading m3u8 information
[info] Cq1sQmFJBF0: Downloading 1 format(s): 616+251
[info] There are no subtitles for the requested languages
short number - >  2
[youtube] Extracting URL: https://www.youtube.com/shorts/6y-GZp_UUUo
[youtube] 6y-GZp_UUUo: Downloading webpage
[youtube] 6y-GZp_UUUo: Dow

[youtube] EaRWEEXMXEA: Downloading mweb player API JSON
[youtube] EaRWEEXMXEA: Downloading m3u8 information
[info] EaRWEEXMXEA: Downloading 1 format(s): 616+251
[info] There are no subtitles for the requested languages
short number - >  11
[youtube] Extracting URL: https://www.youtube.com/shorts/CToL2qkCd8g
[youtube] CToL2qkCd8g: Downloading webpage
[youtube] CToL2qkCd8g: Downloading ios player API JSON
[youtube] CToL2qkCd8g: Downloading mweb player API JSON
[youtube] CToL2qkCd8g: Downloading m3u8 information
[info] CToL2qkCd8g: Downloading 1 format(s): 18
[download] Destination: dataset/videos/CToL2qkCd8g.mp4
[download] 100% of    4.13MiB in 00:00:03 at 1.07MiB/s   
[youtube] Extracting URL: https://www.youtube.com/shorts/CToL2qkCd8g
[youtube] CToL2qkCd8g: Downloading webpage
[youtube] CToL2qkCd8g: Downloading ios player API JSON
[youtube] CToL2qkCd8g: Downloading mweb player API JSON
[youtube] CToL2qkCd8g: Downloading m3u8 information
[info] CToL2qkCd8g: Downloading 1 format(s): 616+

[download] 100% of    4.26MiB in 00:00:06 at 681.62KiB/s 
[youtube] Extracting URL: https://www.youtube.com/shorts/roheTIg6pmk
[youtube] roheTIg6pmk: Downloading webpage
[youtube] roheTIg6pmk: Downloading ios player API JSON
[youtube] roheTIg6pmk: Downloading mweb player API JSON
[youtube] roheTIg6pmk: Downloading m3u8 information
[info] roheTIg6pmk: Downloading 1 format(s): 616+251-1
[info] There are no subtitles for the requested languages
short number - >  21
[youtube] Extracting URL: https://www.youtube.com/shorts/xSqol6GQRJw
[youtube] xSqol6GQRJw: Downloading webpage
[youtube] xSqol6GQRJw: Downloading ios player API JSON
[youtube] xSqol6GQRJw: Downloading mweb player API JSON
[youtube] xSqol6GQRJw: Downloading m3u8 information
[info] xSqol6GQRJw: Downloading 1 format(s): 18
[download] Destination: dataset/videos/xSqol6GQRJw.mp4
[download] 100% of    4.26MiB in 00:00:04 at 890.73KiB/s 
[youtube] Extracting URL: https://www.youtube.com/shorts/xSqol6GQRJw
[youtube] xSqol6GQRJw: Downl

[youtube] ltpbEQA8Fwc: Downloading mweb player API JSON
[youtube] ltpbEQA8Fwc: Downloading m3u8 information
[info] ltpbEQA8Fwc: Downloading 1 format(s): 18
[download] Destination: dataset/videos/ltpbEQA8Fwc.mp4
[download] 100% of    4.20MiB in 00:00:04 at 880.83KiB/s 
[youtube] Extracting URL: https://www.youtube.com/shorts/ltpbEQA8Fwc
[youtube] ltpbEQA8Fwc: Downloading webpage
[youtube] ltpbEQA8Fwc: Downloading ios player API JSON
[youtube] ltpbEQA8Fwc: Downloading mweb player API JSON
[youtube] ltpbEQA8Fwc: Downloading m3u8 information
[info] ltpbEQA8Fwc: Downloading 1 format(s): 616+251-2
[info] There are no subtitles for the requested languages
short number - >  31
[youtube] Extracting URL: https://www.youtube.com/shorts/1EvNX2_4zfE
[youtube] 1EvNX2_4zfE: Downloading webpage
[youtube] 1EvNX2_4zfE: Downloading ios player API JSON
[youtube] 1EvNX2_4zfE: Downloading mweb player API JSON
[youtube] 1EvNX2_4zfE: Downloading m3u8 information
[info] 1EvNX2_4zfE: Downloading 1 format(s): 18

[info] There are no subtitles for the requested languages
short number - >  40
[youtube] Extracting URL: https://www.youtube.com/shorts/UXB9bnuyZb4
[youtube] UXB9bnuyZb4: Downloading webpage
[youtube] UXB9bnuyZb4: Downloading ios player API JSON
[youtube] UXB9bnuyZb4: Downloading mweb player API JSON
[youtube] UXB9bnuyZb4: Downloading m3u8 information
[info] UXB9bnuyZb4: Downloading 1 format(s): 18
[download] Destination: dataset/videos/UXB9bnuyZb4.mp4
[download] 100% of    4.28MiB in 00:00:04 at 919.24KiB/s 
[youtube] Extracting URL: https://www.youtube.com/shorts/UXB9bnuyZb4
[youtube] UXB9bnuyZb4: Downloading webpage
[youtube] UXB9bnuyZb4: Downloading ios player API JSON
[youtube] UXB9bnuyZb4: Downloading mweb player API JSON
[youtube] UXB9bnuyZb4: Downloading m3u8 information
[info] UXB9bnuyZb4: Downloading 1 format(s): 313+251-2
[info] There are no subtitles for the requested languages
short number - >  41
[youtube] Extracting URL: https://www.youtube.com/shorts/DO1PJYAQj0c
[youtub

# get shorts links


In [11]:
# Number of Shorts on the MKBHD channel, computed as 5 * 14
# (presumably counted from the Shorts page layout — TODO confirm).
5*14


70

# get all the top shorts of mkbhd

## get the html of short page first

(Beware: committing this file will expose the API key.)

In [None]:
import requests

# URL of the YouTube Shorts page for MKBHD
url = 'https://www.youtube.com/@mkbhd/shorts'

# Send a GET request to the URL. requests waits indefinitely by default, so
# bound the wait to keep a stalled connection from hanging the cell.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Save the raw HTML to disk so it can be parsed in a separate step
    with open('mkbhd_shorts_page.html', 'w', encoding='utf-8') as file:
        file.write(response.text)
    print("Webpage saved as mkbhd_shorts_page.html")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


## parse the short page

In [None]:
import re
import csv
# Load the saved Shorts page HTML
html_file = 'mkbhd_shorts_page.html'  # Path to your HTML file

with open(html_file, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Collect every "videoId":"<11 chars>" occurrence embedded in the page
video_id_pattern = r'"videoId":"([A-Za-z0-9_-]{11})"'
video_ids = re.findall(video_id_pattern, html_content)

# dict keys are unique and keep insertion order, so this drops repeats while
# leaving each ID at the position of its first appearance
unique_video_ids = list(dict.fromkeys(video_ids))

# Show what was found
print("Extracted unique video IDs (preserving order):")
for video_id in unique_video_ids:
    print(video_id)

# Write one Shorts link per ID under a 'shorts_links' header row
csv_filename = 'links.csv'

with open(csv_filename, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['shorts_links'])
    csv_writer.writerows(
        [f'https://www.youtube.com/shorts/{vid}'] for vid in unique_video_ids
    )

print(f"Unique video links saved to {csv_filename}")
