In [None]:
# pandas needs an Excel engine such as openpyxl to write the .xlsx files
# produced by the scraper below, so install it alongside the API client.
%pip install google-api-python-client pandas openpyxl




In [None]:
# Transcript download support used by YouTubeQueryScraper.fetch_transcripts.
%pip install youtube-transcript-api




Fetching YouTube data

In [None]:
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import re


class YouTubeQueryScraper:
    """Collect YouTube search results for a query and export them to Excel.

    The pipeline (see ``run``) searches for videos, fetches video and channel
    metadata, then downloads top-level comments and transcripts. Each stage
    stores its result on the instance (``video_ids``, ``video_data``,
    ``df_videos``, ``channel_data``, ``df_channels``, ``df_comments``,
    ``df_transcripts``) so later stages can reuse it.

    Args:
        api_key: Developer key passed to ``build("youtube", "v3", ...)``.
        query: Search string sent to ``search().list``.
        max_results: Upper bound on the number of video IDs collected.
    """

    # Page size for search requests and batch size for ID lookups.
    _PAGE_SIZE = 50

    # LinkedIn URL up to the first whitespace, ")" or "]".
    _LINKEDIN_RE = re.compile(
        r"https?://(?:www\.)?linkedin\.com/[^\s\)\]]+", re.IGNORECASE
    )

    def __init__(self, api_key, query, max_results=500):
        self.api_key = api_key
        self.query = query
        self.max_results = max_results
        self.youtube = build("youtube", "v3", developerKey=api_key)

        # Start empty so a stage called out of order yields an empty result
        # instead of raising AttributeError.
        self.video_ids = []
        self.video_data = []
        self.channel_data = []

    def _print_df(self, df, name):
        """Print the shape and the first rows of *df* under the label *name*."""
        print(f"\n==== {name} DataFrame ({df.shape[0]} rows, {df.shape[1]} columns) ====")
        print(df.head())
        print("======================================\n")

    # --------------------------------------------------------
    # EXTRACT LINKEDIN URLS
    # --------------------------------------------------------
    def extract_linkedin(self, text):
        """Return the distinct LinkedIn URLs found in *text*.

        Links are returned in order of first appearance with trailing
        sentence punctuation removed. Returns ``None`` when *text* is empty
        or contains no LinkedIn link.
        """
        if not text:
            return None

        cleaned = (
            match.rstrip(".,;:!?")
            for match in YouTubeQueryScraper._LINKEDIN_RE.findall(text)
        )
        # dict.fromkeys de-duplicates while keeping a stable order, unlike set().
        links = list(dict.fromkeys(link for link in cleaned if link))
        return links or None

    # --------------------------------------------------------
    # 1. SEARCH VIDEOS
    # --------------------------------------------------------
    def search_videos(self):
        """Search for ``self.query`` and collect at most ``max_results`` video IDs.

        Pages through ``search().list`` until enough IDs are gathered or no
        ``nextPageToken`` is returned. Stores the list on ``self.video_ids``
        and returns it.
        """
        print("Searching videos for query:", self.query)
        video_ids = []
        next_page = None

        while len(video_ids) < self.max_results:
            remaining = self.max_results - len(video_ids)
            response = self.youtube.search().list(
                q=self.query,
                part="id",
                type="video",
                # Ask only for what is still needed on the final page.
                maxResults=min(self._PAGE_SIZE, remaining),
                pageToken=next_page,
            ).execute()

            for item in response.get("items", []):
                video_id = item.get("id", {}).get("videoId")
                if video_id:
                    video_ids.append(video_id)

            next_page = response.get("nextPageToken")
            if not next_page:
                break

        # Guard against a page that returned more items than requested.
        video_ids = video_ids[:max(self.max_results, 0)]

        print("Fetched", len(video_ids), "video IDs")
        self.video_ids = video_ids
        return video_ids

    # --------------------------------------------------------
    # 2. FETCH VIDEO DETAILS
    # --------------------------------------------------------
    def fetch_video_details(self):
        """Fetch snippet, statistics and content details for ``self.video_ids``.

        IDs are looked up in batches of ``_PAGE_SIZE``. The rows are stored on
        ``self.video_data`` (list of dicts) and ``self.df_videos`` (DataFrame);
        the DataFrame is returned.
        """
        print("Fetching video metadata...")
        video_data = []

        for start in range(0, len(self.video_ids), self._PAGE_SIZE):
            batch = self.video_ids[start:start + self._PAGE_SIZE]

            response = self.youtube.videos().list(
                part="snippet,statistics,contentDetails",
                id=",".join(batch),
            ).execute()

            for item in response.get("items", []):
                # Any part may be absent from an item; fall back to empty
                # dicts so the row is still produced with None values.
                snip = item.get("snippet", {})
                stats = item.get("statistics", {})
                content = item.get("contentDetails", {})

                video_data.append({
                    "video_id": item["id"],
                    "title": snip.get("title"),
                    "description": snip.get("description"),
                    "linkedin_links": self.extract_linkedin(snip.get("description")),
                    "published_at": snip.get("publishedAt"),
                    "tags": snip.get("tags"),
                    "channel_id": snip.get("channelId"),
                    "channel_title": snip.get("channelTitle"),
                    "views": stats.get("viewCount"),
                    "likes": stats.get("likeCount"),
                    "comments": stats.get("commentCount"),
                    "duration": content.get("duration"),
                })

        self.df_videos = pd.DataFrame(video_data)
        self.video_data = video_data

        self._print_df(self.df_videos, "Videos")
        return self.df_videos

    # --------------------------------------------------------
    # 3. FETCH CHANNEL DETAILS
    # --------------------------------------------------------
    def fetch_channel_details(self):
        """Fetch snippet and statistics for every channel in ``self.video_data``.

        Stores the rows on ``self.channel_data`` and ``self.df_channels`` and
        returns the DataFrame.
        """
        print("Fetching channel metadata...")
        # Unique, non-empty channel IDs in order of first appearance.
        channel_ids = list(dict.fromkeys(
            video["channel_id"]
            for video in self.video_data
            if video.get("channel_id")
        ))
        channel_data = []

        for start in range(0, len(channel_ids), self._PAGE_SIZE):
            batch = channel_ids[start:start + self._PAGE_SIZE]

            response = self.youtube.channels().list(
                part="snippet,statistics",
                id=",".join(batch),
            ).execute()

            for item in response.get("items", []):
                snip = item.get("snippet", {})
                stats = item.get("statistics", {})

                channel_data.append({
                    "channel_id": item["id"],
                    "channel_title": snip.get("title"),
                    "channel_description": snip.get("description"),
                    "subscribers": stats.get("subscriberCount"),
                    "country": snip.get("country"),
                    "total_videos": stats.get("videoCount"),
                    "total_views": stats.get("viewCount"),
                    "linkedin_links": self.extract_linkedin(snip.get("description")),
                })

        self.df_channels = pd.DataFrame(channel_data)
        self.channel_data = channel_data

        self._print_df(self.df_channels, "Channels")
        return self.df_channels

    # --------------------------------------------------------
    # 4. FETCH COMMENTS
    # --------------------------------------------------------
    def fetch_comments(self, filename="comments.xlsx"):
        """Fetch top-level comments for each video and save them to *filename*.

        Only the first page (up to 100 threads) is requested per video. When
        the request fails, a single row with ``comment=None`` is recorded for
        that video and the reason is printed, so one bad video does not stop
        the run. Returns the comments DataFrame (also on ``self.df_comments``).
        """
        print("Fetching comments...")
        all_comments = []

        for video in self.video_data:
            vid = video["video_id"]

            try:
                response = self.youtube.commentThreads().list(
                    part="snippet",
                    videoId=vid,
                    maxResults=100,
                    textFormat="plainText",
                ).execute()
            except Exception as exc:  # best-effort: keep going, but say why
                print(f"Could not fetch comments for {vid}: {exc}")
                all_comments.append({"video_id": vid, "comment": None})
                continue

            for item in response.get("items", []):
                top = (
                    item.get("snippet", {})
                    .get("topLevelComment", {})
                    .get("snippet", {})
                )
                all_comments.append({
                    "video_id": vid,
                    "comment": top.get("textDisplay"),
                })

        # Explicit columns keep the sheet schema stable even with zero rows.
        self.df_comments = pd.DataFrame(all_comments, columns=["video_id", "comment"])
        self.df_comments.to_excel(filename, index=False)

        self._print_df(self.df_comments, "Comments")
        print("Saved:", filename)
        return self.df_comments

    # --------------------------------------------------------
    # 5. FETCH TRANSCRIPTS
    # --------------------------------------------------------
    @staticmethod
    def _fetch_transcript_text(video_id):
        """Return the transcript of *video_id* joined into one string.

        Uses the static ``get_transcript`` call when the installed
        youtube-transcript-api provides it, otherwise the instance-based
        ``fetch(...).to_raw_data()`` form.
        NOTE(review): confirm the fallback against the installed version.
        """
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            entries = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        return " ".join(entry["text"] for entry in entries)

    def fetch_transcripts(self, filename="transcripts.xlsx"):
        """Fetch the transcript of each video and save them to *filename*.

        Every video gets one row; ``transcript`` is ``None`` when it could not
        be retrieved, and the reason is printed. Returns the transcripts
        DataFrame (also on ``self.df_transcripts``).
        """
        print("Fetching transcripts...")
        transcript_rows = []

        for video in self.video_data:
            vid = video["video_id"]

            try:
                text = self._fetch_transcript_text(vid)
            except Exception as exc:  # best-effort: keep going, but say why
                print(f"Could not fetch transcript for {vid}: {exc}")
                text = None

            transcript_rows.append({"video_id": vid, "transcript": text})

        self.df_transcripts = pd.DataFrame(
            transcript_rows, columns=["video_id", "transcript"]
        )
        self.df_transcripts.to_excel(filename, index=False)

        self._print_df(self.df_transcripts, "Transcripts")
        print("Saved:", filename)
        return self.df_transcripts

    # --------------------------------------------------------
    # 6. SAVE VIDEO + CHANNEL EXCEL
    # --------------------------------------------------------
    def save_video_and_channel_excel(self, videos_filename="videos.xlsx",
                                     channels_filename="channels.xlsx"):
        """Write ``df_videos`` and ``df_channels`` to their own Excel files.

        Args:
            videos_filename: Destination for the videos sheet.
            channels_filename: Destination for the channels sheet.
        """
        print("Saving Videos & Channels to Excel...")
        self.df_videos.to_excel(videos_filename, index=False)
        self.df_channels.to_excel(channels_filename, index=False)
        print(f"Saved {videos_filename} and {channels_filename}")

    # --------------------------------------------------------
    # 7. RUN EVERYTHING
    # --------------------------------------------------------
    def run(self):
        """Run every stage in order and write the four Excel files."""
        self.search_videos()
        self.fetch_video_details()
        self.fetch_channel_details()

        self.save_video_and_channel_excel()
        self.fetch_comments("comments.xlsx")
        self.fetch_transcripts("transcripts.xlsx")

        print("Pipeline complete: 4 Excel files generated.")


Complete YouTube Data Pipeline


In [None]:
# Replace the placeholder with a real YouTube Data API v3 key before running.
API_KEY = "ADD YOUR API KEY PLEASE"
QUERY = "Data Science"

# Pass the QUERY variable; the quoted literal "QUERY" would search YouTube
# for the word "QUERY" instead of the configured search term.
scraper = YouTubeQueryScraper(api_key=API_KEY, query=QUERY)
scraper.run()

#scraper.fetch_comments("youtube_comments.xlsx")
#scraper.fetch_transcripts("transcripts.xlsx")



In [None]:
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

URL_REGEX = r"(https?://[^\s]+)"

# Characters that often follow a URL in running text but are not part of it.
_URL_TRAILING_PUNCTUATION = ".,;:!?)]}>\"'"


def extract_urls(text: str):
    """Return every http(s) URL found in *text*, in order of appearance.

    Trailing punctuation picked up from the surrounding prose (for example
    the ")." in "(https://example.com).") is stripped. Duplicates are kept.
    Returns an empty list when *text* is ``None`` or empty.
    """
    if not text:
        return []

    urls = []
    for raw in re.findall(URL_REGEX, text):
        cleaned = raw.rstrip(_URL_TRAILING_PUNCTUATION)
        # Drop matches that were nothing but a scheme plus punctuation.
        if cleaned.partition("://")[2]:
            urls.append(cleaned)
    return urls


def _get_link_classifier():
    """Return the text-generation pipeline, creating it on first use only."""
    cached = getattr(_get_link_classifier, "_pipe", None)
    if cached is None:
        cached = pipeline(
            "text-generation",
            model="facebook/opt-1.3b",   # lightweight free model
            max_new_tokens=150,
        )
        # Keep the loaded model on the function object so later calls reuse
        # it instead of loading the weights again.
        _get_link_classifier._pipe = cached
    return cached


def classify_links_with_llm(urls):
    """Ask the text-generation model to categorise *urls*.

    Builds a prompt listing each URL as a bullet and returns the pipeline's
    ``generated_text`` for it unchanged.

    Args:
        urls: Iterable of URL strings. ``None`` is treated as empty.

    Returns:
        The generated text, or ``""`` when there are no URLs. In that case
        the model is neither loaded nor called.
    """
    urls = list(urls or [])
    if not urls:
        return ""

    llm_input = (
        "Classify the following URLs into categories: "
        "linkedin, github, twitter, instagram, website, facebook, youtube, others.\n\n"
    ) + "".join(f"- {u}\n" for u in urls)

    return _get_link_classifier()(llm_input)[0]['generated_text']


def extract_pipeline(description):
    """Extract the URLs in *description* and have the LLM categorise them.

    Returns a ``(urls, result)`` tuple: the list produced by ``extract_urls``
    and whatever ``classify_links_with_llm`` returns for that list.
    """
    found_urls = extract_urls(description)
    return found_urls, classify_links_with_llm(found_urls)


# Example: run the extraction pipeline on a sample channel description
# containing three links.
desc = """Welcome to my channel!
Follow me:
LinkedIn: https://linkedin.com/in/data-karishma
GitHub: https://github.com/dataqueen
Website: https://karishma-ai.com
"""

# extract_pipeline returns (urls, result); print the extracted URL list
# first, then the text returned by the model.
urls, categorized = extract_pipeline(desc)
print(urls)
print(categorized)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Device set to use cpu
Both `max_new_tokens` (=150) and `max_length`(=21) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


['https://linkedin.com/in/data-karishma', 'https://github.com/dataqueen', 'https://karishma-ai.com']
Classify the following URLs into categories: linkedin, github, twitter, instagram, website, facebook, youtube, others.

- https://linkedin.com/in/data-karishma
- https://github.com/dataqueen
- https://karishma-ai.com
- https://twitter.com/dataqueen
- https://instagram.com/dataqueen
- https://www.linkedin.com/in/data-karishma
- https://www.facebook.com/dataqueen
- https://www.instagram.com/dataqueen

- https://twitter.com/dataqueen
- https://www.linkedin.com/in/data-karishma
- https://www.twitter.com/dataqueen
- https://www.linkedin.com/in/data-karishma
- https://www.twitter.com/dataqueen
- https://www.facebook.com
