<a href="https://colab.research.google.com/github/AnshulAgrvl/Tesla_PMM/blob/main/Tesla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install pandas requests

import requests
import pandas as pd
from urllib.parse import urlparse, parse_qs
from google.colab import files


In [None]:
import os

# SECURITY NOTE(review): the literal below is an API key committed in plain
# text. It is kept only as a fallback so existing runs keep working; rotate it
# and supply the real key through the YOUTUBE_API_KEY environment variable.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY") or "AIzaSyA6n9oZr5oqwLzKN-jf6oe0w3ePzjy-t-k"

# Root of the YouTube Data API v3 REST endpoints; endpoint names are appended.
BASE = "https://www.googleapis.com/youtube/v3"

# Videos to pull metadata and comments for.
URLS = [
    "https://www.youtube.com/watch?v=rJgffrenFCU&t=187s",
    "https://www.youtube.com/watch?v=b6B7muYkuXM",
    "https://www.youtube.com/watch?v=PBC4kkjS3P0",
    "https://www.youtube.com/watch?v=6ltU9q1pKKM",
]

# Assumption (edit if needed): first 2 are fanboy, last 2 are critical
# LABELS is positional: LABELS[i] describes URLS[i].
LABELS = ["fanboy", "fanboy", "critical", "critical"]

# The two lists are paired by position, so a length mismatch would silently
# drop videos or labels; fail loudly instead.
if len(LABELS) != len(URLS):
    raise ValueError(
        f"LABELS has {len(LABELS)} entries but URLS has {len(URLS)}; they must match."
    )

BRAND = "Tesla"
MIN_COMMENTS_PER_VIDEO = 120  # assignment needs >100, keep buffer


In [None]:
def extract_video_id(url: str) -> "str | None":
    """Return the YouTube video id contained in *url*, or None if there is none.

    Recognised forms:
      * ``https://youtu.be/<id>`` (with or without a ``www.`` prefix)
      * any URL carrying a non-empty ``v=<id>`` query parameter
      * ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` paths

    Args:
        url: The video URL to inspect.

    Returns:
        The video id string, or None when no id can be found. An empty id is
        never returned, so callers can rely on an ``is None`` check.
    """
    parsed = urlparse(url)
    host = (parsed.hostname or "").lower().removeprefix("www.")
    segments = [part for part in parsed.path.split("/") if part]

    if host == "youtu.be":
        # Short links keep the id as the first path segment.
        return segments[0] if segments else None

    # parse_qs drops blank values, so "v=" yields no entry here.
    ids = parse_qs(parsed.query).get("v")
    if ids:
        return ids[0]

    # Path-style URLs: /shorts/<id>, /embed/<id>, ...
    if len(segments) >= 2 and segments[0] in {"shorts", "embed", "live", "v"}:
        return segments[1]

    return None

def yt_get(endpoint: str, params: dict) -> dict:
    """Issue a GET against ``BASE/<endpoint>`` and return the decoded JSON.

    The caller's *params* are copied and the API key is added to the copy, so
    the dict passed in is never modified.

    Args:
        endpoint: Path segment appended to BASE (e.g. "videos").
        params: Query parameters for the request.

    Returns:
        The parsed JSON response body.

    Raises:
        requests.HTTPError: When the response has an error status
            (raised by ``raise_for_status``).
    """
    query = dict(params, key=YOUTUBE_API_KEY)
    response = requests.get(f"{BASE}/{endpoint}", params=query, timeout=30)
    response.raise_for_status()
    return response.json()

def _stat_as_int(value):
    """Convert a statistics value to int; return None when it is missing/empty."""
    return int(value) if value else None


def get_video_details(video_id: str) -> dict:
    """Fetch title, channel and headline statistics for one video.

    Every returned dict has the same keys, whether or not the API knows the
    video, so rows built from these dicts always share one schema. When the
    API returns no item for *video_id*, all fields other than ``video_id``
    and ``brand`` are None.

    Args:
        video_id: The YouTube video id to look up.

    Returns:
        A dict with keys ``video_id``, ``brand``, ``title``, ``channel_title``,
        ``published_at``, ``view_count``, ``like_count`` and
        ``comment_count_reported``. Counts are ints, or None when the
        statistic is absent from the response.
    """
    details = {
        "video_id": video_id,
        "brand": BRAND,
        "title": None,
        "channel_title": None,
        "published_at": None,
        "view_count": None,
        "like_count": None,
        "comment_count_reported": None,
    }

    data = yt_get("videos", {"part": "snippet,statistics", "id": video_id})
    items = data.get("items")
    if not items:
        # Unknown video: keep the placeholder row rather than a partial dict.
        return details

    item = items[0]
    snippet = item.get("snippet", {})
    stats = item.get("statistics", {})

    details.update(
        title=snippet.get("title"),
        channel_title=snippet.get("channelTitle"),
        published_at=snippet.get("publishedAt"),
        view_count=_stat_as_int(stats.get("viewCount")),
        like_count=_stat_as_int(stats.get("likeCount")),
        comment_count_reported=_stat_as_int(stats.get("commentCount")),
    )
    return details

def fetch_replies(parent_comment_id: str) -> list[dict]:
    """Collect every reply under one top-level comment, following pagination.

    Pages are requested from the "comments" endpoint (100 per page, plain
    text) until the response carries no ``nextPageToken``.

    Args:
        parent_comment_id: Id of the top-level comment whose replies are wanted.

    Returns:
        One dict per reply with keys ``comment_id``, ``parent_comment_id``,
        ``comment_type`` (always "reply"), ``author``, ``text``,
        ``like_count``, ``published_at`` and ``updated_at``.
    """

    def as_row(reply: dict) -> dict:
        # Flatten one API reply item into the row layout used for comments.
        snippet = reply.get("snippet", {})
        return {
            "comment_id": reply.get("id"),
            "parent_comment_id": parent_comment_id,
            "comment_type": "reply",
            "author": snippet.get("authorDisplayName"),
            "text": snippet.get("textDisplay"),
            "like_count": snippet.get("likeCount"),
            "published_at": snippet.get("publishedAt"),
            "updated_at": snippet.get("updatedAt"),
        }

    base_query = {
        "part": "snippet",
        "parentId": parent_comment_id,
        "maxResults": 100,
        "textFormat": "plainText",
    }

    collected = []
    next_token = None
    while True:
        # Only send pageToken once the API has handed one back.
        if next_token:
            query = {**base_query, "pageToken": next_token}
        else:
            query = dict(base_query)

        payload = yt_get("comments", query)
        collected.extend(as_row(reply) for reply in payload.get("items", []))

        next_token = payload.get("nextPageToken")
        if not next_token:
            return collected

def fetch_comments_with_replies(video_id: str, min_total: int = 120) -> pd.DataFrame:
    """Pull top-level comments (newest first) plus their replies for a video.

    Comment threads are paged through until at least *min_total* records have
    been collected or the API has no further pages. The count is checked after
    each thread (top-level comment + all its replies), so the result may
    slightly exceed *min_total*.

    Args:
        video_id: The YouTube video id whose comments are fetched.
        min_total: Stop once this many records (comments + replies) are held.

    Returns:
        A DataFrame, de-duplicated on ``comment_id``, with columns
        ``comment_id``, ``parent_comment_id``, ``comment_type``, ``author``,
        ``text``, ``like_count``, ``published_at`` and ``updated_at``. When
        nothing was collected the frame is empty but still has these columns.
    """
    columns = [
        "comment_id",
        "parent_comment_id",
        "comment_type",
        "author",
        "text",
        "like_count",
        "published_at",
        "updated_at",
    ]
    rows = []
    page_token = None

    while len(rows) < min_total:
        params = {
            "part": "snippet",
            "videoId": video_id,
            "maxResults": 100,
            "textFormat": "plainText",
            "order": "time",
        }
        if page_token:
            params["pageToken"] = page_token

        data = yt_get("commentThreads", params)

        for it in data.get("items", []):
            top = it["snippet"]["topLevelComment"]["snippet"]
            top_id = it["snippet"]["topLevelComment"]["id"]
            reply_count = it["snippet"].get("totalReplyCount", 0)

            rows.append({
                "comment_id": top_id,
                "parent_comment_id": None,
                "comment_type": "top",
                "author": top.get("authorDisplayName"),
                "text": top.get("textDisplay"),
                "like_count": top.get("likeCount"),
                "published_at": top.get("publishedAt"),
                "updated_at": top.get("updatedAt"),
            })

            # Pull replies so your total matches YouTube's "commentCount" more closely
            if reply_count and reply_count > 0:
                # Best effort: a failed reply fetch must not lose the comments
                # already gathered, so report it and carry on.
                try:
                    rows.extend(fetch_replies(top_id))
                except Exception as e:
                    print(f"Reply fetch failed for parent {top_id}: {e}")

            if len(rows) >= min_total:
                break

        page_token = data.get("nextPageToken")
        if not page_token:
            break

    # Passing the columns explicitly keeps "comment_id" present even when no
    # rows were collected; without it drop_duplicates raises KeyError.
    df = pd.DataFrame(rows, columns=columns).drop_duplicates(subset=["comment_id"])
    return df


In [None]:
# Driver: resolve ids, pull metadata + comments per video, then show and export.
video_ids = [extract_video_id(u) for u in URLS]
# An empty id is as unusable as a missing one, so reject any falsy value.
if any(not v for v in video_ids):
    raise ValueError("One of the URLs did not contain a valid video id.")
# zip() below would silently drop unmatched entries; fail loudly instead.
if len(LABELS) != len(video_ids):
    raise ValueError(
        f"LABELS has {len(LABELS)} entries but there are {len(video_ids)} videos."
    )

videos = []
all_comments = []

for vid, label in zip(video_ids, LABELS):
    print("\nProcessing video:", vid, "label:", label)

    meta = get_video_details(vid)
    meta["label"] = label
    videos.append(meta)

    # A failed comment pull for one video must not stop the remaining videos.
    try:
        cdf = fetch_comments_with_replies(vid, min_total=MIN_COMMENTS_PER_VIDEO)
        cdf["video_id"] = vid
        cdf["brand"] = BRAND
        cdf["label"] = label
        all_comments.append(cdf)
        print("Pulled comment records:", len(cdf))
    except Exception as e:
        print("Comment pull failed for", vid, "Error:", e)

videos_df = pd.DataFrame(videos)
comments_df = pd.concat(all_comments, ignore_index=True) if all_comments else pd.DataFrame()

# Show data (so your output is clearly "data")
print("\nVIDEO METADATA SAMPLE")
display(videos_df.head())

print("\nCOMMENTS SAMPLE")
display(comments_df.head())

# Check requirement: >100 comments per video
print("\nCOMMENT COUNTS PER VIDEO (your grading check)")
if "video_id" in comments_df.columns:
    display(comments_df.groupby("video_id")["comment_id"].count().reset_index(name="pulled_comment_records"))
else:
    # Every pull failed, so the fallback frame has no columns to group on.
    print("No comment records were pulled for any video.")

# Save CSVs
videos_df.to_csv("tesla_videos.csv", index=False)
comments_df.to_csv("tesla_comments.csv", index=False)

print("\nSaved: tesla_videos.csv and tesla_comments.csv")

# Download files to your laptop
files.download("tesla_videos.csv")
files.download("tesla_comments.csv")



Processing video: rJgffrenFCU label: fanboy
Pulled comment records: 120

Processing video: b6B7muYkuXM label: fanboy
Pulled comment records: 121

Processing video: PBC4kkjS3P0 label: critical
Pulled comment records: 120

Processing video: 6ltU9q1pKKM label: critical
Pulled comment records: 120

VIDEO METADATA SAMPLE


Unnamed: 0,video_id,brand,title,channel_title,published_at,view_count,like_count,comment_count_reported,label
0,rJgffrenFCU,Tesla,10 Ways Tesla Makes Life Easier Than Other Cars,Vegas Tesla Carmen,2025-09-09T19:00:57Z,203512,3364,263,fanboy
1,b6B7muYkuXM,Tesla,On The Fence Buying a Tesla? Watch This First!,Jeremiah Jones,2025-03-07T17:00:47Z,393576,7139,708,fanboy
2,PBC4kkjS3P0,Tesla,The New Teslas Are a Bad Deal,WVFRM Podcast,2025-10-10T14:14:38Z,232535,5256,833,critical
3,6ltU9q1pKKM,Tesla,The Lie So Dangerous Tesla Engineers Are Quitting,More Perfect Union,2025-09-04T19:14:41Z,1664409,63171,10716,critical



COMMENTS SAMPLE


Unnamed: 0,comment_id,parent_comment_id,comment_type,author,text,like_count,published_at,updated_at,video_id,brand,label
0,UgyP-XFboanu53VIKe54AaABAg,,top,@vegasteslacarmen,WTF 🤬 sorry guys - a lot of my B roll videos e...,26,2025-09-09T22:33:11Z,2025-09-09T22:52:20Z,rJgffrenFCU,Tesla,fanboy
1,UgyP-XFboanu53VIKe54AaABAg.AMs-fmWkkDjAMsC3_CL-g8,UgyP-XFboanu53VIKe54AaABAg,reply,@stevepriority4219,"It's ok, it brought back some good PowerPoint ...",1,2025-09-10T00:21:26Z,2025-09-10T00:21:26Z,rJgffrenFCU,Tesla,fanboy
2,UgyP-XFboanu53VIKe54AaABAg.AMs-fmWkkDjAMsXCLkHaSQ,UgyP-XFboanu53VIKe54AaABAg,reply,@SRTNicky,VLLO would NEVER,0,2025-09-10T03:26:07Z,2025-09-10T03:26:07Z,rJgffrenFCU,Tesla,fanboy
3,UgyP-XFboanu53VIKe54AaABAg.AMs-fmWkkDjAMsmvlXQOZT,UgyP-XFboanu53VIKe54AaABAg,reply,@NotTheOneToBeMessedWith,Maybe try DaVinci Resolve. its free. And maybe...,0,2025-09-10T05:52:16Z,2025-09-10T05:52:16Z,rJgffrenFCU,Tesla,fanboy
4,UgyP-XFboanu53VIKe54AaABAg.AMs-fmWkkDjAMssOYm-hAt,UgyP-XFboanu53VIKe54AaABAg,reply,@vegasteslacarmen,@NotTheOneToBeMessedWith I always check before...,1,2025-09-10T06:40:02Z,2025-09-10T06:40:02Z,rJgffrenFCU,Tesla,fanboy



COMMENT COUNTS PER VIDEO (your grading check)


Unnamed: 0,video_id,pulled_comment_records
0,6ltU9q1pKKM,120
1,PBC4kkjS3P0,120
2,b6B7muYkuXM,121
3,rJgffrenFCU,120



Saved: tesla_videos.csv and tesla_comments.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip -q install eventregistry pandas

from eventregistry import EventRegistry, QueryArticlesIter, ReturnInfo, ArticleInfoFlags
import pandas as pd
from google.colab import files

import os

# SECURITY NOTE(review): the literal below is an API key committed in plain
# text. It is kept only as a fallback so existing runs keep working; rotate it
# and supply the real key through the ER_API_KEY environment variable.
ER_API_KEY = os.environ.get("ER_API_KEY") or "0ccdcae4-dfcb-4ffd-b320-f8d728a6e01f"   # from newsapi.ai dashboard
BRAND = "Tesla"

er = EventRegistry(apiKey=ER_API_KEY, allowUseOfArchive=False)

q = QueryArticlesIter(
    keywords=BRAND,
    lang="eng",              # English only
    dataType=["news"]
)

# Column layout of the exported table; also used so an empty result still has
# the columns selected for display below.
news_columns = [
    "brand",
    "article_uri",
    "title",
    "published_at",
    "source",
    "url",
    "lang",
    "body_text",
]

rows = []
max_items = 20

for art in q.execQuery(
    er,
    sortBy="date",
    maxItems=max_items,
    returnInfo=ReturnInfo(articleInfo=ArticleInfoFlags(bodyLen=3000))  # increase if you want more text
):
    # Safety filter (should already be eng because of lang="eng")
    if art.get("lang") != "eng":
        continue

    # "source" may be a dict carrying a title, or some other value used as-is.
    source = art.get("source")
    source_title = source.get("title") if isinstance(source, dict) else source

    rows.append({
        "brand": BRAND,
        "article_uri": art.get("uri"),
        "title": art.get("title"),
        "published_at": art.get("dateTime") or art.get("date"),
        "source": source_title,
        "url": art.get("url"),
        "lang": art.get("lang"),
        "body_text": art.get("body")   # this is the article text (often an excerpt)
    })

# Explicit columns keep the selections below valid when no article came back.
news_df = pd.DataFrame(rows, columns=news_columns)

print("English Tesla articles pulled:", len(news_df))
display(news_df[["title", "published_at", "source", "lang"]].head(10))
display(news_df[["body_text"]].head(2))

news_df.to_csv("tesla_news.csv", index=False)
print("Saved: tesla_news.csv")
files.download("tesla_news.csv")


English Tesla articles pulled: 20


Unnamed: 0,title,published_at,source,lang
0,Mazda unveils the new CX-6e,2026-01-12T12:57:41Z,GameReactor,eng
1,Tesla seeks engineer to make its iOS Robotaxi ...,2026-01-12T12:52:42Z,TESLARATI,eng
2,"China, EU agree on steps to resolve their disp...",2026-01-12T12:41:38Z,cnbctv18.com,eng
3,Here's how much Tesla stock insiders sold in 2026,2026-01-12T12:36:45Z,Finbold,eng
4,Security and AI news from the week beginning 5...,2026-01-12T12:34:58Z,Enterprise Times,eng
5,"EU, China agree on steps to resolve tariffs an...",2026-01-12T12:28:39Z,Business Standard,eng
6,The Boar,2026-01-12T12:28:18Z,The Boar,eng
7,"Restored Republic via a GCR as of January 12, ...",2026-01-12T12:28:10Z,Operation Disclosure Official,eng
8,Canada broke its electric vehicle market in 20...,2026-01-12T12:27:39Z,geekfence.com,eng
9,LUMKA MLAMBO AND KETSO GORDHAN: SA's tech futu...,2026-01-12T12:26:03Z,Business Day,eng


Unnamed: 0,body_text
0,It would be a bit of understatement to say tha...
1,It appears that Tesla is hard at work in ensur...


Saved: tesla_news.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>