In [7]:
import pandas as pd
import time
from datetime import datetime
from tqdm.notebook import tqdm  # progress bar in Jupyter

import os, json, re
from pathlib import Path

import nest_asyncio, asyncio
from twscrape import API
from twscrape import gather

## Scraping basic video information in X

Start with a simple output format:

twitterlink | # likes | # shares

In [2]:
def load_cookies_to_string(path: str) -> str:
    """
    Read a browser cookie export and flatten it into one cookie string.

    Accepts Netscape 'cookies.txt' or a JSON list of cookies (optionally
    wrapped as {"cookies": [...]}).

    Args:
        path: Location of the cookie file.

    Returns:
        A single 'name=value; name2=value2; ...' cookie string.

    Raises:
        FileNotFoundError: If `path` does not exist.
        ValueError: If no cookie could be parsed from the file (either format).
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Cookie file not found: {p.resolve()}")

    # 'utf-8-sig' transparently drops a leading BOM, which would otherwise
    # hide the '[' / '{' that the format heuristic below looks for.
    text = p.read_text(encoding="utf-8-sig").strip()
    pairs = []

    # Heuristic: JSON or Netscape?
    if text.startswith("[") or text.startswith("{"):
        # JSON format (list of dicts with 'name' and 'value')
        data = json.loads(text)
        # Some exporters wrap in {"cookies": [...]}:
        if isinstance(data, dict) and "cookies" in data:
            data = data["cookies"]
        for c in data:
            # Skip anything that is not a cookie record instead of crashing on .get()
            if not isinstance(c, dict):
                continue
            name = c.get("name")
            value = c.get("value")
            if name and value is not None:
                pairs.append(f"{name}={value}")
    else:
        # Netscape cookies.txt format (tab-separated columns)
        http_only_prefix = "#HttpOnly_"
        for line in text.splitlines():
            line = line.strip()
            # HttpOnly cookies are written as '#HttpOnly_<domain>\t...'. They are
            # real cookie rows, not comments, so unwrap them before the comment check.
            if line.startswith(http_only_prefix):
                line = line[len(http_only_prefix):]
            if not line or line.startswith("#"):
                continue
            parts = line.split("\t")
            # Netscape spec: domain, flag, path, secure, expiration, name, value
            if len(parts) >= 7:
                name, value = parts[-2], parts[-1]
                if name:
                    pairs.append(f"{name}={value}")

    # Same failure mode for both formats: an empty result is never useful downstream.
    if not pairs:
        raise ValueError("No cookies parsed. Make sure the file is for x.com and not empty.")
    return "; ".join(pairs)


In [4]:
# Later cells call asyncio.run(...) from inside the notebook; nest_asyncio is applied
# first, presumably so that works while the kernel's own event loop is running — confirm.
nest_asyncio.apply()

In [5]:
async def init_api(cookies_path="cookies.txt", db_path="accounts.db", label="my_cookie_account"):
    """
    Create a twscrape API handle whose only account is backed by exported cookies.

    Args:
        cookies_path: Cookie export read via load_cookies_to_string.
        db_path: Path handed to API(...); twscrape keeps its accounts there.
        label: Username under which the cookie-backed account is stored.

    Returns:
        The API object, after add_account and login_all have been awaited.

    Raises:
        FileNotFoundError, ValueError: Propagated from load_cookies_to_string.
    """
    import warnings  # local import: only needed for the best-effort cleanup below

    cookie_str = load_cookies_to_string(cookies_path)
    api = API(db_path)  # creates/uses a local sqlite db for sessions

    # Drop any previous account stored under this label so the fresh cookies are
    # actually used (keeps this idempotent across notebook re-runs).
    # NOTE(review): the pool method is `delete_accounts` (plural) as far as I can
    # tell; the earlier `delete_account` call raised inside a blanket
    # `except: pass`, so nothing was ever removed. Confirm against the installed
    # twscrape version.
    try:
        await api.pool.delete_accounts(label)
    except Exception as exc:
        # Still best-effort, but no longer silent.
        warnings.warn(f"Could not remove existing account {label!r}: {exc!r}")

    # username/password placeholders are fine when cookies are provided
    await api.pool.add_account(label, "-", "-", "-", cookies=cookie_str)
    await api.pool.login_all()  # validate/refresh session
    return api

In [None]:
# Build the API handle from the exported cookie file by running init_api to completion.
api = asyncio.run(init_api("cookies.txt"))  # or "cookies.json" if that's what you exported
api  # bare last expression: shown as the cell output

In [29]:
# Customize your search here.
# The query is written with X's web-search operators (`filter:` / `-filter:`).
# The previous `has:videos -is:retweet -is:reply` form is API v2 filter syntax,
# which the web search endpoint is not expected to honour (confirm with a manual
# search on x.com).
QUERY = "filter:videos -filter:retweets -filter:replies lang:en"
LIMIT = 5                        # maximum number of tweets requested from the search
DELAY = 2                        # pause length in seconds
PRODUCT = "Media"                # forwarded to the search as kv={"product": PRODUCT}
OUT_CSV = "videos_twscrape.csv"  # destination file for the collected rows

def iter_media(media_obj):
    """
    Generate individual media items from whatever shape `media_obj` has.

    Resolution order:
      1. falsy input            -> yields nothing
      2. truthy `.all` attribute -> yields each element of `.all`
      3. non-string iterable     -> yields each element of the object itself
      4. anything else           -> yields the object as a single item
    """
    if not media_obj:
        return

    # A populated `.all` wins whether or not the wrapper is itself iterable.
    members = getattr(media_obj, "all", None)
    if members:
        yield from members
    elif hasattr(media_obj, "__iter__") and not isinstance(media_obj, (str, bytes)):
        # Plain collection (list/tuple/etc.)
        yield from media_obj
    else:
        # Lone media object (strings/bytes are treated as a single item too)
        yield media_obj

def has_video_media(tweet):
    """
    Report whether `tweet` carries a video or an animated GIF.

    Two media layouts are recognised:
      * a container with non-empty `videos` or `animated` attributes
        (the layout twscrape's media object appears to use — confirm against
        the installed version); such items carry no `type` field, so the
        per-item check alone never matched them;
      * items (as produced by iter_media) whose `type` is "video" or
        "animated_gif".

    Args:
        tweet: Any object; its `media` attribute is inspected if present.

    Returns:
        True if a video/animated item is found, otherwise False.
    """
    media = getattr(tweet, "media", None)
    if not media:
        return False

    # Container layout: separate lists per media kind.
    if getattr(media, "videos", None) or getattr(media, "animated", None):
        return True

    # Item layout: each media item is tagged with its type.
    return any(
        getattr(m, "type", "") in ("video", "animated_gif")
        for m in iter_media(media)
    )

async def search_videos_to_csv(api, query=QUERY, limit=LIMIT, product=PRODUCT, out_csv=OUT_CSV):
    """
    Search for tweets, keep those with video media, and save them to a CSV file.

    Args:
        api: Object exposing `search(query, limit=..., kv=...)` (the twscrape API here).
        query: Search string.
        limit: Maximum number of tweets requested.
        product: Forwarded to the search as kv={"product": product}.
        out_csv: Path of the CSV file to write.

    Returns:
        None. Prints a one-line status; `out_csv` is written only when at least
        one tweet passed the video filter.
    """
    tweets = await gather(api.search(query, limit=limit, kv={"product": product}))

    rows = []
    for tw in tweets:
        if not has_video_media(tw):
            continue

        # The tweet text is expected under `rawContent` in twscrape; fall back to
        # `content` for objects that use that name. NOTE(review): confirm the
        # field name against the installed twscrape version.
        text = getattr(tw, "rawContent", None)
        if text is None:
            text = getattr(tw, "content", None)

        rows.append({
            "twitterlink": f"https://x.com/{tw.user.username}/status/{tw.id}",
            "username": tw.user.username,
            "content": text,
            "date": tw.date,
            "likes": tw.likeCount,
            "shares": tw.retweetCount,
            "replies": tw.replyCount,
            "tweet_id": tw.id,  # handy for dedup
        })
        # No per-row sleep: every tweet was already fetched by gather() above, so a
        # pause here throttles nothing, and time.sleep() would block the event loop.

    df = pd.DataFrame(rows)
    if df.empty:
        print("No matching tweets with video found.")
        return

    df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"Saved {len(df)} rows to {out_csv}")

In [30]:
# search_videos_to_csv is an `async def`, so this call only creates a coroutine object;
# nothing is fetched until it is run. A coroutine can be run once, so re-execute this
# cell before running it again.
awaitable = search_videos_to_csv(api)

In [31]:
# Run the search coroutine to completion; it prints its own status line.
asyncio.run(awaitable)

No matching tweets with video found.


## The approach above does not work
#### snscrape library: outdated for Twitter posts
#### twscrape library: could not get it to work

## Decided to collect video posts manually:

#### Procedure
1. Look for video posts

2. Download the video through `TwitterVideoDownloader`

3. Extract the audio through `Restream` (has an hourly limit)

4. Record links, captions, likes, shares, post_time, collect_time, and transcription in `twitterVideo.csv`

5. Video/audio files are uploaded to Google Drive

6. Transcription files are stored in the transcript folder


Transcription extraction is based on Whisper from OpenAI:

- Using `TurboScribe` (limited number of free uses per day)
    - Settings: {Max Words Per Segment: 8, Max Duration Per Segment (Seconds): 10, Max Characters Per Segment: 80}
    - in VTT format

- Send it to Whisper using the API (NOT FREE)