# Load the general questions dataset #

In [1]:
import pandas as pd

# Read the "general questions" export; the relative path is resolved against the current working directory.
general_csv_path = "../../data/alexlab-storage-eu24-tk0__2024-09-16T08_59_07.939920__general.csv"
df = pd.read_csv(general_csv_path)

Verify that the data was loaded correctly

In [None]:
df.head()

In [None]:
df.info()

Keep only the rows that have no null values in any column

In [4]:
# Drop every row that has at least one missing value (the pandas defaults, spelled out).
df = df.dropna(axis=0, how="any")

In [None]:
df.info()

Keep only the media whose `languages` value is exactly `{'en'}`, i.e. English-only entries; everything else is dropped

In [6]:
# Exact string match: rows whose "languages" value lists anything besides 'en' are excluded as well.
df_en = df.loc[df["languages"].eq("{'en'}")]

In [None]:
df_en.info()

Fill the transcripts folder: download the subtitle files of each video into `./transcripts/<video_id>/`

In [8]:
import json
import math
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Callable  # Add this import
import requests


def group_by(lst: list, key_extractor: Callable):
    """Bucket the items of ``lst`` by the key that ``key_extractor`` returns for them.

    Args:
        lst: Items to group; they are visited in order.
        key_extractor: Called once per item; its result (which must be hashable)
            is used as the bucket key.

    Returns:
        A ``defaultdict(list)`` mapping each key to the items that produced it,
        in their original relative order.
    """
    buckets = defaultdict(list)
    for element in lst:
        bucket_key = key_extractor(element)
        buckets[bucket_key].append(element)
    return buckets


def get_transcripts_for_tiktok_video(video_id: str, transcripts_dir: Path, timeout: float = 30.0):
    """Download the subtitle files of one TikTok video into ``transcripts_dir/<video_id>/``.

    The video page is fetched, the JSON stored under ``"__DEFAULT_SCOPE__"`` is decoded,
    and its ``subtitleInfos`` entries are grouped by ``Format``. For every format, the
    entry whose ``LanguageCodeName`` ranks best in the language priority list is
    downloaded and saved as ``<video_id>_<format>_<language>_<source>[.vtt|.json]``.

    Args:
        video_id: TikTok video identifier (anything that formats to the numeric id).
        transcripts_dir: Root folder for the downloads; created if it does not exist.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        ``None`` if the page could not be fetched or its video details could not be read.
        Otherwise the text of the last subtitle file that was downloaded successfully,
        or ``""`` if none was (including when the video lists no subtitles).
    """
    video_url = f"https://www.tiktok.com/@unknown/video/{video_id}"
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0"
    headers = {
        "User-Agent": user_agent,
        "Referer": "https://www.tiktok.com/",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Upgrade-Insecure-Requests": "1"
    }

    print(f"Fetching video URL: {video_url}")
    try:
        # Without a timeout a stalled connection would block the whole download loop.
        response = requests.get(video_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching video URL: {e}")
        return None

    html_content = response.text

    # Locate the JSON value that follows the "__DEFAULT_SCOPE__" key in the page.
    marker = re.search(r'"__DEFAULT_SCOPE__":\s*', html_content)
    if not marker:
        print("JSON data not found in the HTML content.")
        return None

    try:
        # raw_decode stops exactly where the JSON value ends, so nothing has to be
        # trimmed by hand and a '<' inside a JSON string cannot truncate the payload.
        json_data, _ = json.JSONDecoder().raw_decode(html_content, marker.end())
        video_info = json_data["webapp.video-detail"]["itemInfo"]["itemStruct"]["video"]
        transcripts_infos = video_info.get("subtitleInfos") or []
    except (ValueError, KeyError, TypeError, AttributeError) as e:
        # One unreadable page must not abort the processing of the other videos.
        print(f"Could not read video details for video {video_id}: {e!r}")
        return None

    language_code_priority = [
        "eng-US",
        "fra-FR",
        "deu-DE",
        "spa-ES",
    ]

    def priority_rank(info):
        """Position of the entry's language in the priority list; unlisted languages rank last."""
        code = info["LanguageCodeName"]
        return language_code_priority.index(code) if code in language_code_priority else math.inf

    subtitle_infos_by_format = group_by(transcripts_infos, lambda info: info["Format"])
    video_dir = Path(transcripts_dir) / str(video_id)

    captions = ""

    for subtitle_format, infos_list in subtitle_infos_by_format.items():
        # Choose among the entries of THIS format only; ties keep the first entry listed.
        transcripts_info = min(infos_list, key=priority_rank)
        url = transcripts_info["Url"]
        language = transcripts_info["LanguageCodeName"]
        source = transcripts_info["Source"]

        suffix = "vtt" if subtitle_format == "webvtt" else "json" if subtitle_format == "creator_caption" else None

        filename = f"{video_id}_{subtitle_format}_{language}_{source}"
        if suffix:
            filename += f".{suffix}"
        try:
            file_response = requests.get(url, headers=headers, timeout=timeout)
            file_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Failed to download transcripts for video {video_id}, language {language}: {e}")
            continue

        # The returned text is overwritten on each iteration: the last downloaded format wins.
        captions = file_response.text

        # Save the raw file to disk; parents=True also creates a missing transcripts_dir.
        video_dir.mkdir(parents=True, exist_ok=True)
        output_path = video_dir / filename
        output_path.write_bytes(file_response.content)
        print(f"Saved file: {output_path}")

    return captions

def add_captions_to_dataframe(df, transcripts_dir=Path('./transcripts')):
    """Download the subtitle files for every video listed in ``df``.

    Despite its name, this function does not add a column. It only fills the
    transcripts folder by calling ``get_transcripts_for_tiktok_video`` once per
    ``video_id``; the value that call returns is ignored.

    Args:
        df: pandas DataFrame with a column named "video_id".
        transcripts_dir: Folder that receives one sub-folder per video. It is
            created (with its parents) if it does not exist yet.

    Returns:
        The same DataFrame object, unchanged.
    """
    transcripts_dir = Path(transcripts_dir)
    # The per-video download writes below this folder, so make sure it exists first.
    transcripts_dir.mkdir(parents=True, exist_ok=True)

    for video_id in df['video_id']:
        get_transcripts_for_tiktok_video(video_id, transcripts_dir)

    return df

In [None]:
# Try the download on a small sample first: a copy of the first four English-only rows.
df_test = add_captions_to_dataframe(df_en.head(4).copy())

Add the "captions" column: read each downloaded VTT file and keep only its text

In [10]:
import os
from pathlib import Path

def _vtt_to_text(vtt_lines):
    """Join the cue text of a WebVTT file into one space-separated line."""
    text_parts = []
    inside_cue = False
    for raw_line in vtt_lines:
        line = raw_line.strip()
        if not line:
            # A blank line ends the current block.
            inside_cue = False
        elif '-->' in line:
            # Timing line: the lines that follow, up to the next blank line, are cue text.
            inside_cue = True
        elif inside_cue:
            text_parts.append(line)
        # Anything else (the WEBVTT header, cue identifiers, note blocks) is skipped.
    return ' '.join(text_parts)


def add_captions_to_df_vtt(df, transcripts_dir=Path('./transcripts')):
    """Add a "captions" column holding the plain text of each video's VTT file.

    For every ``video_id`` the folder ``transcripts_dir/<video_id>/`` is searched for
    files named ``<video_id>_webvtt_*.vtt``. The first match in sorted order is read
    and reduced to its cue text on a single line. Videos with no VTT file, an empty
    file, or a file that cannot be read get an empty string.

    Args:
        df: pandas DataFrame with a column named "video_id".
        transcripts_dir: Folder containing one sub-folder per video.

    Returns:
        The same DataFrame object, with the "captions" column assigned in place.
    """
    transcripts_dir = Path(transcripts_dir)
    captions = []

    for video_id in df['video_id']:
        # The language/source part of the file name varies, hence the wildcard.
        # Sorting makes the choice deterministic if several files match.
        vtt_files = sorted((transcripts_dir / str(video_id)).glob(f'{video_id}_webvtt_*.vtt'))

        if not vtt_files:
            print(f"Warning: No VTT files found for video {video_id}.")
            captions.append("")
            continue

        vtt_file = vtt_files[0]

        try:
            if vtt_file.stat().st_size == 0:
                print(f"Warning: VTT file for video {video_id} is empty.")
                captions.append("")
                continue

            with open(vtt_file, 'r', encoding='utf-8') as file:
                captions_text = _vtt_to_text(file)
            captions.append(captions_text)

        except (OSError, UnicodeDecodeError) as e:
            # Keep one entry per row so the column stays aligned with the DataFrame.
            print(f"Unexpected error for video {video_id}: {e}")
            captions.append("")

    df['captions'] = captions
    return df


In [None]:
# Fill the "captions" column of the sample from the downloaded VTT files, then preview it.
df_test = add_captions_to_df_vtt(df_test)
df_test.head()

In [None]:
# Discard the sample rows whose caption text is the empty string.
df_test = df_test.loc[df_test["captions"].ne("")]
df_test.head()