https://www.ft.dk/aktuelt/webtv/video/20222/salen/5.aspx?from=28-11-2022&to=19-12-2022&selectedMeetingType=Salen&committee=&as=1#player


In [17]:
import requests # Downloading webpages
from bs4 import BeautifulSoup # Extracting data from html files
import json
import pandas as pd
import re
from datetime import datetime
from moviepy.editor import VideoFileClip
from pandas import DataFrame
from requests import Response

In [40]:
def extract_transcript(req: Response) -> pd.DataFrame:
    """
    Extract the per-speech transcript from a fetched page.

    The HTML in ``req.text`` is scanned for ``<div class="video-item-referat">``
    elements; each one is treated as a single speech. For every speech the
    speaker name, the concatenated transcript text and an ID are collected
    into a pandas DataFrame. Rows without an ID are removed, and afterwards
    the first remaining row is dropped as well.

    Parameters:
    - req (Response): The Response object obtained from a web request.
      Only its ``text`` attribute is read.

    Returns:
    - pd.DataFrame: Columns 'Name', 'Text' and 'ID'. The index keeps the
      position of each speech among all matched divs, so it may start at 1
      and contain gaps. The frame is empty when no usable speech is found.

    Example:
    >>> url = "http://example.com/transcripts"
    >>> req = requests.get(url)
    >>> df = extract_transcript(req)
    >>> print(df.head())
    """
    soup = BeautifulSoup(req.text, 'html.parser')  # Parse the HTML as a string

    # Parallel lists, one entry per speech div
    names = []
    texts = []
    numbers = []

    # Each div with class 'video-item-referat' is one speech
    for div in soup.find_all("div", class_="video-item-referat"):
        # Speaker name; a speech div without a 'name' div yields an empty
        # name instead of raising AttributeError on None.
        name_div = div.find("div", class_="name")
        raw_name = name_div.get_text(strip=True) if name_div is not None else ''
        # Remove every parenthesised part (parentheses included), then trim
        name = re.sub(r'\([^)]*\)', '', raw_name).strip()

        # The transcript is spread over <p> tags with class 'Tekst' or 'TekstIndryk'
        text_elements = div.find_all("p", class_=["Tekst", "TekstIndryk"])
        combined_text = ' '.join(p.get_text(strip=True) for p in text_elements)

        # The speech ID is the last '_'-separated part of the id attribute
        # of the 'video-item-content' div; 'N/A' marks a missing ID.
        content_div = div.find("div", class_="video-item-content")
        if content_div is not None and 'id' in content_div.attrs:
            number = content_div['id'].split('_')[-1]
        else:
            number = 'N/A'

        names.append(name)
        texts.append(combined_text)
        numbers.append(number)

    df = pd.DataFrame({
        'Name': names,
        'Text': texts,
        'ID': numbers
    })

    # Remove rows whose ID could not be determined
    df = df[df.ID != 'N/A']

    # Drop the first remaining row. Positional slicing is a no-op on an empty
    # frame, whereas df.drop(df.index[0]) raises IndexError there.
    df = df.iloc[1:]

    return df

In [31]:
def old_extract_timestamps(req: Response, video_start_clock: str) -> pd.DataFrame:
    """
    Extract speech start/end offsets relative to the start of the video.

    The page's ``<script>`` tag that mentions ``SpeachItem`` is searched for
    ``new SpeachItem(...)`` calls. For every call the start and end
    clock times and the speech ID are read from fixed character positions,
    and the clock times are converted to seconds elapsed since
    ``video_start_clock``.

    Parameters:
    - req (Response): The Response object obtained from a web request.
      Only its ``text`` attribute is read.
    - video_start_clock (str): Time of day at which the video starts,
      formatted as 'HH:MM:SS'.

    Returns:
    - pd.DataFrame: Columns 'Start' and 'End' (seconds from the video start,
      as floats), 'StartClock' and 'EndClock' (the 'HH:MM:SS' strings as
      found in the page) and 'ID'.

    Raises:
    - ValueError: If no script tag containing 'SpeachItem' is found, or if a
      clock string does not match 'HH:MM:SS'.
    """
    clock_format = '%H:%M:%S'

    soup = BeautifulSoup(req.text, 'html.parser')  # Parse the HTML as a string

    # Find the script tag containing the timestamps. 'string=' is the current
    # name of the bs4 argument that used to be called 'text='.
    script_tag = soup.find('script', string=re.compile('SpeachItem'))
    if script_tag is None:
        raise ValueError("No <script> tag containing 'SpeachItem' was found in the response")

    # Every 'new SpeachItem(...)' occurrence, with the 'new SpeachItem' prefix
    # removed so that each item starts at the opening parenthesis.
    speach_items = [
        item.replace('new SpeachItem', '')
        for item in re.findall(r'new SpeachItem\(.*\)', script_tag.string)
    ]

    start_times = []
    end_times = []
    start_times_clock = []
    end_times_clock = []
    ids = []

    # Reference point for the offsets
    video_start_clock_datetime = datetime.strptime(video_start_clock, clock_format)

    for item in speach_items:
        # Fixed character offsets into the argument list.
        # NOTE(review): assumes every SpeachItem call has the same fixed-width
        # layout -- confirm against the page source.
        speach_start_clock = item[19:27]
        speach_end_clock = item[41:49]

        speach_start_clock_datetime = datetime.strptime(speach_start_clock, clock_format)
        speach_end_clock_datetime = datetime.strptime(speach_end_clock, clock_format)

        # Seconds elapsed between the video start and the speech start/end
        speach_start_timestamp = (speach_start_clock_datetime - video_start_clock_datetime).total_seconds()
        speach_end_timestamp = (speach_end_clock_datetime - video_start_clock_datetime).total_seconds()

        start_times_clock.append(speach_start_clock)
        end_times_clock.append(speach_end_clock)
        start_times.append(speach_start_timestamp)
        end_times.append(speach_end_timestamp)
        ids.append(item[58:-2])

    time_stamps = pd.DataFrame({
        'Start': start_times,
        'End': end_times,
        'StartClock': start_times_clock,
        'EndClock': end_times_clock,
        'ID': ids
    })
    return time_stamps

In [41]:
def extract_timestamps(req: Response) -> pd.DataFrame:
    """
    Extract the clock times and duration of every speech from a fetched page.

    The page's ``<script>`` tag that mentions ``SpeachItem`` is searched for
    ``new SpeachItem(...)`` calls. For every call the start and end
    clock times and the speech ID are read from fixed character positions,
    and the duration is computed as end minus start. The first extracted
    item is dropped from the result.

    Parameters:
    - req (Response): The Response object obtained from a web request.
      Only its ``text`` attribute is read.

    Returns:
    - pd.DataFrame: Columns 'StartClock' and 'EndClock' (the 'HH:MM:SS'
      strings as found in the page), 'Duration' (seconds, as floats) and
      'ID'. Because the first row is removed by position, the index starts
      at 1.

    Raises:
    - ValueError: If no script tag containing 'SpeachItem' is found, or if a
      clock string does not match 'HH:MM:SS'.
    """
    clock_format = '%H:%M:%S'

    soup = BeautifulSoup(req.text, 'html.parser')  # Parse the HTML as a string

    # Find the script tag containing the timestamps. 'string=' is the current
    # name of the bs4 argument that used to be called 'text='.
    script_tag = soup.find('script', string=re.compile('SpeachItem'))
    if script_tag is None:
        raise ValueError("No <script> tag containing 'SpeachItem' was found in the response")

    # Every 'new SpeachItem(...)' occurrence, with the 'new SpeachItem' prefix
    # removed so that each item starts at the opening parenthesis.
    speach_items = [
        item.replace('new SpeachItem', '')
        for item in re.findall(r'new SpeachItem\(.*\)', script_tag.string)
    ]

    start_times_clock = []
    end_times_clock = []
    ids = []
    duration = []

    for item in speach_items:
        # Fixed character offsets into the argument list.
        # NOTE(review): assumes every SpeachItem call has the same fixed-width
        # layout -- confirm against the page source.
        speach_start_clock = item[19:27]
        speach_end_clock = item[41:49]

        speach_start_clock_datetime = datetime.strptime(speach_start_clock, clock_format)
        speach_end_clock_datetime = datetime.strptime(speach_end_clock, clock_format)

        # Length of the speech in seconds. Both clocks are parsed without a
        # date, so an end time earlier than the start time gives a negative value.
        seconds = (speach_end_clock_datetime - speach_start_clock_datetime).total_seconds()

        start_times_clock.append(speach_start_clock)
        end_times_clock.append(speach_end_clock)
        duration.append(seconds)
        ids.append(item[58:-2])

    time_stamps = pd.DataFrame({
        'StartClock': start_times_clock,
        'EndClock': end_times_clock,
        'Duration': duration,
        'ID': ids
    })

    # Drop the first row
    time_stamps = time_stamps.iloc[1:]
    return time_stamps

In [20]:
# Page to scrape; its HTML is what the extract_* functions parse.
url = "https://www.ft.dk/aktuelt/webtv/video/20222/salen/5.aspx?from=28-11-2022&to=19-12-2022&selectedMeetingType=Salen&committee=&as=1#player"
# requests applies no timeout by default, so bound the call to keep a stalled
# server from hanging the notebook.
req = requests.get(url, timeout=30)
# Fail on HTTP error statuses instead of silently parsing an error page.
req.raise_for_status()

Unnamed: 0,Name,Text,ID
0,Formanden Søren Gade,Der er som bekendt ingen minister på dette omr...,912053
1,Jens Joel,"Tak for det, formand. Og ja, det er rigtigt, a...",912064
2,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912075
3,Thomas Danielsen,"Tak for det, formand. Venstre bifalder EU-refo...",912078
4,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912079


In [42]:
# Parse the fetched page into one row per speech (columns Name, Text, ID).
transcripts = extract_transcript(req)
# Last expression of the cell: shows the first five rows.
transcripts.head()

Unnamed: 0,Name,Text,ID
1,Jens Joel,"Tak for det, formand. Og ja, det er rigtigt, a...",912064
2,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912075
3,Thomas Danielsen,"Tak for det, formand. Venstre bifalder EU-refo...",912078
4,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912079
5,Henrik Frandsen,"Mange tak, hr. formand. Jeg vil gerne starte m...",912080


In [43]:
# Parse the same page into one row per speech with StartClock, EndClock,
# Duration (seconds) and ID.
time_stamps = extract_timestamps(req)
# Last expression of the cell: shows the first five rows.
time_stamps.head()

Unnamed: 0,StartClock,EndClock,Duration,ID
1,10:01:28,10:06:41,313.0,912064
2,10:06:41,10:06:49,8.0,912075
3,10:06:49,10:07:25,36.0,912078
4,10:07:25,10:07:36,11.0,912079
5,10:07:36,10:09:15,99.0,912080


In [45]:
# The next part requires some manual work.
# First, go to the website and note the time of day at which the video starts.
meeting_begin = '09:59:34'
# Then print the first speech in `transcripts` and look up, in the video, the
# time of day at which this person starts talking:
print(f'{transcripts.iloc[0]["Name"]}: {transcripts.iloc[0]["Text"]}')
# NOTE(review): the scraped StartClock for this speech in the output above is
# 10:01:28, six seconds later than this manually noted value -- confirm which
# one is intended.
first_speaker_clock = '10:01:22'

Jens Joel: Tak for det, formand. Og ja, det er rigtigt, at vi nu behandler beslutningsforslag B 4 om opfølgning på visse af EU-reformgruppens anbefalinger og ændring af Færøudvalgets og Grønlandsudvalgets retsgrundlag. Hvis man starter med EU-reformgruppen, kan man nævne, at vi i Europaudvalget og i Folketinget i juni 2020 besluttede at nedsætte en ekspertgruppe, der skulle kigge på, hvordan vi arbejdede med EU-sagerne, nu hvor det faktisk i disse måneder er 50 år siden, vi blev medlem af samarbejdet i Europa; og der er næppe nogen tvivl om, at vi har et velfungerende Europaudvalg. Vi har også haft et Europaudvalg, som har dannet skole for nogle andre parlamenter rundtomkring i Europa, men vi har også kunnet se, at vi har gjort det på den samme måde i 50 år, og at der måske var behov for at justere nogle ting. Særligt har vi i Europaudvalget og i den ekspertgruppe, der så har siddet og kigget på det, konstateret, at vi nogle gange – og for ofte – var i en situation, hvor vi kom lidt fo

In [68]:

# Inspect the types of the scraped and the manually noted start times.
# iloc[0, 0] is the first row's first column of `time_stamps`.
# NOTE(review): extract_timestamps as defined above stores 'HH:MM:SS' strings
# in its first column, yet the recorded output below shows a pandas Timestamp;
# presumably this cell ran against a different `time_stamps` -- confirm.
first_timestamp = time_stamps.iloc[0, 0]
print(type(first_timestamp))
# The name is first bound to the clock string and then rebound to the parsed
# datetime (strptime without a date part defaults to 1900-01-01).
first_speaker_clock = '10:01:22'
first_speaker_clock = datetime.strptime(first_speaker_clock, '%H:%M:%S')
print(type(first_speaker_clock))




<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'datetime.datetime'>


In [23]:
#combine the two dataframes on the number column
df = pd.merge(transcripts, time_stamps, on='ID')
# drop the number column
df = df.drop(columns=['ID'])

In [24]:
# Last expression of the cell: displays the merged DataFrame.
df

Unnamed: 0,Name,Text,Start,End,StartClock,EndClock
0,Formanden Søren Gade,Der er som bekendt ingen minister på dette omr...,70.0,107.0,10:00:51,10:01:28
1,Jens Joel,"Tak for det, formand. Og ja, det er rigtigt, a...",107.0,420.0,10:01:28,10:06:41
2,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",420.0,428.0,10:06:41,10:06:49
3,Thomas Danielsen,"Tak for det, formand. Venstre bifalder EU-refo...",428.0,464.0,10:06:49,10:07:25
4,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",464.0,475.0,10:07:25,10:07:36
5,Henrik Frandsen,"Mange tak, hr. formand. Jeg vil gerne starte m...",475.0,574.0,10:07:36,10:09:15
6,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",574.0,589.0,10:09:15,10:09:30
7,Karina Lorentzen Dehnhardt,"Det her er jo et af de forslag, som der, efter...",589.0,628.0,10:09:30,10:10:09
8,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",628.0,646.0,10:10:09,10:10:27
9,Peter Skaarup,Jeg skal også på Danmarksdemokraternes vegne t...,646.0,705.0,10:10:27,10:11:26


In [None]:
# Time of day ('HH:MM:SS') at which the video starts.
# NOTE(review): presumably the value passed to old_extract_timestamps -- the
# Start of 70.0 s at 10:00:51 in the merged output above is consistent with
# it. It differs from meeting_begin ('09:59:34') set earlier; confirm which
# one is correct.
video_start_clock = "09:59:41"