https://www.ft.dk/aktuelt/webtv/video/20222/salen/5.aspx?from=28-11-2022&to=19-12-2022&selectedMeetingType=Salen&committee=&as=1#player


In [1]:
import requests # Downloading webpages
from bs4 import BeautifulSoup # Extracting data from html files
import json
import pandas as pd
import re
from datetime import datetime
from moviepy.editor import VideoFileClip
from pandas import DataFrame
from requests import Response

In [9]:
def extract_transcript(req: Response) -> pd.DataFrame:
    '''
    Extracts transcript data from a given Response object.

    This function parses the HTML content of a Response object to extract
    names, texts, and IDs, and then structures this data into a pandas DataFrame.
    It filters out any rows where the ID is 'N/A'.
    The first remaining row is also dropped, as it is hard to define when
    that speech starts.

    Parameters:
    - req (Response): The Response object obtained from a web request.

    Returns:
    - pd.DataFrame: A pandas DataFrame containing columns for 'Name', 'Text', and 'ID'.
      The index labels are the positions of the speeches on the page, so they
      are not necessarily consecutive. The frame is empty when the page holds
      no usable speeches.
    '''

    soup = BeautifulSoup(req.text, 'html.parser') # Parse the HTML as a string

    # Initialize lists to store the extracted data
    names = []
    texts = []
    numbers = []

    # Find all divs with class 'video-item-referat' - each one is a speech
    for div in soup.find_all("div", class_="video-item-referat"):
        # Extract name and clean it; a div without a 'name' child gets an
        # empty name instead of crashing the whole extraction
        name_div = div.find("div", class_="name")
        name = name_div.get_text(strip=True) if name_div is not None else ''
        name = re.sub(r'\([^)]*\)', '', name) # Remove everything within parentheses and parentheses themselves
        name = name.strip() # Remove whitespace at beginning and end

        # Extract texts from all <p> with class 'Tekst' and 'TekstIndryk' - this is the transcript
        text_elements = div.find_all("p", class_=["Tekst", "TekstIndryk"])
        combined_text = ' '.join(p.get_text(strip=True) for p in text_elements)

        # Extract number (last '_'-separated part of the id of 'video-item-content') - this is an ID for the speech
        content_div = div.find("div", class_="video-item-content")
        if content_div is not None and 'id' in content_div.attrs:
            number = content_div['id'].split('_')[-1]
        else:
            number = 'N/A'

        # Append to lists
        names.append(name)
        texts.append(combined_text)
        numbers.append(number)

    # Create DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Text': texts,
        'ID': numbers
    })

    # Remove rows that contain 'N/A'
    df = df[df.ID != 'N/A']

    # Drop the first row; positional slicing keeps the remaining index labels
    # and, unlike df.drop(df.index[0]), does not fail on an empty frame
    df = df.iloc[1:]

    # Return DataFrame
    return df

In [7]:
def extract_duration(req: Response) -> pd.DataFrame:
    '''
    Extracts the duration of each speech from a given Response object.

    The timestamps are read from the <script> tag whose content mentions
    'SpeachItem'. Every `new SpeachItem(...)` call in it is located with a
    regular expression, and the start clock, end clock and speech ID are
    taken from fixed character offsets within the call.
    The first item is dropped from the result.

    Parameters:
    - req (Response): The Response object obtained from a web request.

    Returns:
    - pd.DataFrame: A pandas DataFrame containing columns for 'duration'
      (seconds between start and end clock, as float) and 'ID' (string).
      The frame is empty when the page has no SpeachItem script.

    Raises:
    - ValueError: If a sliced clock value does not match '%H:%M:%S'.
    '''

    soup = BeautifulSoup(req.text, 'html.parser') # Parse the HTML as a string

    # Find the script tag containing the timestamps
    # ('string=' is the current name of the deprecated 'text=' argument)
    script_tag = soup.find('script', string=re.compile('SpeachItem'))

    # Without such a script there is nothing to extract; fall through with an
    # empty text so the result is an empty frame with the usual columns
    script_text = script_tag.string if script_tag is not None else ''

    speach_items = re.findall(r'new SpeachItem\(.*\)', script_text) # Extract all instances of SpeachItem
    speach_items = [item.replace('new SpeachItem', '') for item in speach_items] # Remove 'new SpeachItem'

    # Loop over speach_items and extract the timestamps
    speech_ids = []
    duration = []

    for item in speach_items:
        # Get start and end clock.
        # NOTE(review): these are fixed character offsets into the argument
        # list; they assume the page's current SpeachItem layout - confirm
        # against the page source if the extraction starts failing.
        speach_start_clock = item[19:27]
        speach_end_clock = item[41:49]

        # Convert to datetime (clock only, no date part)
        speach_start_clock_datetime = datetime.strptime(speach_start_clock, '%H:%M:%S')
        speach_end_clock_datetime = datetime.strptime(speach_end_clock, '%H:%M:%S')

        # Calculate the timedelta (difference) (duration).
        # NOTE: because only the clock is parsed, a speech that crosses
        # midnight yields a negative duration.
        seconds = (speach_end_clock_datetime - speach_start_clock_datetime).total_seconds()

        # Append duration and ID to lists
        duration.append(seconds)
        speech_ids.append(item[58:-2])

    # Create DataFrame
    durations = pd.DataFrame({
        'duration': duration,
        'ID': speech_ids
    })

    # Drop the first row
    durations = durations.iloc[1:]
    return durations

In [16]:
def to_min_sec(seconds: int) -> str:
    '''
    Converts seconds to a "minutes:seconds" string, e.g. 109 -> "1:49".

    Parameters:
    - seconds (int): Number of seconds. Float values are accepted as well and
      are truncated to whole seconds.

    Returns:
    - str: Minutes and zero-padded seconds separated by a colon.
    '''
    # Coerce to int first: the 'd' format code below rejects floats, and the
    # durations in this notebook are floats
    minutes, sec = divmod(int(seconds), 60)
    return f"{minutes}:{sec:02d}"

In [17]:
def get_timestamps(df: pd.DataFrame, initial_start_time: float) -> pd.DataFrame:
    '''
    Calculates the start and end timestamps for each row in a DataFrame.

    Rows are treated as back-to-back: each row starts where the previous one
    ends, and the first row starts at `initial_start_time`.

    Parameters:
    - df (DataFrame): The DataFrame containing the 'duration' column (seconds).
      It is not modified.
    - initial_start_time: Start of the first row, in seconds.

    Returns:
    - DataFrame: A copy of `df` with the added columns 'end' and 'start'
      (whole seconds) and 'start_min' and 'end_min' (the same values
      formatted by to_min_sec).
    '''

    # Work on a copy so the caller's frame is left untouched; this also avoids
    # pandas' SettingWithCopyWarning when `df` is a slice of another frame
    df = df.copy()

    # Calculate the cumulative sum of 'duration' for the 'end' column and add the initial start time
    df['end'] = df['duration'].cumsum() + initial_start_time

    # Shift the 'end' column to create the 'start' column, and initialize the first value with the initial start time
    df['start'] = df['end'].shift(1).fillna(initial_start_time)

    # Convert 'start' and 'end' columns to integer (whole seconds)
    df['start'] = df['start'].astype(int)
    df['end'] = df['end'].astype(int)

    # Apply the conversion to 'start' and 'end' and create new columns
    df['start_min'] = df['start'].apply(to_min_sec)
    df['end_min'] = df['end'].apply(to_min_sec)

    return df

In [5]:
# Page whose transcript and speech timestamps are extracted in the cells below
url = "https://www.ft.dk/aktuelt/webtv/video/20222/salen/5.aspx?from=28-11-2022&to=19-12-2022&selectedMeetingType=Salen&committee=&as=1#player"
# NOTE(review): the download is commented out, but the following cells read `req`;
# on a fresh kernel this line has to be run once, otherwise `req` is undefined.
#req = requests.get(url)

In [10]:
# Extract speaker name, text and ID for every speech on the page, then preview the first rows
transcripts = extract_transcript(req)
transcripts.head()

Unnamed: 0,Name,Text,ID
1,Jens Joel,"Tak for det, formand. Og ja, det er rigtigt, a...",912064
2,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912075
3,Thomas Danielsen,"Tak for det, formand. Venstre bifalder EU-refo...",912078
4,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912079
5,Henrik Frandsen,"Mange tak, hr. formand. Jeg vil gerne starte m...",912080


In [11]:
# Extract the duration (in seconds) and ID of every speech, then preview the first rows
durations = extract_duration(req)
durations.head()

Unnamed: 0,duration,ID
1,313.0,912064
2,8.0,912075
3,36.0,912078
4,11.0,912079
5,99.0,912080


In [18]:
# Start of the first speech, given as minutes and seconds and converted to seconds.
# (Named start_minutes/start_seconds rather than min/sec so the builtin `min`
# is not shadowed for the rest of the session.)
# NOTE(review): presumably measured from the start of the video - confirm.
start_minutes = 1
start_seconds = 49
start_time = start_minutes * 60 + start_seconds

timestamps = get_timestamps(durations, start_time)

In [19]:
# Preview the first rows of the computed start/end timestamps
timestamps.head()

Unnamed: 0,duration,ID,end,start,start_min,end_min
1,313.0,912064,422,109,1:49,7:02
2,8.0,912075,430,422,7:02,7:10
3,36.0,912078,466,430,7:10,7:46
4,11.0,912079,477,466,7:46,7:57
5,99.0,912080,576,477,7:57,9:36


Få styr på timediff.
Der skal til sidst være en kolonne med sekunder, så man kan slice videoen.
Lige nu er timestamps ikke synkroniserede.

In [20]:
# Create a new df that joins each transcript row with its timestamps via the shared 'ID' column.
# pd.merge defaults to an inner join, so IDs present in only one of the frames are dropped.
df = pd.merge(transcripts, timestamps, on='ID')

In [21]:
# Display the merged transcript/timestamp table
df

Unnamed: 0,Name,Text,ID,duration,end,start,start_min,end_min
0,Jens Joel,"Tak for det, formand. Og ja, det er rigtigt, a...",912064,313.0,422,109,1:49,7:02
1,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912075,8.0,430,422,7:02,7:10
2,Thomas Danielsen,"Tak for det, formand. Venstre bifalder EU-refo...",912078,36.0,466,430,7:10,7:46
3,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912079,11.0,477,466,7:46,7:57
4,Henrik Frandsen,"Mange tak, hr. formand. Jeg vil gerne starte m...",912080,99.0,576,477,7:57,9:36
5,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912081,15.0,591,576,9:36,9:51
6,Karina Lorentzen Dehnhardt,"Det her er jo et af de forslag, som der, efter...",912082,39.0,630,591,9:51,10:30
7,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912083,18.0,648,630,10:30,10:48
8,Peter Skaarup,Jeg skal også på Danmarksdemokraternes vegne t...,912043,59.0,707,648,10:48,11:47
9,Formanden Søren Gade,"Tak for det. Der er ikke nogen, der har bedt o...",912044,10.0,717,707,11:47,11:57
