<a href="https://colab.research.google.com/github/Evanson12/Data-Analysis-for-OEWG-Transcripts/blob/main/HTML_file_for_OEWG_transcripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Step 1: Make Google Drive visible to the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Load the saved HTML page from Drive
from bs4 import BeautifulSoup, Comment  # Comment is used further down to pick out HTML comment nodes

# Location of the input page inside the mounted Drive
html_file_path = '/content/drive/My Drive/HTML/oewg7.html'

# ISO-8859-1 maps every byte value to a character, so this read cannot raise
# a decode error.
# NOTE(review): if the file is actually UTF-8, non-ASCII characters will be
# mis-decoded -- confirm the page's real encoding.
with open(html_file_path, mode='r', encoding='ISO-8859-1') as html_file:
    html_content = html_file.read()

# Step 3: Parse the markup with Python's built-in HTML parser
soup = BeautifulSoup(html_content, features='html.parser')

# Parallel accumulators: position i in each list becomes row i of the table
speakers, timestamps, statements = [], [], []
added_statements = set()  # Statement texts already collected (used by the statement pass)

# One entry is appended to BOTH lists for every matching block, so speakers
# and timestamps always stay the same length; '' stands in for a missing piece.
column_blocks = soup.find_all('div', class_='wp-block-columns is-not-stacked-on-mobile')
for block in column_blocks:
    # Speaker name lives in the block's <h5 class="wp-block-heading">
    heading = block.find('h5', class_='wp-block-heading')
    speakers.append(heading.get_text(separator=' ', strip=True) if heading else '')

    # Timestamp lives in the block's right-aligned <p>
    stamp = block.find('p', class_='has-text-align-right')
    timestamps.append(stamp.get_text(separator=' ', strip=True) if stamp else '')

# Extracting the statements separately.
#
# The substring test below matches any comment containing 'wp:paragraph',
# which includes closing markers of the form '/wp:paragraph'. The same <p>
# element can therefore be reached from more than one comment and must be
# collected only once.
#
# De-duplication is keyed on the identity of the <p> element, not on its text:
# keying on text silently dropped any statement whose wording repeated an
# earlier one, which shifted every later statement up a row relative to the
# speakers/timestamps lists.
seen_paragraph_ids = set()  # id() of every <p> element already collected

for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    if 'wp:paragraph' not in comment:
        continue

    # Nearest following sibling <p>; only class-less paragraphs are statements
    # (the timestamp paragraphs carry a class and are handled elsewhere).
    p_tag = comment.find_next_sibling('p')
    if p_tag is None or p_tag.get('class'):
        continue

    # BeautifulSoup compares Tag objects by their markup, so object identity
    # (id) is used to tell two different paragraphs with equal text apart.
    # The ids stay valid because `soup` keeps every element alive.
    if id(p_tag) in seen_paragraph_ids:
        continue
    seen_paragraph_ids.add(id(p_tag))

    statement = p_tag.get_text(separator=' ', strip=True)
    statements.append(statement)
    added_statements.add(statement)  # Still records every distinct statement text

# Bring the three lists to a common length so they can form DataFrame columns.
# Shorter lists are extended in place with '' at the end; nothing is truncated.
max_length = max(len(speakers), len(timestamps), len(statements))

for column in (speakers, timestamps, statements):
    shortfall = max_length - len(column)  # 0 for the longest list
    column.extend([''] * shortfall)

# Assemble the three equal-length lists into a table, one column per list
import pandas as pd

data = dict(zip(
    ('Speaker', 'Timestamp', 'Statement'),
    (speakers, timestamps, statements),
))

df = pd.DataFrame(data)

# Preview the first five rows as a sanity check
print(df.head(5))

# Step 4: Write the table to Drive as a numbered CSV file.
# The number is a plain constant: edit it by hand before each run, otherwise
# the file written by the previous run is overwritten.
file_number = 1  # Update this number manually for each run

# Destination inside the mounted Drive, next to the input HTML
extracted_data_path = f'/content/drive/My Drive/HTML/extracted_data_{file_number}.csv'

# index=False leaves the DataFrame's row index out of the file
df.to_csv(path_or_buf=extracted_data_path, index=False)
print(f"Extracted data saved to 'extracted_data_{file_number}.csv'.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
             Speaker Timestamp  \
0  Ambassador Gafoor     00:00   
1               Cuba     00:50   
2  Ambassador Gafoor     07:29   
3         Bangladesh     07:35   
4  Ambassador Gafoor     11:31   

                                           Statement  
0  Distinguished delegates, the eighth meeting of...  
1  Chair, the changing nature of information and ...  
2  Thank you very much, Cuba. Bangladesh to be fo...  
3  Thank you, Mr. Chair. Bangladesh believes that...  
4  Thank you, Bangladesh. Belgium to be followed ...  
Extracted data saved to 'extracted_data_1.csv'.
