# Instructions:
- Check the instructions in the README.md in the root of this repository
- Run all code in sequence
- :D

In [5]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import glob
import os
import requests
import openpyxl  # TODO: is this used?

In [6]:
# Set to True to print intermediate results (parsed names, URLs, coordinates,
# DataFrames) and failed-request messages while the notebook runs.
debug_mode = False

# Loading All the HTML files

In [17]:
# Create the 'html' output directory (no error if it already exists);
# a later cell saves the downloaded company front pages under it.
os.makedirs('html', exist_ok=True)
# Collect the paths of all saved .html files in the 'google_html' input directory
html_files = glob.glob('google_html/*.html')

# Obtain all company names, their website URLs, and GPS coordinates, from the html files

In [None]:
# The Google URLs carry the coordinates as "!3d<latitude>" and "!4d<longitude>".
# Compiled once here instead of on every loop iteration.
_LATITUDE_PATTERN = re.compile(r'!3d([-.\d]+)')
_LONGITUDE_PATTERN = re.compile(r'!4d([-.\d]+)')


def _extract_coordinate(pattern, url):
    """Return the number captured by ``pattern`` in ``url`` as a float.

    Returns None when ``url`` is missing, does not contain the pattern, or the
    captured text is not a valid number (e.g. a lone "-" or "1.2.3").
    """
    match = pattern.search(url) if url else None
    if match is None:
        return None
    try:
        return float(match.group(1))
    except ValueError:
        return None


# Create the directory that receives one csv file per html file (once, up front)
os.makedirs('csv', exist_ok=True)

for html_file in html_files:
    # Read the saved results page; the file is closed as soon as it has been read
    with open(html_file, encoding='utf-8') as file:
        html_content = file.read()

    # Create a BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all title elements; the website link is looked up relative to each of them
    parent_elements = soup.find_all("div", class_="qBF1Pd fontHeadlineSmall")

    company_names = []  # These are named titles in Google Maps jargon
    company_urls = []
    # Maps each website URL to the row index that currently holds it. The same URL
    # can be picked up for several consecutive titles (presumably because find_next
    # reaches past a result without a website into the next result - TODO confirm),
    # so only the LAST title that yields a URL keeps it; earlier rows are blanked.
    seen_urls = {}
    for parent in parent_elements:
        title = re.sub(' +', ' ', parent.get_text().replace('\n', '').strip())
        link_tag = parent.find_next("a", class_="lcr4fd S9kvJb")
        url = link_tag.get('href') if link_tag is not None else None
        # A missing URL (None) is not a duplicate of anything, so it is left alone
        if url is not None:
            if url in seen_urls:
                company_urls[seen_urls[url]] = ""
            seen_urls[url] = len(company_urls)
        company_names.append(title)
        company_urls.append(url)

    if debug_mode:
        print("Company names:", company_names)
        print("Company URLs:", company_urls)

    # Find the href in the a tag with class "hfpxzc" (None if an anchor has no href)
    google_urls = [link.get('href') for link in soup.find_all("a", class_="hfpxzc")]
    if debug_mode:
        print("Google URLs:", google_urls)

    # Pull latitude and longitude out of each Google URL (None when not present)
    latitude = [_extract_coordinate(_LATITUDE_PATTERN, url) for url in google_urls]
    longitude = [_extract_coordinate(_LONGITUDE_PATTERN, url) for url in google_urls]

    # Print the GPS coordinates, for debugging purposes
    gps_coordinates = list(zip(latitude, longitude))
    if debug_mode:
        print("GPS Coordinates:", gps_coordinates)

    # Put the data into a pandas DataFrame.
    # NOTE: pandas raises ValueError here if the number of titles differs from the
    # number of Google URLs found in the page.
    df = pd.DataFrame({
        'Company name': company_names,
        'Company URL': company_urls,
        'Google URL': google_urls,
        'Latitude': latitude,
        'Longitude': longitude,
    })
    if debug_mode:
        print(df)

    # Save the output to a csv file in the csv directory, named after the html file.
    # splitext swaps only the extension, unlike str.replace which would also rewrite
    # any other ".html" inside the file name.
    csv_file = os.path.splitext(os.path.basename(html_file))[0] + '.csv'
    df.to_csv(os.path.join('csv', csv_file), index=False)

# Visit every company URL, save its front page as HTML, and record its meta description if it has one

In [25]:
# Load all csv files in the csv directory
csv_files = glob.glob('csv/*.csv')

# Seconds to wait for a website to respond. Without a timeout, requests.get can
# block indefinitely on an unresponsive server and stall the whole run.
REQUEST_TIMEOUT = 10

for csv_path in csv_files:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_path)
    descriptions = []

    # Downloaded pages go to html/<csv file name without extension>/
    folder = os.path.join('html', os.path.basename(csv_path).replace('.csv', '')).replace('|', '')

    # Iterate over each URL in the 'Company URL' column
    for url, company_name in zip(df['Company URL'], df['Company name']):
        description = None  # stays None unless a description is actually found

        # Missing URLs come back from read_csv as NaN (a truthy float), so test for
        # a non-empty string instead of relying on truthiness.
        if isinstance(url, str) and url:
            try:
                # Send a GET request to the URL
                response = requests.get(url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as e:
                # Print the error message if the request fails
                if debug_mode:
                    print(f"Request failed for URL: {url}, error: {e}")
            else:
                if response.status_code == 200:
                    html_content = response.text

                    # Build a file name from the company name: drop '|' and replace
                    # path separators so that a name such as "A/B" does not create a
                    # nested folder. str() guards against a missing (NaN) name.
                    safe_name = (
                        str(company_name)
                        .replace('|', '')
                        .replace('/', '_')
                        .replace('\\', '_')
                    )
                    path = os.path.join(folder, safe_name + '.html')

                    # Save the html content, making the folder if it doesn't exist
                    os.makedirs(folder, exist_ok=True)
                    with open(path, 'w', encoding='utf-8') as file:
                        file.write(html_content)

                    # Find the meta tag with the name 'description'; .get() returns
                    # None instead of raising when the tag has no content attribute.
                    soup = BeautifulSoup(html_content, 'html.parser')
                    meta_tag = soup.find("meta", attrs={"name": "description"})
                    if meta_tag is not None:
                        description = meta_tag.get('content')

        # Exactly one entry per row, so the list lines up with the DataFrame
        descriptions.append(description)

    # Add the descriptions list as a new column in the DataFrame
    df['Description'] = descriptions

    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_path, index=False)

# Combine the CSVs into one Excel workbook (one sheet per CSV)

In [71]:
# Collect the CSV files found in the csv directory
csv_files = glob.glob('csv/*.csv')

# One workbook, with one worksheet per CSV file
with pd.ExcelWriter('Companies.xlsx') as writer:
    for csv_path in csv_files:
        # The worksheet takes the CSV's file name, minus directory and extension
        file_name = os.path.basename(csv_path)
        sheet_name, _extension = os.path.splitext(file_name)
        # Load the CSV and copy it into its worksheet, leaving out the index column
        df = pd.read_csv(csv_path)
        df.to_excel(writer, sheet_name=sheet_name, index=False)


# Extract the text from the downloaded HTML front pages and save it to a single CSV

In [33]:
# Paths of all .html files located exactly one folder below the 'html' directory
htmls = glob.glob("html/*/*.html")

def extract_text_from_html(html_content):
    """
    Return the text of an HTML document as a single string.

    The script, style, noscript, meta, link and head elements are removed
    first; the remaining text fragments are stripped of surrounding
    whitespace and joined with single spaces.
    """
    non_content_tags = ["script", "style", "noscript", "meta", "link", "head"]

    document = BeautifulSoup(html_content, "html.parser")

    # Detach every element that does not hold page content
    for element in document.find_all(non_content_tags):
        element.extract()

    return document.get_text(separator=" ", strip=True)

# One row per HTML file: the file name and the text extracted from it
data = []

# Loop through HTML files and extract text
for html in htmls:
    with open(html, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Flatten remaining line breaks so each text stays on one CSV line
    extracted_text = extract_text_from_html(html_content).replace("\n", " ").replace("\r", " ")

    # Append to data list
    data.append({"Filename": os.path.basename(html), "ExtractedText": extracted_text})

# Convert list to DataFrame. The columns are named explicitly so that the CSV
# still gets its "Filename,ExtractedText" header when no HTML files were found
# (an empty list on its own would produce a DataFrame without any columns).
df = pd.DataFrame(data, columns=["Filename", "ExtractedText"])

# Save DataFrame to CSV
df.to_csv("extracted_texts.csv", index=False, encoding="utf-8")

In [31]:
# Read extracted_texts.csv back in and display its first rows as a quick check
df = pd.read_csv("extracted_texts.csv")
df.head()

Unnamed: 0,Filename,ExtractedText
0,IoT Nederland.html,IoT Nederland Home Hoe werkt het Wat doen wij ...
1,Let Things Talk BV.html,"0512-208000 Office Ampérelaan 3, 9207 AM, Drac..."
2,The Things Network.html,We are a global collaborative Internet of Thin...
3,Thingsdata.html,Skip to content +31 (0)85-0443500 info@thingsd...
4,BACE IoT.html,Smart Energy Body Monitoring Compliance Report...
