# Instructions:
- Check the instructions in the README.md in the root of this repository
- Run all code in sequence
- :D

In [60]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import glob
import os
import requests
import openpyxl


# Loading All the HTML files

In [22]:
# Create directory to store the html files
os.makedirs('html', exist_ok=True)
# Use glob to find all the html files in the html directory
html_files = glob.glob('html/*.html')

# Obtain all company names, their website URLs, and GPS coordinates, from the html files

In [None]:
for html_file in html_files:
    # Load html content from a file "IT companies south east Drenthe.html"
    with open(html_file, encoding='utf-8') as file:
        html_content = file.read()

        # Create a BeautifulSoup object
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all parent elements that contain both the title and the URL
        parent_elements = soup.find_all("div", class_="qBF1Pd fontHeadlineSmall")

        company_names = []  # These are named titles in Google Maps jargon
        company_urls = []
        seen_urls = {}  # There was a bug in the code that caused duplicate URLs to be added to the list, so we need to keep track of the URLs we have already seen and filter those out. Not the pretiest solution, but it works.
        for parent in parent_elements:
            title = re.sub(' +', ' ', parent.get_text().replace('\n', '').strip())
            link_tag = parent.find_next("a", class_="lcr4fd S9kvJb")
            url = link_tag['href'] if link_tag and 'href' in link_tag.attrs else None
            if url in seen_urls:
                company_urls[seen_urls[url]] = ""
            seen_urls[url] = len(company_urls)
            company_names.append(title)
            company_urls.append(url)

        print("Company names:", company_names)
        print("Company URLs:", company_urls)

        # Find the href in the a tag with class "hfpxzc"
        google_urls = [link['href'] for link in soup.find_all("a", class_="hfpxzc")]
        print("Google URLs:", google_urls)

        # Use regular expressions to find latitude and longitude
        latitude = [re.search(r'!3d([-.\d]+)', url) for url in google_urls]
        longitude = [re.search(r'!4d([-.\d]+)', url) for url in google_urls]

        # Extract and convert to float
        latitude = [float(lat.group(1)) if lat else None for lat in latitude]
        longitude = [float(lon.group(1)) if lon else None for lon in longitude]

        # Print the GPS coordinates, for debugging purposes
        gps_coordinates = [(lat, long) for lat, long in zip(latitude, longitude)]
        print("GPS Coordinates:", gps_coordinates)

        # Put the data into a pandas DataFrame
        df = pd.DataFrame({'Company name': company_names, 'Company URL': company_urls, 'Google URL': google_urls, 'Latitude': latitude, 'Longitude': longitude})
        print(df)

        # Create directory to store a csv file per html file
        os.makedirs('csv', exist_ok=True)

        # Save the output to a csv file in the csv directory, using the name of the html file
        csv_file = os.path.basename(html_file).replace('.html', '.csv')
        df.to_csv(os.path.join('csv', csv_file), index=False)

Company names: ['Flink IT', 'Consult IT!', 'Mones IT', 'Quality ICT', 'Koopmans IT', 'ATL IT Diensten', 'ADJUST-IT', 'sense-it BV', 'Emit IT', 'Vechtdal IT', 'Reverse IT', 'Uptodate-IT', 'Veteris IT Services', 'Aalders IT', 'ASM-IT', 'Ready4IT', 'Hoogeveen Laptop Center (Solutions IT Drenthe)', 'MKByte IT services', 'Arcus IT Drenthe', 'Flink It Bv', 'W&T IT Solutions UG', 'Visolity', 'Jellema ITrainingen', 'NLS-IT', 'i-Beheer ®', 'IT-Memo', 'MisterIggy', 'IT Hub', 'Nova IT', 'Just ICT', 'ICTDrenthe', 'MKB IT', 'BinC2 B.V.', 'Business IT-Rent B.V.', 'IT Worxx', 'Gerr-IT', 'Focus ICT B.V.', 'CDM-iT', 'Netwarc ICT Services', 'Kiestra.com', 'Avalon IT', 'BITS Dalen', 'Spreen ICT', 'Heeyoo Hoogeveen', 'IT services', 'I-Datacenter B.V.', 'Gouman IT Solutions', 'Y A F S E C', 'IT Emporium', 'Ping IT Works', 'Akkro ICT', 'Reality ICT', 'ICT Prima', 'Service ICT Sil Sieljes', 'Doornbos Agra-IT', 'Qontrol-it B.V.', 'Ziva IT BV', 'ASM-IT', 'Hartmann Automatisering BV', 'Vuturo ICT B.V.', 'Roma h

# Visit all Company URLs and see if they have a description

In [58]:
# Load all csv files in the csv directory
csv_files = glob.glob('csv/*.csv')

for csv in csv_files:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv)
    descriptions = []
    
    # Iterate over each URL in the 'Company URL' column
    for url in df['Company URL']:
        if url:
            try:
                # Send a GET request to the URL
                response = requests.get(url)
                if response.status_code == 200:
                    # Parse the HTML content
                    html_content = response.text
                    soup = BeautifulSoup(html_content, 'html.parser')
                    # Find the meta tag with the name 'description'
                    description = soup.find("meta", attrs={"name": "description"})
                    # Append the content of the description meta tag to the descriptions list
                    descriptions.append(description['content'] if description else None)
                else:
                    descriptions.append(None)
            except requests.RequestException as e:
                # Print the error message if the request fails
                print(f"Request failed for URL: {url}, error: {e}")
                descriptions.append(None)
        else:
            descriptions.append(None)

    # Add the descriptions list as a new column in the DataFrame
    df['Description'] = descriptions

    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv, index=False)

Request failed for URL: nan, error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Request failed for URL: nan, error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Request failed for URL: nan, error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Request failed for URL: nan, error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Request failed for URL: nan, error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Request failed for URL: nan, error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Request failed for URL: nan, error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Request failed for URL: nan, error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Request failed for URL: https://itemporium.eu/, error: HTTPSConnectionPool(host='itemporium.eu', port=443): Max retries exceeded with url: / (Caused by 

# Convert the CSVs to one Excel

In [None]:
# Load all csv files in the csv directory
csv_files = glob.glob('csv/*.csv')

# Create an Excel file, that will contain all csv files, each in a separate sheet
with pd.ExcelWriter('companies.xlsx') as writer:
    for csv in csv_files:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv)
        # Get the name of the CSV file (without the directory and extension)
        sheet_name = os.path.splitext(os.path.basename(csv))[0]
        # Write the DataFrame to the Excel file
        df.to_excel(writer, sheet_name=sheet_name, index=False)
