**Factory latlong web scraper (by type of factory)**

Data from the Department of Industrial Works (กรมโรงงานอุตสาหกรรม), searched via https://www.diw.go.th/webdiw/search-factory/

In [None]:
#important setting
fac_type = "00503" #factory type code to search for; codes are listed at https://www.diw.go.th/datahawk/factype.php
#optional settings
sleep_time1 = 5 #seconds to wait for a results page to load after clicking its page number, default = 5 (a value that is too low may cause errors)
sleep_time2 = 1 #seconds to wait before requesting each factory link, default = 1 (a value that is too low may cause the server to block your IP)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

In [None]:
#Collect every link shown on the search-result pages for fac_type into web_data.
#The whole browser session runs inside try/finally so Chrome is always closed,
#even when a lookup, click or wait raises part-way through.
web_data = []

#open main website, input factory type, and click search
driver = webdriver.Chrome()
try:
    driver.get('https://www.diw.go.th/webdiw/search-factory/')
    wait = WebDriverWait(driver, 30)
    original_window = driver.current_window_handle

    #accept cookie button
    driver.find_element(By.XPATH, '//*[@id="cn-accept-cookie"]').click()

    #input factory type and so on
    element = driver.find_element(By.XPATH, '//*[@id="content"]/article/div/form/table/tbody/tr[5]/td[2]/input')
    element.send_keys(fac_type)
    time.sleep(2)
    driver.find_element(By.XPATH, '/html/body/div[1]/div/main/div/div/div/article/div/form/table/tbody/tr[16]/td/input[2]').click()

    #the script waits for a second window to exist, then switches to whichever handle is not the original one
    wait.until(EC.number_of_windows_to_be(2))
    for window_handle in driver.window_handles:
        if window_handle != original_window:
            driver.switch_to.window(window_handle)
            break
    wait.until(EC.title_is("Factory Results From Query"))

    #start scraping data on new tab

    #find number of pages: one more than the number of "pageno" inputs found
    #NOTE(review): presumably the page currently shown has no "pageno" input of its own - confirm on the live site
    num_pages = len(driver.find_elements(By.NAME, "pageno")) + 1
    print("Number of pages: ", num_pages, " Time spent per page: ", sleep_time1, 'seconds')

    #scrape all links on all pages; page 1 is already displayed, later pages need a click first
    for page_index in range(1, num_pages + 1):
        if page_index > 1:
            print("Extracting from page: ", page_index)
            xpath_selector = f"//input[@name='pageno' and @value='{page_index}']"
            driver.find_element(By.XPATH, xpath_selector).click()
            time.sleep(sleep_time1)

        for elem in driver.find_elements(By.XPATH, "//a[@href]"):
            web_data.append(elem.get_attribute("href"))
finally:
    #always release the browser, otherwise a failed run leaves a Chrome process behind
    driver.quit()
print("Link extraction complete! Now will proceed to extract factory data.")

#drop every collected link that points back at the query form rather than at a factory page
web_data = [link for link in web_data if 'userdb.diw.go.th/query.html' not in link]
print("Number of links: ", len(web_data), " Time spent per link: ", sleep_time2, 'seconds')

#beautifulsoup for factory lat long extraction
#Each link is downloaded and parsed on its own. A page that cannot be fetched or
#that lacks the expected layout is reported and skipped, so one bad page neither
#aborts the run nor reuses values parsed from the previous factory.
request_timeout = 30 #seconds to wait for a factory page before giving up on it

wdata = []
for url in web_data:
    time.sleep(sleep_time2) #pause between requests, see sleep_time2
    try:
        page = requests.get(url, timeout=request_timeout)
        page.raise_for_status()
    except requests.RequestException as err:
        print("Skipping link, request failed: ", url, err)
        continue

    body = BeautifulSoup(page.content, 'html.parser').find("body")
    if body is None:
        print("Skipping link, page has no <body>: ", url)
        continue

    #fields are picked by the position of the node directly inside <body>
    nodes = [str(node) for node in body.contents]
    try:
        #node 0: fac_id is the third space-separated token
        fac_id = nodes[0].split(" ")[2]
        #node 1: take the text after the last 'N' up to the next '<', remove spaces, split at 'E'
        #NOTE(review): presumably the text looks like "N <lat> E <long>" - confirm on a live page
        latlong = nodes[1].split('N')[-1].replace(" ", "").split("<")[0].split('E')
        lat, lon = latlong[0], latlong[1]
        #node 4: fac_name is whatever follows the last ':'
        fac_name = nodes[4].split(":")[-1].strip()
    except IndexError:
        print("Skipping link, unexpected page layout: ", url)
        continue

    wdata.append([fac_id, fac_name, fac_type, lat, lon, url])

#write the collected rows to an Excel workbook named after the factory type, then show them
print("Exporting data to excel...")
column_names = ['fac_id', 'fac_name', 'fac_type', 'lat', 'long', 'link']
output_file = f'factory_latlong_type{fac_type}.xlsx'
fac_data = pd.DataFrame(wdata, columns=column_names)
fac_data.to_excel(output_file, index=False)

factory_count = len(fac_data)
print("Factories extraction complete! with a total number of ", factory_count, " factories")
print(fac_data)
