In [8]:
import requests
from bs4 import BeautifulSoup
import openpyxl
import os
from PIL import Image
from io import BytesIO
import pytesseract
import base64
from datetime import datetime
from dateutil.relativedelta import relativedelta


In [9]:
# Base URL of the target website
URL = 'https://www.tratencongty.com/'
START_PAGE = 1  # Starting page for scraping

# Define the fields to be extracted from the website
FIELDS = [
    'Tên công ty',
    'Loại hình hoạt động',
    'Mã số thuế',
    'Địa chỉ',
    'Đại diện pháp luật',
    'Ngày cấp giấy phép',
    'Ngày hoạt động',
    'Điện thoại trụ sở',
    'Trạng thái'
]
# Initialize an empty list to store the company data
data = []

# Set the User-Agent header for the HTTP requests
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}


In [10]:
def init_data():
    """
    Load existing data from the Excel file if it exists.
    Returns a list of dictionaries where each dictionary represents a row of data.
    """
    file_path = get_file_path()
    if not os.path.exists(file_path):
        return []
    
    workbook = openpyxl.load_workbook(file_path)
    sheet = workbook.active
    
    # Extract headers from the first row of the Excel sheet
    headers = [cell.value for cell in sheet[1]]
    
    # Load the data rows into a list of dictionaries
    data = []
    for row in sheet.iter_rows(min_row=2, values_only=True):
        row_dict = dict(zip(headers, row))
        data.append(row_dict)
    
    return data

def get_file_path():
    """
    Return the file path for the Excel file to save the data.
    """
    return "./data6.xlsx"

def write_file():
    """
    Write the scraped data to an Excel file, appending each company's information as a row.
    """
    file_path = get_file_path()
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(FIELDS)
    
    # Append each company's data as a row in the Excel sheet
    for company in data:
        row = [company.get(field, '') for field in FIELDS]
        sheet.append(row)
    
    workbook.save(file_path)


In [11]:
def company_exists(new_company):
    """
    Check if the company already exists in the data to avoid duplicates.
    Compares based on the company's name and tax ID.
    """
    for company in data:
        if company['Tên công ty'] == new_company['Tên công ty'] and company['Mã số thuế'] == new_company['Mã số thuế']:
            return True
    return False

def extract_text_from_base64_image(base64_string):
    """
    Decode a base64-encoded image string and use Tesseract OCR to extract text from it.
    Returns the extracted text.
    """
    base64_data = base64_string.split(',')[1]
    img_data = base64.b64decode(base64_data)
    img = Image.open(BytesIO(img_data))
    
    # Use Tesseract to extract text from the image
    text = pytesseract.image_to_string(img, config='--psm 6')
    return text.strip()


In [12]:
def get_company_data(url):
    """
    Scrape the company details from the provided URL and add them to the data list.
    """
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    company = {}
    jumbotron = soup.select_one('.jumbotron')
    
    if jumbotron:
        # Extract basic company details
        name = jumbotron.select_one('h4 a').text.strip()
        company['Tên công ty'] = name
        
        # Split the content based on <br/> tags to identify key information
        info_br = str(jumbotron).split('<br/>')
        for line in info_br:
            if 'Loại hình hoạt động' in line:
                company['Loại hình hoạt động'] = line.split('Loại hình hoạt động:', 1)[1].strip()
            elif 'Mã số thuế' in line:
                img_base64 = BeautifulSoup(line, 'html.parser').select_one('img')['src']
                company['Mã số thuế'] = extract_text_from_base64_image(img_base64)
            elif 'Địa chỉ' in line:
                company['Địa chỉ'] = line.split(':', 1)[1].strip()
            elif 'Đại diện pháp luật' in line:
                company['Đại diện pháp luật'] = line.split(':', 1)[1].strip()
            elif 'Ngày cấp giấy phép' in line:
                company['Ngày cấp giấy phép'] = line.split(':', 1)[1].strip()
            elif 'Ngày hoạt động' in line:
                date_part = BeautifulSoup(line, 'html.parser').text.split(':', 1)[1].strip()
                company['Ngày hoạt động'] = date_part
            elif 'Điện thoại trụ sở' in line:
                img_base64 = BeautifulSoup(line, 'html.parser').select_one('img')['src']
                company['Điện thoại trụ sở'] = extract_text_from_base64_image(img_base64)
            elif 'Trạng thái' in line:
                company['Trạng thái'] = line.split(':', 1)[1].strip()
        
        # Add the company URL for reference
        company['url'] = url
        
        # Add the company to the data list if it doesn't already exist
        if not company_exists(company):
            data.append(company)
    else:
        print(f"No jumbotron found for URL: {url}")

def get_company_links(page):
    """
    Fetch all company links from the given page number.
    Returns a list of URLs.
    """
    response = requests.get(f"{URL}?page={page}", headers=HEADERS)
    print(f"Fetching links from page {page}, Status code: {response.status_code}")
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the href attribute of each company link
    links = [a['href'] for a in soup.select('.search-results a')]
    print(f"Links found: {links}")
    
    return links


In [13]:
def main():
    """
    Main function that orchestrates the scraping process.
    Loads existing data, fetches company links, scrapes company data, and writes to an Excel file.
    """
    global data
    data = init_data()  # Load existing data
    
    # Loop through pages to scrape company links and data
    for page in range(START_PAGE, 5):  # Get company that recently added 
        print(f"Processing page {page}")
        links = get_company_links(page)
        
        for link in links:
            get_company_data(link)
        
        write_file()  # Save the data to the Excel file after processing each page

# Call the main function to start the scraping process
main()


Processing page 1
Fetching links from page 1, Status code: 200
Links found: ['https://www.tratencongty.com/company/1aab1763b-cong-ty-tnhh-goldsea-group/', 'https://www.tratencongty.com/company/1aab1763b-cong-ty-tnhh-goldsea-group/', 'https://www.tratencongty.com/company/11bb65ffc-cong-ty-tnhh-ky-thuat-dien-phuoc-long-bp/', 'https://www.tratencongty.com/company/11bb65ffc-cong-ty-tnhh-ky-thuat-dien-phuoc-long-bp/', 'https://www.tratencongty.com/company/11bb65ff5-cong-ty-tnhh-san-xuat-tm-dv-thanh-thien-minh/', 'https://www.tratencongty.com/company/11bb65ff5-cong-ty-tnhh-san-xuat-tm-dv-thanh-thien-minh/', 'https://www.tratencongty.com/company/11bb65fee-cong-ty-tnhh-dau-tu-phat-trien-giao-duc-dat-viet/', 'https://www.tratencongty.com/company/11bb65fee-cong-ty-tnhh-dau-tu-phat-trien-giao-duc-dat-viet/', 'https://www.tratencongty.com/company/b6591299-cong-ty-tnhh-giay-da-an-phat/', 'https://www.tratencongty.com/company/b6591299-cong-ty-tnhh-giay-da-an-phat/', 'https://www.tratencongty.com/com

KeyboardInterrupt: 