<h1>web-scraper-BGMEA-Contact</h1>
<p>A Python script to scrape the BGMEA member list from their website. It uses Selenium, Requests, BeautifulSoup, and Pandas.<br> The output is a CSV file with various information about each member company. For educational purposes only.</p>

<h3>Features</h3>
<p>Scrapes the name, registration number, contact person, email address, and mobile number of each member company from the BGMEA website.<br>
Accesses the individual member page for each company and extracts additional information from the table element.<br>
Handles exceptions such as a missing table element or a timeout error using try-except blocks.<br>
Saves the scraped data in a CSV file with appropriate column names.<br></p>

<h3>Requirements</h3>
Python 3.x<br>
Selenium<br>
Requests<br>
BeautifulSoup<br>
Pandas<br>

<h3>Usage</h3>
<p>Clone or download this repository to your local machine.<br>
Install the required libraries using pip install -r requirements.txt.<br>
Open the scraper.py file and set the num_pages variable to the number of pages you want to scrape. The default value is 370, which corresponds to the total number of pages on the BGMEA website as of September 2023.<br>
Run the script using python scraper.py.<br>
Wait for the script to finish scraping and check the bgmeaContact.csv file for the output.<br></p>

In [None]:
#
import warnings
warnings.filterwarnings("ignore")
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
# Import TimeoutException and StaleElementReferenceException
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException

# Scrape the BGMEA member listing pages with Selenium, look up a detail page
# for each listed row with Requests + BeautifulSoup, and write the combined
# rows to bgmeaContact.csv.

# Column order of the output CSV
columns = ['Member/Company Name', 'BGMEA Reg No', 'Contact Person', 'Email Address', 'Mobile']
# Listing pages are addressed as base_url + <page number>
base_url = 'https://bgmea.com.bd/page/member-list?page='
# Define the number of pages to scrape
num_pages = 370  # You can change this to any number you want
# Seconds to wait for a member detail page before giving up on that row;
# without a timeout a single stalled connection would hang the whole run.
request_timeout = 30

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and appending row by row
# copied the whole frame on every call.
records = []

# One browser session serves every listing page; the finally clause makes
# sure Chrome is shut down even if scraping aborts part-way through.
driver = webdriver.Chrome()
try:
    for i in range(1, num_pages + 1):
        page_url = base_url + str(i)
        driver.get(page_url)

        # Wait for the listing table to load; a page that never shows one is
        # skipped instead of aborting the run and losing the rows gathered so far.
        try:
            listing = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.table'))
            )
        except TimeoutException:
            print("Timed out waiting for table on url: " + page_url)
            continue

        for j, row in enumerate(listing.find_elements(By.CSS_SELECTOR, 'tbody tr'), start=1):
            # NOTE(review): j restarts at 1 on every listing page, so each page
            # requests the same /member/1..N detail pages. This looks
            # unintended -- confirm how a listing row maps to its member id.
            member_url = "https://www.bgmea.com.bd/member/" + str(j)

            # Read the four listing cells; the row can go stale if the page
            # re-renders while it is being read.
            try:
                name = row.find_element(By.XPATH, './td[1]').text
                reg = row.find_element(By.XPATH, './td[2]').text
                contact = row.find_element(By.XPATH, './td[3]').text
                email = row.find_element(By.XPATH, './td[4]').text
            except (TimeoutException, StaleElementReferenceException):
                print("Exception occurred for url: " + member_url)
                continue

            # The detail page is fetched with Requests only. The browser tab
            # that used to be opened for the same URL was never read, and
            # skipping a row left that tab open with the driver focused on it.
            try:
                response = requests.get(member_url, timeout=request_timeout)
            except requests.RequestException:
                print("Request failed for url: " + member_url)
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            detail_table = soup.find("table", class_="table-bordered")
            if detail_table is None:
                # Skip the row if the detail page has no bordered table
                print("No table element found for url: " + member_url)
                continue

            # From every four-cell row keep the third cell, which the script
            # treats as the mobile number (assumed column layout -- confirm
            # against the live page).
            data = [
                cells[2].text.strip()
                for cells in (tr.find_all("td") for tr in detail_table.find_all("tr"))
                if len(cells) == 4
            ]

            # 'Mobile' holds the list of all numbers found for this row
            records.append({
                'Member/Company Name': name,
                'BGMEA Reg No': reg,
                'Contact Person': contact,
                'Email Address': email,
                'Mobile': data,
            })
finally:
    driver.quit()

# Build the result frame in one step with a fixed column order
df = pd.DataFrame(records, columns=columns)

# Print or save df as you wish
print(df)

# Save the DataFrame as a CSV file
df.to_csv('bgmeaContact.csv', index=False)