# Search Engine Scraping 

In [None]:
# Importing Libraries

import numpy as np
import pandas as pd
import traceback
from lxml import html, etree
from io import StringIO
import re
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import traceback
from selenium.webdriver.common.by import By

In [None]:
# Keep only the rows that have no website URL but do contain a phone number.

dataframe = pd.read_csv(
    "input_data.csv",
    usecols=[
        "infogroup_id",
        "name",
        "phone",
        "street",
        "city",
        "labels.state",
        "website",
        "labels.location_parent_relationship",
    ],
    # Read phone numbers as text: the search query is built with
    # str(row['phone']), and a column parsed as float would render as
    # e.g. "5551234567.0".
    dtype={"phone": str},
)

# Rows that still lack a website.
dataframe = dataframe[dataframe['website'].isna()]

# .copy() makes this an independent frame, so the later
# filtered_dataframe.loc[i, 'website'] = ... assignments write to it directly
# rather than to a slice of `dataframe` (SettingWithCopyWarning).
filtered_dataframe = dataframe[dataframe['phone'].notna()].copy()
filtered_dataframe.reset_index(inplace=True, drop=True)

# Every remaining 'website' value is missing. Storing the column as object
# lets URL strings be assigned even if pandas parsed it as a float column
# (presumably only when the file holds no website at all); this is a no-op
# when the column is already object.
filtered_dataframe['website'] = filtered_dataframe['website'].astype(object)

In [None]:
# Scraper Class which scrapes search engine 
# to fetch website URL based on the given query

class SearchEngineScraper:
    """Fetch the first search-result URL for a query using headless Chrome.

    One Chrome session is started per instance and reused for every query.
    Call ``close()`` when finished, or use the instance as a context manager
    (``with SearchEngineScraper() as scraper: ...``), so the browser process
    is shut down.
    """

    # Query-string suffix appended to every search URL.
    # NOTE(review): gl/hl/pws look like Google parameters; whether DuckDuckGo
    # honours them is not visible here -- confirm.
    _URL_SUFFIX = "&gl=us&hl=en&pws=0"

    _GOOGLE_URL = "https://www.google.com/search?q="
    _GOOGLE_RESULT_XPATH = '//div[@class="yuRUbf"]/a[@href]'

    _DUCKDUCKGO_URL = "https://duckduckgo.com/?q="
    _DUCKDUCKGO_RESULT_XPATH = '//a[@data-testid = "result-extras-url-link"]'

    def __init__(self):
        """Start a headless Chrome session stored in ``self.driver``."""
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        # NOTE(review): newer Selenium releases expect the driver path to be
        # passed through a Service object instead of positionally -- confirm
        # against the installed Selenium version.
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        """Shut the browser down when the ``with`` block is left."""
        self.close()

    def close(self) -> None:
        """Quit the Chrome session owned by this scraper."""
        self.driver.quit()

    @staticmethod
    def _build_search_url(base_url: str, query: str) -> str:
        """Return ``base_url`` + URL-encoded ``query`` + the fixed suffix.

        The query is percent-encoded so that characters such as ``&``, ``#``
        or ``+`` in a business name stay part of the search text instead of
        being read as URL syntax.
        """
        from urllib.parse import quote_plus

        return base_url + quote_plus(query) + SearchEngineScraper._URL_SUFFIX

    def _first_result_url(self, base_url: str, query: str, xpath: str) -> str:
        """Load the search page for ``query`` and return its first result link.

        Args:
            base_url: Search URL up to and including ``q=``.
            query: Free-text search query.
            xpath: XPath selecting the result anchor elements on the page.

        Returns:
            The ``href`` attribute of the first element matched by ``xpath``.

        Raises:
            IndexError: If no element on the page matches ``xpath``.
        """
        self.driver.get(self._build_search_url(base_url, query))

        link_elements = self.driver.find_elements(By.XPATH, xpath)
        if not link_elements:
            raise IndexError(f"no search result link found for query {query!r}")

        # Only the first hit is wanted, so only its href is read.
        return link_elements[0].get_attribute("href")

    def google_scrape(self, query: str) -> str:
        """Return the URL of the first Google result for ``query``.

        Raises:
            IndexError: If the result page contains no result link.
        """
        return self._first_result_url(
            self._GOOGLE_URL, query, self._GOOGLE_RESULT_XPATH
        )

    def duckduckgo_scrape(self, query: str) -> str:
        """Return the URL of the first DuckDuckGo result for ``query``.

        Raises:
            IndexError: If the result page contains no result link.
        """
        return self._first_result_url(
            self._DUCKDUCKGO_URL, query, self._DUCKDUCKGO_RESULT_XPATH
        )

    

In [None]:
# Google Search Engine scraping based on a query (Name + Phone)

scraper = SearchEngineScraper()

try:
    for i, row in filtered_dataframe.iterrows():

        query = f"{row['name']!s} {row['phone']!s}"

        try:
            website_link = scraper.google_scrape(query)
        except IndexError:
            # The result page had no result link for this query: leave
            # 'website' empty for this row and move on to the next one
            # instead of aborting the whole run.
            continue
        except Exception:
            # Anything else (browser/driver failure, ...) is reported and
            # stops the run. Unlike a bare `except:`, this does not trap
            # KeyboardInterrupt or SystemExit.
            traceback.print_exc()
            break
        else:
            filtered_dataframe.loc[i, 'website'] = website_link
finally:
    # Always shut the browser down, even when the loop stops early.
    scraper.driver.quit()

filtered_dataframe


In [None]:
# DuckDuckGo Search Engine scraping based on a query (Name + Phone)

scraper = SearchEngineScraper()

try:
    for i, row in filtered_dataframe.iterrows():

        query = f"{row['name']!s} {row['phone']!s}"

        try:
            website_link = scraper.duckduckgo_scrape(query)
        except IndexError:
            # The result page had no result link for this query: leave
            # 'website' as it is for this row and move on to the next one
            # instead of aborting the whole run.
            continue
        except Exception:
            # Anything else (browser/driver failure, ...) is reported and
            # stops the run. Unlike a bare `except:`, this does not trap
            # KeyboardInterrupt or SystemExit.
            traceback.print_exc()
            break
        else:
            filtered_dataframe.loc[i, 'website'] = website_link
finally:
    # Always shut the browser down, even when the loop stops early.
    scraper.driver.quit()

filtered_dataframe