# Google Search Engine Scrape using SERP API

In [1]:
# Importing Libraries

import numpy as np
import pandas as pd
import traceback
from serpapi import GoogleSearch

In [3]:
# Filtering Dataframe that do not have a website URL but do contain
# State, Profession and Place type information

dataframe = pd.read_csv("input_data.csv",usecols=["infogroup_id","name","place_type","phone","street","city","labels.state","website","labels.primary_sic_code_id","labels.location_parent_relationship"])

dataframe = dataframe[dataframe['website'].isna()]

filtered_dataframe = dataframe[(~dataframe['labels.state'].isna()) & (~dataframe['labels.primary_sic_code_id'].isna()) & (~dataframe['place_type'].isna())]

filtered_dataframe.reset_index(inplace = True, drop = True)

filtered_dataframe.head()

Unnamed: 0,infogroup_id,place_type,name,street,city,labels.state,labels.location_parent_relationship,labels.primary_sic_code_id,phone,website
0,538104035,independent,Wachter's Chair Caning & Furniture,615 Gear St,Galena,Illinois,Primary Business,Chair Caning,(815) 777-0945,
1,718638652,independent,Always There Homecare-the Palm,,Atlantis,Florida,Primary Business,Health Services,(561) 570-6129,
2,742117988,individual,"O'Neill, Christopher",2790 N Academy Blvd,Colorado Springs,Colorado,Professional Individual,Social Workers,(719) 444-0250,
3,746285930,independent,Secuepay Inc,30 Pecan Course Loop,Ocala,Florida,Primary Business,Nonclassified Establishments,,
4,713083515,independent,Dandy Handyman Service,255 Wavetree Dr,Roswell,Georgia,Primary Business,Miscellaneous Personal Services NEC,(770) 518-7677,


In [4]:
# Selecting a random sample of length 50 out of the filtered dataframe 
# to test it on SERP API

filtered_dataframe = filtered_dataframe[filtered_dataframe['place_type'] == 'individual'].sample(50)

In [5]:
filtered_dataframe = filtered_dataframe[filtered_dataframe.name.str.contains('.*,.*', regex= True, na=False)]
filtered_dataframe

Unnamed: 0,infogroup_id,place_type,name,street,city,labels.state,labels.location_parent_relationship,labels.primary_sic_code_id,phone,website
5704,741974114,individual,"Bangura, Doris",,Hurst,Texas,Primary Business,Nurses-Practitioners,(817) 690-5032,
39813,760835559,individual,"Stevelinck, Sydney",1011 Horizon Dr,Lyons,Colorado,Primary Business,Physical Therapists,(586) 569-1378,
1624,725724002,individual,"Schenne, Jennifer B DO",7700 E Florentine Rd,Prescott Valley,Arizona,Professional Individual,Physicians & Surgeons,(928) 442-8710,
60783,769685384,individual,"Pinto, Nasstasijia, Noelia",78A N Boundary Rd,Jamaica,New York,Professional Individual,Nurses-Practitioners,(718) 656-1245,
31416,756815222,individual,"Kremer, Alexander",1300 Industrial Blvd,Southampton,Pennsylvania,Primary Business,Acupuncture,(267) 753-5337,
73341,744404996,individual,"Gunderson, Kristen",,Orlando,Florida,Primary Business,Physical Therapists,(281) 299-7505,
6087,739477242,individual,"Gallagher, Seema",3518 Highway 153,Greenville,South Carolina,Primary Business,Pharmacists,(864) 587-9486,
56189,753887537,individual,"Schumacher, Ross MD",1720 2nd Ave S,Birmingham,Alabama,Primary Business,Physicians & Surgeons,(205) 996-5864,
44796,752750768,individual,"Amipara, Ravindrakum",3201 Malcolm X Blvd,Dallas,Texas,Primary Business,Pharmacists,(214) 516-4871,
32337,764223262,individual,"Katkov, William, Michael",3420 Kenyon St,San Diego,California,Primary Business,Social Workers,(619) 221-6393,


In [8]:
# Scraper Class which scrapes search engine 
# to fetch website URL based on the given query
# using SERP API

class SearchEngineScraper:
    
    def __init__(self):
        
        self.searchParams = {
            "engine": "google",
            "q": "",
            "location": "",
            "google_domain": "google.com",
            "gl": "us",
            "hl": "en",
            "api_key": "69190367cce22584a14d6cfe1de75811df9bfa1bab44e4a655b53ffe3eb7a279"
        }
        

    def google_scrape(self, query: str, location: str) -> str:
        
        self.searchParams['q'] = query
        self.searchParams['location'] = location

        client = GoogleSearch(self.searchParams)
        results = client.get_dict()
        organic_results = results["organic_results"]
        
        return organic_results[0]['link']


In [9]:

scraper = SearchEngineScraper()

for i,row in filtered_dataframe.iterrows():
    
    query = row['name'] + " " +  row['labels.primary_sic_code_id'] + " " + 'Linkedin'
    location = row['city']
    
    try:
        linkedin_profile_link = scraper.google_scrape(query,location)
        print(linkedin_profile_link)
    except:
        traceback.print_exc()
        break
    

https://serpapi.com/search
https://www.linkedin.com/in/doris-bangura-a52926a8
https://serpapi.com/search
https://www.linkedin.com/in/sydney-stevelinck
https://serpapi.com/search
https://www.linkedin.com/in/jennifer-to-4175b913b
https://serpapi.com/search
https://www.linkedin.com/company/nurse-practitioner
https://serpapi.com/search
https://www.linkedin.com/in/alexander-sasha-kremer-92aa7aa2
https://serpapi.com/search


Traceback (most recent call last):
  File "C:\Users\apidap\AppData\Local\Temp\ipykernel_24508\240206763.py", line 9, in <cell line: 3>
    linkedin_profile_link = scraper.google_scrape(query,location)
  File "C:\Users\apidap\AppData\Local\Temp\ipykernel_24508\3443309870.py", line 26, in google_scrape
    results = client.get_dict()
  File "C:\Users\apidap\AppData\Local\Programs\Python\Python39\lib\site-packages\serpapi\serp_api_client.py", line 103, in get_dict
    return self.get_dictionary()
  File "C:\Users\apidap\AppData\Local\Programs\Python\Python39\lib\site-packages\serpapi\serp_api_client.py", line 96, in get_dictionary
    return dict(self.get_json())
  File "C:\Users\apidap\AppData\Local\Programs\Python\Python39\lib\site-packages\serpapi\serp_api_client.py", line 83, in get_json
    return json.loads(self.get_results())
  File "C:\Users\apidap\AppData\Local\Programs\Python\Python39\lib\site-packages\serpapi\serp_api_client.py", line 70, in get_results
    return self.get_resp