In [None]:
import os
import re

import pandas as pd
import requests
import tldextract # requires !pip install

In [None]:
# Install into the kernel's own environment (%pip, not !pip) and pin the version
# so a fresh run resolves the same release.
# NOTE(review): on a fresh environment this cell has to run before the import cell.
%pip install -q tldextract==5.3.0

Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.3.0


In [None]:
# Initialize dataframe for web scraping result
# Scrape from google and get the results
#
# The SerpApi key is read from the SERPAPI_API_KEY environment variable so the
# secret is never stored in the notebook.
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and rotated.
api_key = os.environ.get("SERPAPI_API_KEY")
if not api_key:
  raise RuntimeError(
      "SERPAPI_API_KEY is not set; export it before running this cell."
  )
query = "implementation of artificial intelligence"

# One dict per usable search hit. The DataFrame is built once after the loop
# instead of being re-copied with pd.concat for every row.
records = []

for start in range(0, 100, 10): #scrape amount
  params = {
      "engine": "google",
      "q": query,
      "start": start,
      "api_key": api_key
  }

  # A timeout keeps a stalled request from hanging the kernel, and
  # raise_for_status surfaces HTTP failures instead of silently yielding
  # an empty result page.
  response = requests.get("https://serpapi.com/search", params=params, timeout=30)
  response.raise_for_status()
  results = response.json().get("organic_results", [])
  if not results:
    # No organic results on this page: stop paging rather than spend
    # further API calls on later offsets.
    break

  # Collect rows
  for res in results:
    title = res.get("title")
    link = res.get("link")
    if title and link:
      records.append({
          'Company': title,
          'Domain': link,
          'Industry': '',
          'Keywords': '',
      })

# Passing `columns` keeps the expected schema even when nothing was scraped.
df = pd.DataFrame(records, columns=[
    'Company',
    'Domain',
    'Industry',
    'Keywords',
])


In [None]:
# Show the search results collected by the scraping cell, before any filtering
display(df)

Unnamed: 0,Company,Domain,EmployeeCount,NetWorth,Industry,Keywords
0,The Complete List of American Made Clothing Br...,https://toddshelton.com/blog/about-todd-shelto...,,,,
1,All American Clothing: All American Made Clothing,https://www.allamericanclothing.com/?srsltid=A...,,,,
2,The Normal Brand | Elevated clothing that fits...,https://thenormalbrand.com/?srsltid=AfmBOop3ME...,,,,
3,10 American Made Clothing Brands For Women (2025),https://www.thegoodtrade.com/features/american...,,,,
4,Category:Clothing brands of the United States,https://en.wikipedia.org/wiki/Category:Clothin...,,,,
...,...,...,...,...,...,...
94,100 Top Apparel Companies in United States · J...,https://www.f6s.com/companies/apparel/united-s...,,,,
95,"New York & Company | Women's Clothes: Dresses,...",https://www.nyandcompany.com/,,,,
96,Harvest & Mill | organic cotton clothing | gro...,https://harvestandmill.com/?srsltid=AfmBOoppD6...,,,,
97,"Carhartt: Durable Workwear, Outdoor Apparel & ...",https://www.carhartt.com/,,,,


In [None]:
# Filter third party / aggregator websites
# Substrings that mark a result as a third-party / aggregator page rather than
# a company's own site. Every entry must be followed by a comma: adjacent
# string literals are concatenated by Python, which would silently merge two
# indicators into one that never matches.
third_party_indicator = [
    "top", "best", "leading", "directory", "review", "compare", "list",
    "ranking", "companies", "agencies", "firms", "vendors", "providers",
    "expert", "consultant", "outsource", "services", "evaluations", "insights",
    "buyers-guide", "blog", "wikipedia", "how", "developers", "linkedin", "work", "year",
    "country", "what", "where", "who", "why", "when", "guide", "news", "research", "report", "insight",
    "magazine", "travel", "looking", "fandom", "category", "website", "journal",
    "group", "agency", "paper", "article", "instagram", "facebook", "site", "tiktok", "video",
    "youtube", "reddit",
]

def company_website(domain, company):
  """Return True when neither argument contains a third-party indicator.

  Both arguments are converted to lower-case strings and each entry of
  ``third_party_indicator`` is looked up in them as a plain substring.
  Returns False as soon as any indicator occurs in either string.
  """
  haystacks = (str(domain).lower(), str(company).lower())
  return not any(
      marker in text
      for marker in third_party_indicator
      for text in haystacks
  )

# Keep only rows that look like a company's own site.
# The arguments are passed by keyword so the page title and the URL cannot end
# up in each other's parameter. The mask is an explicit boolean Series aligned
# to df's index, so it stays well-defined when df is empty.
is_company_site = pd.Series(
    [
        company_website(domain=url, company=title)
        for title, url in zip(df['Company'], df['Domain'])
    ],
    index=df.index,
    dtype=bool,
)

# Filter out third-party rows, then duplicate titles, then duplicate URLs
# (first occurrence wins), and renumber the rows once at the end.
df = (
    df[is_company_site]
    .drop_duplicates(subset=['Company'], keep='first')
    .drop_duplicates(subset=['Domain'], keep='first')
    .reset_index(drop=True)
)

In [None]:
# Show the rows that remain after the third-party filter and de-duplication
display(df)

Unnamed: 0,Company,Domain,EmployeeCount,NetWorth,Industry,Keywords
0,All American Clothing: All American Made Clothing,https://www.allamericanclothing.com/?srsltid=A...,,,,
1,The Normal Brand | Elevated clothing that fits...,https://thenormalbrand.com/?srsltid=AfmBOop3ME...,,,,
2,10 American Made Clothing Brands For Women (2025),https://www.thegoodtrade.com/features/american...,,,,
3,All USA Clothing | Union and American Made Clo...,https://allusaclothing.com/?srsltid=AfmBOoq3L0...,,,,
4,American Made Clothing : r/madeinusa,https://www.reddit.com/r/madeinusa/comments/17...,,,,
5,American Giant: American Made Clothing & Activ...,https://www.american-giant.com/?srsltid=AfmBOo...,,,,
6,"Women's, Men's and Kids' Clothing & Accessorie...",https://www.uniqlo.com/us/en/?srsltid=AfmBOoqj...,,,,
7,Reformation: Sustainable Women's Clothing and ...,https://www.thereformation.com/?srsltid=AfmBOo...,,,,
8,Talbots: Women's Clothing & Apparel,https://www.talbots.com/?srsltid=AfmBOoo4pabi8...,,,,
9,boohoo USA | Womens and Mens Clothes | Shop On...,https://us.boohoo.com/?srsltid=AfmBOoo8Rsx9j3I...,,,,


In [None]:
# Replace site Titles with exact company name for better presentation
def get_company_name(url):
  """Return the capitalized domain label extracted from ``url``.

  The URL is parsed with ``tldextract.extract`` and its ``domain`` part is
  returned after ``str.capitalize`` (first letter upper-case, rest lower-case).

  Returns None when ``url`` is not a string (e.g. None or NaN) or when the
  extraction fails.
  """
  # Missing values arrive as None/NaN; there is nothing to parse in that case.
  if not isinstance(url, str):
    return None
  try:
    return tldextract.extract(url).domain.capitalize()
  except Exception:
    # Best effort: one unparsable URL should not abort the whole column, but
    # KeyboardInterrupt/SystemExit must still propagate (hence no bare except).
    return None

# Derive the company label from each URL and store it in the Company column
company_labels = df['Domain'].map(get_company_name)
df['Company'] = company_labels

# Strip the query string (everything from the first '?') from each URL
query_string_pattern = r'\?.*'
df['Domain'] = df['Domain'].str.replace(query_string_pattern, '', regex=True)


In [None]:
# Show the frame after the Company and Domain columns were rewritten
display(df)

Unnamed: 0,Company,Domain,Industry,Keywords
0,Nejm,https://ai.nejm.org/doi/full/10.1056/AIp2400223,,
1,Jmir,https://www.jmir.org/2019/7/e13659/,,
2,Synergise-ai,https://www.synergise-ai.com/resources/everyth...,,
3,Aasmr,https://www.aasmr.org/jsms/Vol9/2019no.3.9.pdf,,
4,Techtarget,https://www.techtarget.com/searchenterpriseai/...,,
5,Uschamber,https://www.uschamber.com/chambers-of-commerce...,,
6,Rtslabs,https://rtslabs.com/12-steps-to-implement-ai-s...,,
7,Defense,https://media.defense.gov/2022/Jun/22/20030226...,,
8,Mdpi,https://www.mdpi.com/2673-2688/1/2/11,,
9,Marines,https://www.marines.mil/Portals/1/Publications...,,


<a class="total-mcap-link" href="/total-marketcap/">$118.283 T</a>
Requesting: https://companiesmarketcap.com/total-marketcap/
Status: 200
<a class="total-mcap-link" href="/total-marketcap/">$118.283 T</a>
Requesting: https://companiesmarketcap.com/total-marketcap/
Status: 200
<a class="total-mcap-link" href="/total-marketcap/">$118.283 T</a>
Requesting: https://companiesmarketcap.com/total-marketcap/
Status: 200
<a class="total-mcap-link" href="/total-marketcap/">$118.283 T</a>
Requesting: https://companiesmarketcap.com/total-marketcap/
Status: 200
<a class="total-mcap-link" href="/total-marketcap/">$118.283 T</a>
Requesting: https://companiesmarketcap.com/total-marketcap/
Status: 200
<a class="total-mcap-link" href="/total-marketcap/">$118.283 T</a>
Requesting: https://companiesmarketcap.com/total-marketcap/
Status: 200
<a class="total-mcap-link" href="/total-marketcap/">$118.283 T</a>
Requesting: https://companiesmarketcap.com/total-marketcap/
Status: 200
<a class="total-mcap-link" 

Unnamed: 0,Company,Domain,Stock,Industry,Keywords
0,Booking,https://www.booking.com/city/sg/singapore.html,,,
1,Yotel,https://www.yotel.com/en/hotels/yotel-singapore,,,
2,Ritzcarlton,https://www.ritzcarlton.com/en/hotels/sinrz-th...,,,
3,Msocial,https://www.msocial.com/en/singapore/m-social-...,,,
4,Marinabaysands,https://www.marinabaysands.com/,,,
5,Marriott,https://www.marriott.com/en-us/hotels/sinxr-th...,,,
6,Fullertonhotels,https://www.fullertonhotels.com/fullerton-hote...,,,
7,Raffles,https://www.raffles.com/singapore/,,,
8,Marriott,https://www.marriott.com/en-us/hotels/sinjw-jw...,,,
9,Expedia,https://www.expedia.com/Destinations-In-Singap...,,,
