In [31]:
import html
import math
import re

import bs4
import requests
from bs4 import BeautifulSoup

In [32]:
# Request headers passed as `headers=` to the requests calls in this notebook.
# The User-Agent identifies the client as desktop Chrome on 64-bit Windows.
_USER_AGENT_PARTS = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/139.0.0.0 Safari/537.36",
)
HEADER = {"User-Agent": " ".join(_USER_AGENT_PARTS)}

# Site root; relative page names (e.g. "makers.php3") are appended to it.
BASE_URL = "https://www.gsmarena.com/"

In [35]:
# Download the page that lists every brand ("makers").
all_brand_url = BASE_URL + "makers.php3"

# A timeout (seconds) keeps the cell from hanging forever if the server never
# answers; requests has no default timeout.
brand_page = requests.get(url=all_brand_url, headers=HEADER, timeout=30)
brand_page.close()
# Fail loudly on 4xx/5xx so an error or rate-limit page is never parsed as if
# it were the brand list.
brand_page.raise_for_status()

In [None]:
# Parse the downloaded makers page with Python's built-in "html.parser" and
# dump the indented markup so the page structure can be inspected by eye.
brand_soup = BeautifulSoup(brand_page.text, "html.parser")
pretty_markup = brand_soup.prettify()
print(pretty_markup)

# Inspection Result
- all the brands are stored in div class="st-text" within a table
- brands are stored as rows of two columns
- each brand is stored in a cell (td tag)
- each cell contains information such as:
    - link to brand page
    - name of the brand
    - quantity (the number shown next to the brand name)

In [None]:
# Build `all_brands`: brand name -> {"url": relative brand page, "quantity": int}.
# The brand cells are read from the first <table> of the makers page.
table_elements = brand_soup.find_all("table")[0]
td_elements = table_elements.find_all("td")

# Applied to the serialized markup of one <td>: group 1 is the text that sits
# right before <br/>, group 2 is the leading run of digits inside the <span>.
PATTERN = r">([^<]+)<br/><span>(\d+).+</span>"

all_brands = {}

for elm in td_elements:
    if not isinstance(elm, bs4.element.Tag):
        continue
    elm_match = re.search(PATTERN, str(elm))

    if elm_match is None:
        print(f"Error: couldnt find match:\n{elm}")
        # Skip the cell; without this the next lines would crash on
        # `elm_match.groups()` with an AttributeError.
        continue

    link = elm.find("a").attrs["href"]
    brand, quantity = elm_match.groups()
    # The name was captured from serialized HTML, so entities are still
    # escaped (e.g. "AT&amp;T"); decode them to get the real brand name.
    brand = html.unescape(brand)

    all_brands[brand] = {"url": link, "quantity": int(quantity)}
    print(f"{brand}: {link=}, {quantity=}")


Acer: link='acer-phones-59.php', quantity='104'
alcatel: link='alcatel-phones-5.php', quantity='413'
Allview: link='allview-phones-88.php', quantity='157'
Amazon: link='amazon-phones-76.php', quantity='25'
Amoi: link='amoi-phones-28.php', quantity='47'
Apple: link='apple-phones-48.php', quantity='138'
Archos: link='archos-phones-90.php', quantity='43'
Asus: link='asus-phones-46.php', quantity='207'
AT&amp;T: link='at&t-phones-57.php', quantity='4'
Benefon: link='benefon-phones-15.php', quantity='9'
BenQ: link='benq-phones-31.php', quantity='35'
BenQ-Siemens: link='benq_siemens-phones-42.php', quantity='28'
Bird: link='bird-phones-34.php', quantity='61'
BlackBerry: link='blackberry-phones-36.php', quantity='92'
Blackview: link='blackview-phones-116.php', quantity='99'
BLU: link='blu-phones-67.php', quantity='369'
Bosch: link='bosch-phones-10.php', quantity='10'
BQ: link='bq-phones-108.php', quantity='20'
Casio: link='casio-phones-77.php', quantity='5'
Cat: link='cat-phones-89.php', quan

In [None]:
# Download and dump one brand's phone-listing page for inspection.
# NOTE: `link` is the value last assigned by the brand-parsing loop in an
# earlier cell, so this inspects whichever brand that loop handled last.
brand_phone_url = BASE_URL + link

# A timeout (seconds) keeps the cell from hanging forever; requests has no
# default timeout.
phone_page = requests.get(brand_phone_url, headers=HEADER, timeout=30)
phone_page.close()
# Fail loudly on 4xx/5xx instead of parsing an error page.
phone_page.raise_for_status()

# Name the parser explicitly (same one used for the makers page): without it
# BeautifulSoup picks whichever parser is installed and emits a warning, so
# the parse tree could differ between environments.
phone_soup = BeautifulSoup(phone_page.text, features="html.parser")
print(phone_soup.prettify())

# Inspection Results
- Brand phone makes
    - contained as a-tags in the "makers" class div
    - contains the name of the make and link for each individual phone
- Brand Page navigation
    - contained as a-tags in the 'nav-pages' class div
    - contains pages + prev + next

In [65]:
# navigation buttons - div class="nav-pages"
# Prints the label and relative URL of every numbered page link.
nav_div = phone_soup.find("div", attrs={'class': 'nav-pages'})
# find() returns None when the page has no such div (presumably brands whose
# phones fit on a single page - TODO confirm); treat that as "no extra pages"
# instead of crashing on None.find_all.
nav_elm = nav_div.find_all("a") if nav_div is not None else []
for page_num in nav_elm:
    # Only numbered links are pages; the prev/next links are skipped.
    if page_num.text.isnumeric():
        # Single quotes inside the f-string keep this valid before Python 3.12.
        print(f"Page {page_num.text} - {page_num.attrs['href']}")

Page 2 - zte-phones-f-62-0-p2.php
Page 3 - zte-phones-f-62-0-p3.php
Page 4 - zte-phones-f-62-0-p4.php
Page 5 - zte-phones-f-62-0-p5.php
Page 6 - zte-phones-f-62-0-p6.php
Page 7 - zte-phones-f-62-0-p7.php
Page 8 - zte-phones-f-62-0-p8.php
Page 9 - zte-phones-f-62-0-p9.php


In [67]:
# all phones - div class="makers"
# Each <li> inside that div is one phone entry.
makers_div = phone_soup.find("div", class_="makers")
phone_elm = makers_div.find_all("li")

# Preview only the first entry: its visible text and the relative URL of its
# link. `brand` is the value left over from the brand-parsing loop.
for model_elm in phone_elm[:1]:
    make = model_elm.get_text()
    url = model_elm.a["href"]
    print(f"{brand}: {make=}, {url=}")

ZTE: make='Blade A36', url='zte_blade_a36-13966.php'


In [None]:
# number of requests if only brands are looked at
# Starts at 1 (presumably the makers page itself), then adds the listing pages
# of every brand. The divisor 50 is the assumed number of phones per listing
# page - TODO confirm against the site.
num_pages = 1 + sum(
    math.ceil(brand_info["quantity"] / 50) for brand_info in all_brands.values()
)

print(num_pages)

351


In [71]:
# number of requests if brands and make data are extracted
total_req = 1 # maker page

for brand_info in all_brands.values():
    phone_count = brand_info["quantity"]
    # Listing pages for this brand, rounding up at 50 phones per page.
    listing_pages = math.ceil(phone_count / 50)
    # One request per listing page plus one per individual phone page.
    total_req += listing_pages + phone_count

print(total_req)


14341


# Plan
1. Go to the main page (makers.php3)
    - get all the brands and their urls
2. For all the brands and urls
    - go to page 1
        - get the list of all pages
    - get all makes from page and get their url
    - traverse all the pages