In [2]:
import sys
import os

# In Jupyter, __file__ is not defined. Use the current working directory instead.
# The parent of the working directory is put on the import path so the
# project-local packages imported below resolve. The membership check keeps
# re-runs of this cell from appending the same entry again.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from util.fetch import fetch_url
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
# Site root; the site-relative sub-category hrefs collected later are appended to it to form full URLs.
base_url = "https://ssisports.net"

In [5]:
# Category index page; its sections are parsed into categories and sub-category links.
# Built from base_url so the site host is defined in exactly one place.
url = f"{base_url}/Categories/Index"

content = fetch_url(url)

Status Code: 200


In [6]:
# Parse the fetched index page using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(content, 'html.parser')

In [7]:
# Every element carrying the 'brand-section' class; each one is handled as a top-level category.
categories = soup.find_all(class_='brand-section')

# Last expression of the cell, so the count is echoed as the cell output.
len(categories)

16

In [8]:
# Flatten the category sections into one list of
# {"category", "sub_category", "url"} records, one per sub-category link.
category_list = []

for category in categories:
    # The section's <h1> holds the category name.
    heading = category.find('h1').text.strip()

    records = []
    for brand_node in category.find_all(class_='brand-section-brand'):
        link = brand_node.find('a')
        if not link:
            # No anchor inside this entry, so there is nothing to record.
            continue
        records.append(
            dict(
                category=heading,
                sub_category=link.text.strip(),
                url=link['href'],
            )
        )

    category_list += records
    print(f"Collected {len(records)} from {heading}")

Collected 8 from Apparel
Collected 6 from Archery
Collected 8 from Camping and Outdoor
Collected 9 from Fishing
Collected 3 from Gifts and Novelty
Collected 4 from Golf
Collected 0 from Houseware
Collected 7 from Hunting
Collected 6 from Knives and Tools
Collected 3 from Lights
Collected 4 from Marine and Water Sports
Collected 1 from Nightvision and Thermal
Collected 10 from Optics and Sights
Collected 3 from Public Safety and LE
Collected 31 from Shooting
Collected 2 from Sports


In [12]:
from util.file import save_json_to_dir, load_json_from_dir

In [11]:
# Persist the collected category records under the file name "ssi.json"
# (target directory is presumably resolved inside the helper — confirm).
save_json_to_dir(category_list, "ssi.json")

In [14]:
# Round-trip check: reload "ssi.json" and compare it with the in-memory list.
loaded=load_json_from_dir("ssi.json")

category_list == loaded

True

In [9]:
from scraper.ssi import scrape_ssi

In [10]:
# Scrape every sub-category listing page and keep one DataFrame per page.
# `dfs` is keyed by the 1-based position of the entry in `category_list`.
dfs = {}
for position, entry in enumerate(category_list, start=1):
    print(entry)  # progress trace: which listing is about to be fetched

    # Stored hrefs are site-relative (e.g. '/Categories/List/Bottoms').
    # Index with [] so a record lacking 'url' fails loudly instead of
    # silently requesting "<base_url>None".
    listing_url = f"{base_url}{entry['url']}"
    listing_df = scrape_ssi(listing_url)

    # NOTE(review): assumes scrape_ssi always returns a DataFrame — confirm
    # what it returns when a page fails to scrape.
    # assign() returns a new frame, leaving the scraper's result untouched.
    dfs[position] = listing_df.assign(
        category=entry['category'],
        # Names carry a trailing item count such as "Bottoms (1)";
        # keep only the text before the first "(".
        sub_category=entry['sub_category'].split('(')[0].strip(),
    )



{'category': 'Apparel', 'sub_category': 'Bottoms (1)', 'url': '/Categories/List/Bottoms'}
Status Code: 200
Product 1:
  Name: 5.11 Tactical Stryke Pant w/Flex-Tac -TDU Green  Size 32-32
  SKU: 1108232
{'category': 'Apparel', 'sub_category': 'Eyewear - Accessories (12)', 'url': '/Categories/List/Eyewear-Accessories'}
Status Code: 200
Product 1:
  Name: ESS Crosshair Replacement Lens Smoke Gray
  SKU: 1121604
Product 2:
  Name: ESS Nomex Heat Sleeve
  SKU: 1121609
Product 3:
  Name: ESS Crossbow Replacement Lens Hi-Def Yellow
  SKU: 1121597
Product 4:
  Name: ESS Eyewear Profile NVG Hi-Def Yelllow Rep Lens 740-0121
  SKU: 311926
Product 5:
  Name: ESS Eyewear ICE NARO Frame and Nosepiece Kit 740-0083
  SKU: 312107
Product 6:
  Name: ESS CrossBlade ONE Eyeshield Smoke Gray
  SKU: 1121612
Product 7:
  Name: ESS Eyewear Ice 2X Eyeshield Kit 740-0003
  SKU: 312428
Product 8:
  Name: ESS CDI Replacement Lens Smoke Gray
  SKU: 1121608
Product 9:
  Name: ESS Crossbow Gasket
  SKU: 1121593
ERRRO

In [11]:
# Sanity check of the container and element types; dfs keys start at 1, so dfs[1] is the first frame.
type(dfs),type(dfs[1])

(dict, pandas.core.frame.DataFrame)

In [12]:
# Stack all per-listing frames into one table with a fresh 0..n-1 index.
merged_df1 = pd.concat(list(dfs.values()), ignore_index=True)

# Only the last expression of a cell is echoed, so the row count has to be
# printed explicitly; a bare len(...) on its own line is silently discarded.
print(len(merged_df1))
merged_df1.head()

Unnamed: 0,SKU,Name,Description,Price,Stock,Link,category,sub_category
0,1108232,5.11 Tactical Stryke Pant w/Flex-Tac -TDU Gree...,"Durable, flexible professional cargo pants\nCo...",$79.99,1,https://ssisports.net/Products/346397,Apparel,Bottoms
1,1121604,ESS Crosshair Replacement Lens Smoke Gray,VLT: 15 percent\nLens tint: Smoke Grey\nInclud...,$20.00,18,https://ssisports.net/Products/439637,Apparel,Eyewear - Accessories
2,1121609,ESS Nomex Heat Sleeve,Heat resistant Nomex\nProtects goggle\nScratch...,$25.00,6,https://ssisports.net/Products/439642,Apparel,Eyewear - Accessories
3,1121597,ESS Crossbow Replacement Lens Hi-Def Yellow,Includes mirco fiber storage sleeve\nLens tint...,$30.00,6,https://ssisports.net/Products/439631,Apparel,Eyewear - Accessories
4,311926,ESS Eyewear Profile NVG Hi-Def Yelllow Rep Len...,Hi-Def Yellow Lens\n2.8mm\n100% UVA/UVB protec...,$22.00,5,https://ssisports.net/Products/73681,Apparel,Eyewear - Accessories


In [13]:
# Combine every scraped frame into the single table that is uploaded later;
# ignore_index renumbers the rows 0..n-1 across all source frames.
merged_df = pd.concat(dfs.values(), ignore_index=True)

# A bare len(...) that is not the cell's last expression produces no output,
# so print the row count explicitly before showing the preview.
print(len(merged_df))
merged_df.head()

Unnamed: 0,SKU,Name,Description,Price,Stock,Link,category,sub_category
0,1108232,5.11 Tactical Stryke Pant w/Flex-Tac -TDU Gree...,"Durable, flexible professional cargo pants\nCo...",$79.99,1,https://ssisports.net/Products/346397,Apparel,Bottoms
1,1121604,ESS Crosshair Replacement Lens Smoke Gray,VLT: 15 percent\nLens tint: Smoke Grey\nInclud...,$20.00,18,https://ssisports.net/Products/439637,Apparel,Eyewear - Accessories
2,1121609,ESS Nomex Heat Sleeve,Heat resistant Nomex\nProtects goggle\nScratch...,$25.00,6,https://ssisports.net/Products/439642,Apparel,Eyewear - Accessories
3,1121597,ESS Crossbow Replacement Lens Hi-Def Yellow,Includes mirco fiber storage sleeve\nLens tint...,$30.00,6,https://ssisports.net/Products/439631,Apparel,Eyewear - Accessories
4,311926,ESS Eyewear Profile NVG Hi-Def Yelllow Rep Len...,Hi-Def Yellow Lens\n2.8mm\n100% UVA/UVB protec...,$22.00,5,https://ssisports.net/Products/73681,Apparel,Eyewear - Accessories


In [18]:
from util.gsheet import update_sheet,list_available_spreadsheets,update_sheet_by_id


In [22]:
# Show the spreadsheets the helper can see; the ID printed here is the one used for the upload.
list_available_spreadsheets()

Available spreadsheets:
1. AutoProductList (ID: 1ZE7BeE26iM43x_ympaIgOdl6lXWrwXg-PZZAKuIBEIw)


[<Spreadsheet 'AutoProductList' id:1ZE7BeE26iM43x_ympaIgOdl6lXWrwXg-PZZAKuIBEIw>]

In [23]:
from util.gsheet import update_sheet_by_id

# ID of the "AutoProductList" spreadsheet, as reported by list_available_spreadsheets().
sheet_id = "1ZE7BeE26iM43x_ympaIgOdl6lXWrwXg-PZZAKuIBEIw"
# presumably the worksheet (tab) name inside that spreadsheet — confirm against update_sheet_by_id
sheet_name = "Sheet20"

# Push the merged product table to the sheet.
update_sheet_by_id(merged_df, sheet_id, sheet_name)

Data successfully uploaded to Google Sheets!


In [None]:
# for category in categories:
#     if "Shooting" in category.find('h1').text:
#         print(category.text)
#         break









In [None]:
# cat = category.find('h1').text.strip()
# sub_cats_html = category.find_all(class_='brand-section-brand')

# for sub_cat in sub_cats_html:
#     a_tag = sub_cat.find('a')
#     if a_tag:
#         sub_cat_name = a_tag.text.strip()
#         sub_cat_url = a_tag['href']
#         print(sub_cat_name, sub_cat_url)