## Scraping Category Links


In [2]:
import requests
from bs4 import BeautifulSoup
import json

from urllib.parse import urljoin

url = "https://foreignfortune.com"

# Always pass a timeout: requests waits indefinitely by default, so a stalled
# connection would otherwise hang the notebook.
response = requests.get(url, timeout=30)

# Maps each navigation label (category name) to its absolute collection URL.
category_dict = {}
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    # Restrict to anchors that actually carry an href; indexing a missing
    # attribute would raise KeyError.
    links = soup.select('#SiteNav li a[href]')
    for link in links:
        category_name = link.text.strip()
        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute hrefs untouched (string concatenation would
        # produce "https://foreignfortune.comhttps://..." for those).
        category_link = urljoin(url, link['href'])
        category_dict[category_name] = category_link

    print(category_dict)

else:
    print("Failed to retrieve the website.")


{'Men/Unisex': 'https://foreignfortune.com/collections/men-unisex', 'Women': 'https://foreignfortune.com/collections/women', 'Infant/Kid': 'https://foreignfortune.com/collections/kids', 'Coats/Hats': 'https://foreignfortune.com/collections/coats-hats', 'TrackSuits': 'https://foreignfortune.com/collections/small-logo-embroidery-t-shirts-1', 'Foreign Rovalf': 'https://foreignfortune.com/collections/frontpage', 'Accessories': 'https://foreignfortune.com/collections/foreign-accesories'}


In [3]:
def get_links_from_page(url):
    """Return the absolute product-page URLs listed on one collection page.

    Args:
        url: Full URL of a collection listing page (may include ``?page=N``).

    Returns:
        A list of absolute product URLs in page order. The list is empty when
        the page contains no product tiles or the request does not return
        HTTP 200 (in which case the status code is printed).
    """
    from urllib.parse import urljoin

    # Explicit timeout: requests would otherwise wait forever on a stalled
    # connection.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print("Failed to retrieve page:", response.status_code)
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.select('#Collection .grid__item.grid__item--collection-template .grid-view-item__link')

    link_urls = []
    for link in links:
        href = link.get('href')
        if not href:
            # A tile without an href cannot be followed; skip it instead of
            # raising KeyError and losing the whole page.
            continue
        # urljoin keeps already-absolute hrefs intact and resolves relative
        # ones against the site root, unlike plain string concatenation.
        link_urls.append(urljoin("https://foreignfortune.com", href))

    return link_urls

def get_all_links(base_url):
    """Collect product links from every page of a paginated collection.

    Pages are requested as ``{base_url}?page=1``, ``?page=2``, ... through
    ``get_links_from_page`` until a page contributes no new link.

    Args:
        base_url: Collection URL without a ``page`` query parameter.

    Returns:
        A list of product URLs in first-seen order, without duplicates.
    """
    all_links = []
    seen = set()
    page_number = 1

    while True:
        url = f"{base_url}?page={page_number}"

        # Keep only links not collected yet. Stopping when a page adds
        # nothing new covers both the normal case (an empty page past the
        # end) and a site that keeps serving a real listing for out-of-range
        # page numbers, which would otherwise make this loop run forever.
        fresh_links = []
        for link in get_links_from_page(url):
            if link not in seen:
                seen.add(link)
                fresh_links.append(link)

        if not fresh_links:
            break

        all_links.extend(fresh_links)
        page_number += 1

    return all_links

## Scraping PDP Links in Each Category


In [5]:
# Build a mapping of category name -> list of product-page (PDP) links by
# walking every paginated listing of each category found above.
category_wise_dict = {}
count = 0
for category_name, category_url in category_dict.items():
    count += 1
    category_wise_dict[category_name] = get_all_links(category_url)
print(category_wise_dict)
# print(count)
    

{'Men/Unisex': ['https://foreignfortune.com/collections/men-unisex/products/foreign-fortune-collection-joggers-1', 'https://foreignfortune.com/collections/men-unisex/products/foreign-rovalf-outfit', 'https://foreignfortune.com/collections/men-unisex/products/forign-luxury-tracksuits', 'https://foreignfortune.com/collections/men-unisex/products/ff-coats-w-hats', 'https://foreignfortune.com/collections/men-unisex/products/foreign-language-hoodie', 'https://foreignfortune.com/collections/men-unisex/products/detroit-foreign-everybody-tees', 'https://foreignfortune.com/collections/men-unisex/products/foreign-fortune-collection-short-sets', 'https://foreignfortune.com/collections/men-unisex/products/long-socks-muti-color', 'https://foreignfortune.com/collections/men-unisex/products/foreign-pattern-logo-socks', 'https://foreignfortune.com/collections/men-unisex/products/foreign-fortune-socks-1', 'https://foreignfortune.com/collections/men-unisex/products/long-foreign-socks', 'https://foreignf

## PDP Data Parsing

In [6]:
import requests
import re
import json
from bs4 import BeautifulSoup

def _https_url(src):
    """Prefix a scheme-less image path with ``https:``; None when ``src`` is missing."""
    if not src:
        return None
    if src.startswith(("http://", "https://")):
        # Already absolute; prefixing again would yield "https:https://...".
        return src
    return "https:" + src


def scrape_meta_json(url, category):
    """Scrape one product page and return its metadata as a plain dict.

    Product-level fields are read from the JSON embedded in the
    ``<script id="ProductJson-product-template">`` tag. Variant details come
    from the ``productVariants`` JSON array found in the raw page text and are
    enriched with option values from the product JSON by matching variant ids.

    Args:
        url: URL of the product page to fetch.
        category: Category label stored unchanged under ``"category"``.

    Returns:
        A dict with keys ``brand``, ``description``, ``image``, ``images``,
        ``variants``, ``original_price``, ``selling_price``, ``title``,
        ``url``, ``product_id`` and ``category``. ``variants`` is ``None``
        when the page has no ``productVariants`` array; an image value is
        ``None`` when the source has no image.
        Returns ``None`` when the product script tag is missing, and ``{}``
        when the request does not return HTTP 200 or any error occurs
        (the reason is printed in both cases).
    """
    try:
        # Explicit timeout: requests would otherwise wait forever on a stalled
        # connection.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to fetch the URL. Status code: {response.status_code}")
            return {}

        soup = BeautifulSoup(response.text, 'html.parser')
        script_tag = soup.find("script", {"id": "ProductJson-product-template"})
        if not script_tag:
            print("No script tag found with ID 'ProductJson-product-template'.")
            return None

        product_json = json.loads(script_tag.string)
        title = product_json.get('title')
        image = _https_url(product_json.get('featured_image'))
        images = [_https_url(img) for img in (product_json.get('images') or []) if img]
        # Prices are divided by 100 - presumably stored in minor units
        # (cents); confirm against the shop's data.
        original_price = product_json.get('price_max', 0) / 100
        selling_price = product_json.get('price', 0) / 100
        brand = product_json.get('vendor')
        description = product_json.get('description')
        product_id = product_json.get('handle')
        option_names = product_json.get('options') or []

        variant_list = None
        # Locate the start of the variants array, then let the JSON decoder
        # find its true end. A non-greedy regex capture stops at the first
        # "}]" and truncates the array whenever a variant contains a nested
        # list of objects.
        anchor = re.search(r'productVariants":\s*(?=\[{)', response.text)
        if anchor:
            product_variants, _ = json.JSONDecoder().raw_decode(response.text, anchor.end())

            # Index the product-JSON variants by id once instead of
            # rescanning the whole list for every page variant.
            variants_by_id = {
                int(product_variant['id']): product_variant
                for product_variant in (product_json.get('variants') or [])
            }

            variant_list = []
            for variant_value in product_variants:
                # A variant without its own image has a null "image"; that
                # must not discard the whole product.
                variant_image = variant_value.get('image') or {}
                variant_data = {
                    "id": variant_value['id'],
                    "price": variant_value['price']['amount'],
                    "variant_name": variant_value['title'],
                    "image": _https_url(variant_image.get('src')),
                }
                matched = variants_by_id.get(int(variant_value['id']))
                if matched is not None:
                    # Pair each option name (taken from the product JSON's
                    # "options" list) with this variant's value for it.
                    for option_name, option_value in zip(option_names, matched.get('options') or []):
                        variant_data[str(option_name)] = option_value
                variant_list.append(variant_data)

        final_json = {
            "brand": brand,
            "description": description,
            "image": image,
            "images": images,
            "variants": variant_list,
            "original_price": original_price,
            "selling_price": selling_price,
            "title": title,
            "url": "https://foreignfortune.com/products/" + product_id,
            "product_id": product_id,
            "category": category,
        }
        return final_json
    except Exception as e:
        # Deliberate best-effort: one bad page is reported and skipped so a
        # long crawl is not aborted.
        print(f"An error occurred: {str(e)}")
        return {}

# Sanity-check the parser against a single known product page before running
# the full crawl.
url = (
    "https://foreignfortune.com/collections/men-unisex/products/"
    "foreign-fortune-collection-joggers-1"
)
category = "Men/Unisex"
meta_data = scrape_meta_json(url, category)
print(meta_data)


{'brand': 'Foreign Fortune Clothing', 'description': 'Our Foreign Fortune Collection Joggers are great for the entire family. Very comfortable and versatile. They come in 5 different colors and can be worn during any season.', 'image': 'https://foreignfortune.com/cdn/shop/products/D30946DA-5D18-48D6-9890-63DC5DB36F77.jpg?v=1647614117', 'images': ['https://foreignfortune.com/cdn/shop/products/D30946DA-5D18-48D6-9890-63DC5DB36F77.jpg?v=1647614117', 'https://foreignfortune.com/cdn/shop/products/3EE2E126-721C-4DDE-A1EB-E3B355F2B674.jpg?v=1647614117', 'https://foreignfortune.com/cdn/shop/products/C8134772-8681-4EF4-810A-970DCBD13F28.jpg?v=1647614117', 'https://foreignfortune.com/cdn/shop/products/827B72E9-79B6-4401-AE76-29F3AFDA483F.jpg?v=1647614117', 'https://foreignfortune.com/cdn/shop/products/DD1E038F-07C5-410A-A846-A50187C8929E.jpg?v=1647614114', 'https://foreignfortune.com/cdn/shop/products/F2960363-A13C-427B-9E1C-9E5D79EC15D1.jpg?v=1647609938'], 'variants': [{'id': '41228637077697', 

# Scraping PDP Data and Storing in Final JSON

In [8]:
import time
import random
# Scrape every product page of every category and collect the results.
final_list = []

for category_data, product_links in category_wise_dict.items():
    print(category_data)
    for category_links in product_links:
        json_response = scrape_meta_json(category_links, category_data)
        # scrape_meta_json reports a failed page as None or {}; keep those
        # placeholders out of the exported data set.
        if json_response:
            final_list.append(json_response)
        # Random 1-3 second pause between requests to avoid hammering the site.
        time.sleep(random.uniform(1, 3))

# Explicit encoding so the output file does not depend on the platform default.
with open('foreignfortune_pdp_data.json', 'w', encoding='utf-8') as outfile:
    json.dump(final_list, outfile)

        

Men/Unisex
Women
Infant/Kid
Coats/Hats
TrackSuits
Foreign Rovalf
Accessories
