In [1]:
import json
import logging
from pathlib import Path

import pandas
import requests

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas


In [2]:
# Route records at INFO level and above to logFile.log, prefixing each
# message with its timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    filename='logFile.log',
    encoding='utf-8',
)
logging.info('Started')

In [3]:
# Shared HTTP session reused for the product-listing page requests further down.
session = requests.Session()

In [4]:
# Listing endpoint for category 3363, requesting 1000 products per page.
# The page number is appended as an extra "&page=" query parameter at call time.
eyeglasses_base_url = (
    'https://api-gateway.juno.lenskart.com'
    '/v2/products/category/3363'
    '?page-size=1000'
)

In [15]:
#To scrape the product data from product page
def get_product_details(url: str, session=None, timeout: float = 30):
    """Fetch a product page and return its embedded product-detail JSON text.

    The page source is searched for the text that sits between the
    ``"productDetailData":`` marker and the next ``"cmsData":{`` marker.
    That slice, with trailing commas stripped, is returned unparsed.

    Args:
        url: Address of the product page to download.
        session: Optional object exposing ``get(url, timeout=...)`` (for
            example a ``requests.Session``) to reuse across calls. When
            omitted, a temporary ``requests.Session`` is created and closed
            before returning.
        timeout: Seconds passed to ``session.get`` so a stalled server
            cannot block the scrape forever.

    Returns:
        The raw JSON text as a ``str``, or ``None`` when the request fails
        or either marker is missing from the page.
    """
    start_pattern = '"productDetailData":'
    end_pattern = '"cmsData":{'

    # Only close sessions created here; a caller-supplied one stays open.
    owns_session = session is None
    try:
        if owns_session:
            session = requests.Session()
        res = session.get(url, timeout=timeout)
        source_page_content = res.text
    except Exception as e:
        # Best-effort scrape: report the failure instead of hiding it, and
        # let the caller carry on without the detail data.
        logging.warning("Error getting product data from %s: %s", url, e)
        return None
    finally:
        if owns_session and session is not None:
            session.close()

    start_index = source_page_content.find(start_pattern)
    if start_index == -1:
        return None

    end_index = source_page_content.find(end_pattern, start_index)
    if end_index == -1:
        return None

    # Drop the comma that separates this value from the next JSON key.
    return source_page_content[start_index + len(start_pattern):end_index].rstrip(',')

In [19]:
def add_response_to_list(res, df_list):
    """Flatten every product of a listing response and append it to ``df_list``.

    For each entry of ``result.product_list`` in the response JSON, a flat
    dict is built from the product's own fields plus:

    * one key per entry of ``prices`` (price name -> price),
    * one key per entry of ``hashtagList`` that has a ``property``
      (property -> name, or the string ``'NULL'`` when there is no name),
    * ``productName`` and one key per ``generalProductInfo`` entry
      (``nameEn`` -> ``valueEn``) taken from the product's detail page.

    Detail-page data is optional: if it cannot be fetched or parsed, the
    product is still appended with the listing fields only.

    Args:
        res: Response-like object whose ``text`` attribute holds the listing
            JSON.
        df_list: List that receives one dict per product (mutated in place).

    Raises:
        ValueError: If ``res.text`` is not valid JSON.
        KeyError: If the JSON has no ``result.product_list``.
    """
    res_json = json.loads(res.text)
    products_list = res_json["result"]["product_list"]

    for product in products_list:
        # Shallow copy so the parsed response is not modified while it is read.
        product_dict = dict(product)

        # flatten prices
        for price_details in product.get("prices") or []:
            product_dict[price_details["name"]] = price_details["price"]

        # flatten hashtags
        for hashtag_details in product.get("hashtagList") or []:
            if "property" in hashtag_details:
                product_dict[hashtag_details["property"]] = hashtag_details.get("name", 'NULL')

        # go to product url and get its details (fetched exactly once)
        product_url = product.get("product_url")
        product_details = get_product_details(product_url) if product_url else None
        logging.info(f"get product details called for {product.get('id')} with url {product_url}")

        if product_details is not None:
            try:
                product_details_json = json.loads(product_details)
            except ValueError as e:
                # A malformed snippet should cost one product's extras,
                # not the whole scrape.
                logging.warning(f"Could not parse product details for {product_url}: {e}")
                product_details_json = None

            if isinstance(product_details_json, dict):
                if "productName" in product_details_json:
                    product_dict["productName"] = product_details_json["productName"]
                # get each general product info in new col
                for product_info in product_details_json.get("generalProductInfo") or []:
                    product_dict[product_info["nameEn"]] = product_info["valueEn"]

        df_list.append(product_dict)

In [10]:
# Smoke test: fetch one product page and show the type of what comes back.
sample_product_details = get_product_details('https://www.lenskart.com/hooper-hp-e10077-c2-eyeglasses.html')
print(type(sample_product_details))

<class 'str'>


In [21]:
# Accumulates one flattened dict per scraped product; converted to a DataFrame later.
df_list = []

In [22]:
# Download listing pages 0-4, keep a raw copy of each response on disk, and
# flatten the products of every successful page into df_list.
for page_no in range(5):
    url = eyeglasses_base_url + f'&page={page_no}'
    file_path = f'./pages/product_list_pages/page{page_no}.json'

    # make get request to url
    page_response = session.get(url)
    logging.info(f"Response status for {url}: {page_response.status_code}")

    # write the raw response to a .json file; open() does not create missing
    # directories, so make sure the target folder exists first
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, 'wb') as curr_page_file:
        curr_page_file.write(page_response.content)

    logging.info(f"Response contents from {url} written to: {file_path}")

    # An error response has no product list to parse; record it and move on
    # instead of failing with an obscure JSON/KeyError further down.
    if not page_response.ok:
        logging.error(f"Skipping {url}: unexpected status {page_response.status_code}")
        continue

    add_response_to_list(page_response, df_list)

In [13]:
# Preview only the first few records; displaying the whole list bloats the notebook.
df_list[:3]

[{'id': '146012',
  'image_url': 'https://static5.lenskart.com/media/catalog/product/pro/1/thumbnail/628x301/9df78eab33525d08d6e5fb8d27136e95//l/i/transparent-gold-full-rim-rectangle-lenskart-air-essentials-la-e13517-c2-eyeglasses_csvfile-1695816787888-g_7549_0_image_pla.jpg',
  'image_url_suffix': '/l/i/transparent-gold-full-rim-rectangle-lenskart-air-essentials-la-e13517-c2-eyeglasses_csvfile-1695816787888-g_7549_0_image_pla.jpg',
  'imageUrls': [],
  'product_url': 'https://www.lenskart.com/lenskart-air-la-e13517-c2-eyeglasses.html',
  'color': 'Transparent',
  'size': 'Medium',
  'width': '134 mm',
  'brand_name': 'Lenskart Air',
  'brand_name_en': 'Lenskart Air',
  'brand_logo': 'https://static.lenskart.com/media/wysiwyg/blanco/images/air-logo-Hi.jpg',
  'model_name': 'LA E13517',
  'prices': [{'name': 'Market Price', 'currency_code': 'INR', 'price': 1700},
   {'name': 'Lenskart Price', 'currency_code': 'INR', 'price': 1700}],
  'is_tbyb': False,
  'tags': 'Air Essentials',
  'col

In [20]:
# Build one row per scraped product; keys missing from a product become NaN.
df = pandas.DataFrame(df_list)

eyeglasses_csv_path = './csvs/eyeglasses.csv'
# to_csv raises OSError when the parent directory is missing, so create it first.
Path(eyeglasses_csv_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(eyeglasses_csv_path)
logging.info(f"Product details of eyeglasses converted to data frame and csv - {eyeglasses_csv_path}")