In [76]:
from selenium import webdriver
import time
from tqdm import tqdm
import numpy as np
import pandas as pd
import re
import os

In [2]:
# Start one Edge browser session; the first-site cells below reuse this `driver`.
driver = webdriver.Edge()

In [3]:
# Listing page that the first scrape starts from.
url = 'https://desibazaar.pl/en/13-foods-grocery'

In [4]:
# Load the listing page in the browser session.
driver.get(url)

In [5]:
# CSS selectors for the first site. `product_sel` matches one product card;
# the `prod_*_sel` selectors are looked up inside each matched card by
# collect_data().
product_sel = 'div.item-product.product_per_4.col-xs-12.col-sm-6.col-md-6.col-lg-4.col-xl-3'
prod_image_sel = '.first-image'
prod_flag_sel = 'ul.product-flag'
prod_name_sel = 'div.product_desc h3 a.product_name'
prod_price_sel = 'div.product_desc span.price'

In [6]:
# Notebook-level accumulator: collect_data() appends one dict per product here.
DATA = []

In [7]:
def collect_data(n_pages=34):
    """Scrape product cards from consecutive listing pages into ``DATA``.

    Relies on notebook-level state: the ``driver`` session (expected to be
    showing the first listing page already), the ``product_sel`` and
    ``prod_*_sel`` CSS selectors, and the ``DATA`` list.

    For every product card on the current page one dict with the keys
    ``prod_image_url``, ``prod_flag``, ``prod_name`` and ``prod_price`` is
    appended to ``DATA``. A field whose element cannot be read is stored as
    ``np.nan``. After each page the pagination link is clicked and the
    function sleeps three seconds before reading the next page.

    Args:
        n_pages: Number of listing pages to process. Defaults to 34, the
            value that used to be hard-coded.

    Returns:
        None. Results accumulate in the global ``DATA`` list.
    """
    # Function-scope import keeps the cell self-contained; selenium is already
    # a dependency of this notebook.
    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    next_btn_sel = '#js-product-list > nav > ul > li:nth-child(6) > a'

    def read_field(card, selector, attribute=None):
        """Return a child element's text (or attribute); np.nan if unavailable."""
        try:
            element = card.find_element(by='css selector', value=selector)
            return element.get_attribute(attribute) if attribute else element.text
        except WebDriverException:
            # Only browser/DOM failures mean "field missing". Unlike a bare
            # `except:`, this lets KeyboardInterrupt and programming errors
            # (e.g. a misspelled selector variable) surface.
            return np.nan

    for _ in tqdm(range(n_pages), total=n_pages, desc='Processing...', colour='green'):
        for product in driver.find_elements(by='css selector', value=product_sel):
            DATA.append({
                'prod_image_url': read_field(product, prod_image_sel, attribute='src'),
                'prod_flag': read_field(product, prod_flag_sel),
                'prod_name': read_field(product, prod_name_sel),
                'prod_price': read_field(product, prod_price_sel),
            })

        try:
            next_btn = driver.find_element(by='css selector', value=next_btn_sel)
        except NoSuchElementException:
            # No pagination link on this page: keep what was collected and
            # stop instead of crashing after the work is done.
            break
        next_btn.click()
        time.sleep(3)  # fixed wait for the next page to render

In [8]:
# Run the paginated scrape; rows accumulate in DATA (about four minutes in the recorded run).
collect_data()

Processing...: 100%|[32m███████████████████████████████████████████████████████████████████[0m| 34/34 [04:07<00:00,  7.29s/it][0m


In [13]:
# Build the first site's table straight from the list of scraped row dicts.
desi_data = pd.DataFrame(DATA)

In [14]:




# Switch the scraped column names to the upper-case export schema, in place.
desi_data.rename(
    {
        'prod_image_url': 'PROD_IMAGE_URL',
        'prod_flag': 'PROD_FLAG',
        'prod_name': 'PROD_NAME',
        'prod_price': 'PROD_PRICE',
    },
    axis='columns',
    inplace=True,
)

In [15]:
# Constant provenance columns attached to every row of the first site's table.
# NOTE(review): the second site's table further down reuses this exact
# STORE_NAME with a different STORE_ID (280) — confirm both values are intended.
desi_data['STORE_ID'] = 156
desi_data['STORE_NAME'] = 'Desi Bazar Lebensmittelmarkt (Bangladeshi, Indian and Asian Groceries)'
desi_data['TEAM_MEMBER'] = 'Faromika Ifeoluwa'

In [16]:
# Save the first site's table; index=False keeps the row index out of the CSV.
desi_data.to_csv('desi_bazaar.csv', index=False)

### Second Website

In [36]:
# Category labels for the second site (bazaar-foods.co.uk). collect_data_by_cat()
# turns each label into a collection URL slug.
# NOTE(review): in the recorded run 'Oils & Ghee' and 'Instant Mixes' returned
# 0 products — presumably their derived slugs do not match the site's
# collection URLs; verify.
categories = [
    'Lentil, Beans & Pulses',
    'Ready Meals & Mixes',
    'Frozen Foods',
    'Masala & Spices',
    'Dry Fruits & Nuts',
    'Chilled Foods',
    'Seeds & Grains',
    'Tin Foods',
    'Drinks & Beverages',
    'Powa & Mamra',
    'Sweets, Snacks & Savouries',
    'Pani Puri & Papads',
    'Oils & Ghee',
    'Jaggery & Sugars',
    'Instant Mixes',
    'Food Colours & Flavours',
    'Noodles & Vermicelli',
    'Cooking Salts & Powders'
]

In [92]:
def collect_data_by_cat(category):
    """Scrape one product collection of bazaar-foods.co.uk and save it as CSV.

    The category label is turned into a URL slug: every run of commas,
    whitespace, ``+``, ``&`` or hyphens becomes a single hyphen and the result
    is lower-cased, e.g. ``'Lentil, Beans & Pulses'`` -> ``'lentil-beans-pulses'``.
    The collection page is opened in a dedicated Edge session, which is always
    closed again. Every ``div.product-block__inner`` element present after a
    four-second wait is read, and the rows are written to
    ``DESI_BAZAAR_2/<slug with underscores>.csv`` with an extra ``category``
    column holding the slug. The folder is created if missing.

    Args:
        category: Human-readable category label.

    Returns:
        list[dict]: One dict per product block with the keys ``img_url`` (the
        ``href`` of the block's ``div.inner > a`` link), ``prod_name`` (text of
        the title/price element) and ``price``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a product block
            lacks one of the three expected child elements.

    Notes:
        Only product blocks present in the DOM after the initial wait are
        collected. NOTE(review): a selector previously used here referenced a
        ``use-infinite-scroll`` grid class and several recorded runs stop at
        exactly 50 products, so longer collections are presumably truncated —
        confirm.
    """
    out_dir = 'DESI_BAZAAR_2'
    # Selectors are loop-invariant, so they are defined once up front.
    img_url_sel = 'div.inner > a'
    prod_name_sel = '.product-block__title-price'
    price_sel = 'span.amount.theme-money'

    # One pass does what replace-then-collapse did: any run of separators
    # (or existing hyphens) becomes a single hyphen.
    category = re.sub(r'[,\s+&-]+', '-', category).lower()
    url = f"https://bazaar-foods.co.uk/collections/{category}"

    rows = []
    driver = webdriver.Edge()
    try:
        driver.get(url)
        time.sleep(4)  # fixed wait for the product grid to render

        prod_blocks = driver.find_elements(by='css selector', value='div.product-block__inner')
        for prod in tqdm(prod_blocks, colour='green'):
            rows.append({
                'img_url': prod.find_element(by='css selector', value=img_url_sel).get_attribute("href"),
                'prod_name': prod.find_element(by='css selector', value=prod_name_sel).text,
                'price': prod.find_element(by='css selector', value=price_sel).text,
            })
    finally:
        # Each call opens its own browser; close it even when scraping fails
        # so sessions do not pile up across categories.
        driver.quit()

    df = pd.DataFrame(rows)
    df['category'] = category
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, f"{category.replace('-', '_')}.csv"), index=False)
    print('Done.......')
    return rows

In [73]:
# Scrape the first category. This rebinds the notebook-level DATA (until now
# the first site's rows) to this category's rows.
DATA = collect_data_by_cat(categories[0])

https://bazaar-foods.co.uk/collections/lentil-beans-pulses


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.12it/s]


In [74]:
DATA

[{'img_url': 'https://bazaar-foods.co.uk/collections/lentil-beans-pulses/products/heera-toor-dall-plain',
  'prod_name': 'Heera Toor Dall Plain\nFrom £2.19',
  'price': '£2.19'},
 {'img_url': 'https://bazaar-foods.co.uk/collections/lentil-beans-pulses/products/heera-moong-dall-washed',
  'prod_name': 'Heera Moong Dall Washed\nFrom £1.85 £5.69',
  'price': '£1.85'},
 {'img_url': 'https://bazaar-foods.co.uk/collections/lentil-beans-pulses/products/heera-chana-dall',
  'prod_name': 'Heera Chana Dall\nFrom £1.59',
  'price': '£1.59'},
 {'img_url': 'https://bazaar-foods.co.uk/collections/lentil-beans-pulses/products/east-end-moong-whole-beans',
  'prod_name': 'East End Moong Whole Beans\nFrom £1.69',
  'price': '£1.69'},
 {'img_url': 'https://bazaar-foods.co.uk/collections/lentil-beans-pulses/products/udaya-toor-dall',
  'prod_name': 'Udaya Toor Dall\nFrom £3.19',
  'price': '£3.19'},
 {'img_url': 'https://bazaar-foods.co.uk/collections/lentil-beans-pulses/products/heera-chick-peas',
  'pro

In [77]:
# Output folder for the per-category CSV files. exist_ok=True lets this cell
# be re-run without raising FileExistsError.
os.makedirs('DESI_BAZAAR_2', exist_ok=True)

In [85]:
# Table for the first category, tagged with a human-readable category label.
# NOTE(review): collect_data_by_cat() stores the URL slug (e.g.
# 'lentil-beans-pulses') in its `category` column, so this label uses a
# different format from the other categories' files — confirm which is wanted.
df = pd.DataFrame(DATA).assign(category='Lentils, Beans & Pulses')

In [88]:
# df

In [86]:
# Save the first category next to the other per-category files.
# NOTE(review): collect_data_by_cat() would name this category's file
# 'lentil_beans_pulses.csv' (no "s" after "lentil"); if both files exist the
# combined table gets this category twice — verify the folder contents.
df.to_csv(os.path.join('DESI_BAZAAR_2', 'lentils_beans_pulses.csv'), index=False)

In [95]:
# Scrape the categories from index 8 onwards; each call writes its own CSV.
# NOTE(review): categories[1:8] are not scraped by any cell shown here —
# presumably handled in earlier runs; confirm.
for category in categories[8:]:
    collect_data_by_cat(category)

100%|[32m██████████████████████████████████████████████████████████████████████████████████[0m| 50/50 [00:03<00:00, 14.09it/s][0m


Done.......


100%|[32m██████████████████████████████████████████████████████████████████████████████████[0m| 18/18 [00:01<00:00, 14.83it/s][0m


Done.......


100%|[32m██████████████████████████████████████████████████████████████████████████████████[0m| 50/50 [00:06<00:00,  7.59it/s][0m


Done.......


100%|[32m██████████████████████████████████████████████████████████████████████████████████[0m| 47/47 [00:03<00:00, 13.82it/s][0m


Done.......


0it [00:00, ?it/s]


Done.......


100%|[32m██████████████████████████████████████████████████████████████████████████████████[0m| 21/21 [00:01<00:00, 12.91it/s][0m


Done.......


0it [00:00, ?it/s]


Done.......


100%|[32m██████████████████████████████████████████████████████████████████████████████████[0m| 13/13 [00:00<00:00, 16.40it/s][0m


Done.......


100%|[32m██████████████████████████████████████████████████████████████████████████████████[0m| 14/14 [00:01<00:00, 13.33it/s][0m


Done.......


100%|[32m██████████████████████████████████████████████████████████████████████████████████[0m| 19/19 [00:01<00:00, 15.08it/s][0m


Done.......


In [97]:
# Preview only the first rows instead of dumping the whole frame into the output.
df.head(10)

Unnamed: 0,img_url,prod_name,price,category
0,https://bazaar-foods.co.uk/collections/lentil-...,Heera Toor Dall Plain\nFrom £2.19,£2.19,"Lentils, Beans & Pulses"
1,https://bazaar-foods.co.uk/collections/lentil-...,Heera Moong Dall Washed\nFrom £1.85 £5.69,£1.85,"Lentils, Beans & Pulses"
2,https://bazaar-foods.co.uk/collections/lentil-...,Heera Chana Dall\nFrom £1.59,£1.59,"Lentils, Beans & Pulses"
3,https://bazaar-foods.co.uk/collections/lentil-...,East End Moong Whole Beans\nFrom £1.69,£1.69,"Lentils, Beans & Pulses"
4,https://bazaar-foods.co.uk/collections/lentil-...,Udaya Toor Dall\nFrom £3.19,£3.19,"Lentils, Beans & Pulses"
5,https://bazaar-foods.co.uk/collections/lentil-...,Heera Chick Peas\nFrom £1.75,£1.75,"Lentils, Beans & Pulses"
6,https://bazaar-foods.co.uk/collections/lentil-...,East End Urid Dall Washed\nFrom £2.19,£2.19,"Lentils, Beans & Pulses"
7,https://bazaar-foods.co.uk/collections/lentil-...,East End Moong Dall Washed\nFrom £3.39,£3.39,"Lentils, Beans & Pulses"
8,https://bazaar-foods.co.uk/collections/lentil-...,East End Black Eye Beans\nFrom £1.89,£1.89,"Lentils, Beans & Pulses"
9,https://bazaar-foods.co.uk/collections/lentil-...,Udaya Roasted Gram Split\nFrom £1.69,£1.69,"Lentils, Beans & Pulses"


In [98]:
# Placeholder for the combined table; the next cell fills it from the
# per-category CSV files.
df_combined = pd.DataFrame()

In [102]:
# Read every per-category CSV once and concatenate in a single call.
# Re-running this cell rebuilds df_combined instead of appending the same rows
# again. Non-CSV entries in the folder (e.g. checkpoint directories) are
# skipped, and sorting makes the row order reproducible.
category_frames = [
    pd.read_csv(os.path.join('DESI_BAZAAR_2', file_name))
    for file_name in sorted(os.listdir('DESI_BAZAAR_2'))
    if file_name.lower().endswith('.csv')
]
df_combined = (
    pd.concat(category_frames, axis=0, ignore_index=True)
    if category_frames
    else pd.DataFrame()  # empty folder: keep an empty table, as before
)

In [103]:
# Sanity check: (rows, columns) of the combined table.
df_combined.shape

(595, 4)

In [105]:
# Count missing values per column of the combined table.
df_combined.isna().sum()

img_url      0
prod_name    0
price        0
category     0
dtype: int64

In [106]:
# Switch the combined table to the upper-case export schema, in place.
df_combined.rename(
    {
        'img_url': 'PROD_IMAGE_URL',
        'prod_name': 'PROD_NAME',
        'price': 'PROD_PRICE',
        'category': 'PROD_CATEGORY',
    },
    axis='columns',
    inplace=True,
)

In [107]:
# Constant provenance columns attached to every row of the second site's table.
# NOTE(review): STORE_NAME is identical to the one used for the first site
# (STORE_ID 156) although this data comes from bazaar-foods.co.uk — confirm
# the name and ID pairing.
df_combined['STORE_NAME'] = 'Desi Bazar Lebensmittelmarkt (Bangladeshi, Indian and Asian Groceries)'
df_combined['STORE_ID'] = 280
df_combined['TEAM_MEMBER'] = 'Faromika Ifeoluwa'

In [108]:
# Save the combined table. index=False matches the other exports in this
# notebook and keeps a stray unnamed index column out of the CSV.
df_combined.to_csv('desi_bazaar_2.csv', index=False)