<a href="https://colab.research.google.com/github/Echo9k/WebScrapping/blob/main/WebScrapping_Recursive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

In [None]:
#@title Install libraries
#@markdown use only in colab
# Python packages imported later in this notebook (w3lib.html, selenium.webdriver).
!pip install w3lib
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
# Install the chromedriver system package, then copy its binary into /usr/bin.
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
#@title Set up
#@markdown Loading dependencies...
import os
import re
import scipy
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime

# HTML
import requests
from bs4 import BeautifulSoup as bs
from requests.exceptions import HTTPError
from IPython.core.display import display, HTML
from urllib.parse import unquote
# import mechanize

# Selenium for JS support
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver

In [None]:
#@title Headless
# Chrome options for the headless driver; the three switches are applied in order.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

#@markdown As: headles_driver
# Module-level driver shared by get_attribute, page_loader and load_URLs.
headles_driver = webdriver.Chrome(
    'chromedriver',
    chrome_options=chrome_options,
)

In [None]:
#@title PhantomJS
#@markdown As: phantom_driver
from selenium import webdriver
# Download the PhantomJS 2.1.1 Linux build, unpack it and copy the binary into /usr/local/bin.
!wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
!tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2
!cp phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin
!ls -al
# Module-level driver shared by page_loader and load_URLs.
# NOTE(review): webdriver.PhantomJS is presumably only available in older Selenium releases — confirm the installed version.
phantom_driver = webdriver.PhantomJS()

# Definitions

In [None]:
#@title Regex
#@markdown Regex parameters and labels
# Most patterns below share one shape: a non-capturing alternation of label
# keywords, a negative lookahead that rejects an immediately following "&",
# and a capturing group for the character(s) right after the keyword.
brand_regex = r"(?:brand|brandname|vendor|manufacturer|product-brand)(?![&])(.)"
crumb_regex = r"(?:category|categories|category path|breadcrumbs|breadcrum|crumb|navbar|Product Category)(?![&])(.)"
# NOTE(review): upc_regex and model_regex are the same pattern, so the "UPC" and
# "Model Name" labels match identical text — confirm this is intended.
upc_regex = r"(?:sku|model|model id|model no|item number|itemid|article no|product number|style number|product id|item code|mfr no|data-product)(?![&])(.)"
model_regex = r"(?:sku|model|model id|model no|item number|itemid|article no|product number|style number|product id|item code|mfr no|data-product)(?![&])(.)"
# Quoted barcode keys (UPC / GTIN / EAN). Kept under its own name so it does not
# collide with mfr_regex below. It is not referenced by `labels` and has no
# capturing group, so `finder` returns None for it.
barcode_regex = r'(?:"UPC"|"GTIN"|"EAN"|"upc"|"upccode"|"product_upc"|"product:upc"|"gtin"|"ean"|"barcode")'
part_regex = r"(?:PN|P/N|part no|part number|part|part #|mpn)(?![&g])(...)"
color_regex = r"(?:color|color_name|shade|finish|shade description)(?![&])(.)"
size_regex = r"(?:selected size|available size|choose a size|product size|attribute pa size)(?![&])(.)"
mfr_regex = r"(?:manufacturer|mfr|mfg|manufacturer logo|manufacturer name|label|producer|fabricante|fabrikant|Hersteller)(?![&])(.)"
price_regex = r"(?:MSRP|MRP|Recommended Customer Price|USD MSRP|List Price|reseller price may vary)(?![&])(.)"
ct_regex = r"(?:count|pieces|ct|pc|combo|per pack|contains)(?![&])(.)"
pk_regex = r"(?:packs|packs of|pk|package|combo|carton|carton pack)(?![&])(.)"
description_regex = r"(?:Product Details|Specification|Tech specs|Technical specification|Details|see more features|Product Description|Description|About the product|ingredients|Where to use|How to use)(?![&])(.)"

# Human-readable attribute name -> pattern used to locate it in page text.
labels= {"Brand Name":brand_regex,
        "Category Name":crumb_regex,
        "Model Name":model_regex,
        "UPC":upc_regex,
        "Part Number":part_regex,
        "Color name":color_regex,
        "Size Name":size_regex,
        "Manufacturer Name":mfr_regex,
        "List Price":price_regex,
        "Item Count":ct_regex,
        "Item Package Quantity":pk_regex,
        "Product Description":description_regex}


In [None]:
#@title Functions
#@markdown * Search tag
#@markdown * Finder
def search_tag(tag, string):
    """Print the text that follows ``<tag>=`` in ``string``.

    Args:
        tag: Literal parameter name to look for (it is regex-escaped).
        string: Text to search, typically a URL.

    Returns:
        None. When ``<tag>=`` occurs, the segment after its first occurrence
        (up to the next occurrence, if any) is printed with ``%20`` replaced
        by a space. Nothing is printed when ``<tag>=`` is absent.
    """
    marker = tag + "="
    # Test for the full "tag=" marker: the tag alone may appear without a
    # value, in which case there is nothing after the split to report.
    if marker not in string:
        return
    split_1 = re.split(re.escape(marker), string)[1].replace('%20', ' ')
    print(f"{tag}: found in text\n"
          f"contains:{split_1}")

#@title Finder function
def finder(regex:str, text:str,*,
           look_before:int=10,
           look_ahead:int=250,
           extra_dots=1) -> "tuple[str, str] | None":
    """Locate the first match of ``regex`` that has a capturing group.

    The search is case-insensitive and multi-line. Every match visited is
    printed as ``Match: <n> <matched text>``.

    Args:
        regex: Pattern with at least one capturing group.
        text: Text to search.
        look_before: Characters of context kept before group 1.
        look_ahead: Characters of context kept after group 1.
        extra_dots: Trailing characters dropped from the matched text
            (the patterns in this notebook end with a one-character group).

    Returns:
        ``(label, context)`` for the first match, where ``label`` is the
        matched text without its last ``extra_dots`` characters and
        ``context`` is the slice of ``text`` around group 1. ``None`` when
        nothing matches or the pattern has no capturing group.
    """
    matches = re.finditer(regex, text, re.MULTILINE | re.IGNORECASE | re.UNICODE)

    for matchNum, match in enumerate(matches, start=1):
        print(f"Match: {matchNum} {match.group()}")

        if not match.groups():
            # No capturing group: there is no position to build context from.
            continue

        matched = match.group()
        # Slice by explicit length so extra_dots=0 keeps the whole match
        # (a plain [:-0] would return an empty string).
        label = matched[:max(0, len(matched) - extra_dots)]
        # Clamp at 0: a negative start would wrap around to the end of text.
        start = max(0, match.start(1) - look_before)
        stop = match.end(1) + look_ahead
        return label, match.string[start:stop]

    return None

def get_attribute(id, attribute):
    """Read one attribute of an element on the page loaded in ``headles_driver``.

    Args:
        id: DOM id of the element to locate.
        attribute: Name of the attribute to read.

    Returns:
        Whatever the element's ``get_attribute`` returns for ``attribute``.
    """
    # The generic find_element(by, value) form works on Selenium 3 and 4;
    # the find_element_by_* shortcuts were removed in Selenium 4.3.
    element = headles_driver.find_element("id", id)
    return element.get_attribute(attribute)
        
class page_loader:
    """One scraped page: its URL, parsed HTML and, optionally, browser drivers.

    Attributes are set in ``__init__``:
        url: URL as given.
        response: ``bs(...)`` parse of the body fetched with ``requests.get``.
        parsed_url: ``url`` with percent-escapes decoded.
        variant: ``np.squeeze`` of everything after the first ``=`` in ``url``
            (an empty array when the URL has no ``=``).
        headless_driver / phantom_driver: ``None`` until built.

    The drivers are the module-level ``headles_driver`` / ``phantom_driver``
    objects, shared by every instance, so they show whichever URL was
    loaded last.
    """
    def __init__(self, url, headless=False, phantom=False):
        self.url = url
        self.response = bs(requests.get(url).text)
        self.parsed_url = unquote(url)
        self.headless_driver = None
        self.phantom_driver = None
        self.variant = np.squeeze(re.findall(r"=(.*)", self.url))
        if headless:
            self.build_headless()
        if phantom:
            self.build_phantom()

    def get_headless_driver(self):
        """Load ``self.url`` in the shared headless Chrome driver and return it."""
        # driver.get() returns None, so hand back the driver itself.
        headles_driver.get(self.url)
        return headles_driver

    def get_phantom_driver(self):
        """Load ``self.url`` in the shared PhantomJS driver and return it."""
        phantom_driver.get(self.url)
        return phantom_driver

    def build_headless(self):
        """Navigate the headless driver to this page and keep it on the instance."""
        self.headless_driver = self.get_headless_driver()

    def build_phantom(self):
        """Navigate the PhantomJS driver to this page and keep it on the instance."""
        self.phantom_driver = self.get_phantom_driver()

# Recursive search

## Preliminaries

In [None]:
#@title Preliminaries

#@markdown * Import libraries
import json
from w3lib.html import replace_entities
import pandas as pd
from tqdm import tqdm
from copy import deepcopy
from google.colab import data_table

#@markdown * Functions
def load_URLs(URL):
    """Fetch ``URL`` three ways: plain HTTP, PhantomJS and headless Chrome.

    Args:
        URL: Address to load.

    Returns:
        ``(soup, phantom_driver, headles_driver)``: the parsed body of an HTTP
        GET, followed by the two shared module-level drivers after each has
        navigated to ``URL``.
    """
    r = requests.request('GET', URL)
    soup = bs(r.text)
    # driver.get() returns None; navigate first, then return the drivers.
    phantom_driver.get(URL)
    headles_driver.get(URL)
    return soup, phantom_driver, headles_driver

#@markdown * Retrieving data

#@markdown * List of useful columns
#["url","brand_name","manufacturer_name","product_description","model","color_name","item_package_quantity", "unit_count"]
all_columns = [
    "title",
    "bread_crumb1", "bread_crumb2", "bread_crumb3",
    "brand_name", "manufacturer_name",
    "model", "upc",
    "color_name", "size_name",
    "item_package_quantity", "part_number",
    "list_price", "unit_count",
    "product_description",
]
useful_cols =  ["url","brand_name","manufacturer_name","product_description","model","color_name","item_package_quantity", "unit_count"]#@param {type:"raw"}

file_name = "demeterfragrance.csv" #@param {type:"string"}
# usecols=None reads every column, so one call covers both form settings.
data = pd.read_csv(file_name, usecols=useful_cols)

#@markdown Data loaded as: data
## Split the URLs by whether they contain the text "variant".
has_variant = data.url.apply(lambda link: "variant" in link)
url_variant = data.url[has_variant]
not_variant = data.url[~has_variant]

print(f"Not variant URLs: {len(not_variant)}\n"
      f"URL with variant: {len(url_variant)}")

In [None]:
#@title Web scrapped df
#@markdown Create a new dataframe (df) to store the web scrapped values.
# One page_loader per URL in data.url, stored in a single "page" column.
# Each page_loader construction performs an HTTP GET (see page_loader.__init__),
# so this cell issues one request per row of `data`.
df = pd.DataFrame([page_loader(url) for url in data.url], columns=["page"])

## Retrieving the data

In [None]:
#@title Example page
#@markdown * page
#@markdown * response
# Pick one scraped page by its index label in df for manual inspection.
index =  50#@param {type:"integer"}
page = df.page[index]
print(f"url: {page.url}")
# Parsed HTML of the selected page (page_loader.response).
response = page.response

In [None]:
#@title Show df
drop_cols = None#@param {type:"raw"}
include_index = True #@param {type:"boolean"}

# Render df as an interactive Colab table, optionally hiding some columns.
if drop_cols is None:
    display(data_table.DataTable(df,include_index))
else:
    # Keyword form: pandas 2.0 and later reject a positional axis argument to drop().
    display(data_table.DataTable(df.drop(columns=drop_cols),include_index))

### Title

In [None]:
# def get_title(page):
#     try: 
#         return page.response.title.text
#     except:
#         print(f"error in {page.url}")
#         return np.nan

def get_title(page):
    """Return the ``div.prod_line_wrap`` elements of a page.

    Args:
        page: Object exposing ``response`` (with ``find_all``) and ``url``.

    Returns:
        The result of ``find_all`` on ``page.response``, or ``np.nan`` when
        the lookup raises; in that case the page URL is printed.
    """
    try:
        return page.response.find_all('div', {'class':"prod_line_wrap"})
    except Exception:
        # Exception, not a bare except, so KeyboardInterrupt still stops a long run.
        print(f"error in {page.url}")
        return np.nan

# Apply get_title to every page; the result is kept in `title` and is not
# written back to df (the assignment below is commented out).
title = df.page.apply(get_title)
# title = [o.response.find_all('h1')[0].text for o in df['page']]
# title = [o.response.title.text for o in df['page']]
# df['title'] = title

### Metadata from scripts

In [None]:
# Collect url / title / barcode for the product line whose form's data-scode
# equals the variant taken from the page URL.
prod_line_wrap = []
for page in df.page:
    variant = str(page.variant)
    divs = page.response.find_all('div', {'class':"prod_line_wrap"})

    for div in divs:
        scode = div.find_all('form')[0].attrs['data-scode']
        if scode != variant:
            # Not the product line this URL points at.
            continue
        title = div.find_all('input', {'name':'name'})[0]
        barcode = div.find_all('input', {'name':'code'})[0]
        prod_line_wrap.append(dict(url=page.url,
                                   title=title.attrs['value'],
                                   barcode=barcode.attrs['value']))

### brand

In [None]:
# Show the <script> elements of the current page that mention "barcode".
request = page.response
variant = np.squeeze(re.findall(r"=(.*)", page.url))
# page.response is already parsed; search it directly. Re-parsing its .text
# (tags stripped) would leave no <script> elements to find.
[script for script in request.find_all('script') if 'barcode' in script.text]

### barcode

In [None]:
def get_barcode(page):
    """Return the ``<script>`` elements that mention the page's variant and "barcode".

    Args:
        page: Object exposing ``url`` and an already parsed ``response``.

    Returns:
        List of script elements whose text contains both the variant value
        (everything after the first ``=`` in ``page.url``) and the word
        ``barcode``. When the URL has no ``=``, only ``barcode`` is required.
    """
    found = re.findall(r"=(.*)", page.url)
    # Keep the variant as a plain str so the `in` test below is a substring test.
    variant = found[0] if found else ""
    # Search the parsed document directly; its .text has the tags stripped.
    return [script for script in page.response.find_all('script')
            if variant in script.text and 'barcode' in script.text]
    

In [None]:
#@markdown variant_parsed
int_cast = True #@param {type:"boolean"}

# Everything after the first "=" of each decoded URL, as a list of matches per page.
df['variaant_parsed'] = [re.findall(r"=(.*)", i.parsed_url) for i in df['page']]
# Collapse each match list with np.squeeze.
# NOTE(review): the code below calls len() on the stored values, so it presumably
# relies on pandas storing plain strings / empty arrays here — confirm.
df['variaant_parsed'] = df['variaant_parsed'].map(lambda x: np.squeeze(x))


def variant_parsed_fixer(x):
    """Cast a parsed variant value to ``int``.

    Args:
        x: Value taken from the URL, typically a string or an empty array.

    Returns:
        ``int(x)``, or ``np.nan`` when ``x`` is empty or cannot be converted.
    """
    try:
        if len(x) == 0:
            return np.nan
    except TypeError:
        # Unsized values (e.g. an int) have no length; let int() decide.
        pass
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan
    

# With int_cast the column holds ints (np.nan where conversion fails);
# otherwise values are kept as parsed and empty ones become None.
if int_cast:
    df['variaant_parsed'] = df['variaant_parsed'].map(variant_parsed_fixer)
else:
    df['variaant_parsed'] = df['variaant_parsed'].map(lambda x: (x,None)[len(x)==0])

#### More extractions

In [None]:
#@markdown df with metadata extracted
# Flatten each product_meta entry into the four fields of interest; entries
# that are np.nan become rows of None.
meta_fields = ('id', 'public_title', 'name', 'sku')
test = []
for i in product_meta:
    if i is np.nan:
        test.append(dict.fromkeys(meta_fields))
    else:
        test.append({field: i[field] for field in meta_fields})

# Side-by-side concatenation: df columns first, then the flattened fields.
df_meta = pd.concat([df, pd.DataFrame(test)], axis=1)

In [None]:
#@markdown Breadcrumbs
# Text of the first div.breadcrumb of every page.
breadcrumb = [
    o.response.find_all('div', {'class':'breadcrumb'})[0].text
    for o in df['page']
]
# Split on "»" and drop the last piece of each trail.
breadcrumbs = [b.split('»')[:-1] for b in breadcrumb]
# Collapse runs of spaces, remove newlines and trailing whitespace.
single_space_breadcrumbs = [
    [
        re.sub(' +', ' ', each_breadcrumb).replace('\n','').rstrip()
        for each_breadcrumb in breadcrumb_list
    ]
    for breadcrumb_list in breadcrumbs
]

breadcrumb_1 = []
breadcrumb_2 = []
breadcrumb_3 = []

for x in single_space_breadcrumbs:
    # Level 1 is required; levels 2 and 3 are None when the trail is shorter.
    breadcrumb_1.append(x[0])
    breadcrumb_2.append(x[1] if len(x) > 1 else None)
    breadcrumb_3.append(x[2] if len(x) > 2 else None)

df['bread_crumb1'] = breadcrumb_1
df['bread_crumb2'] = breadcrumb_2
df['bread_crumb3'] = breadcrumb_3

In [None]:
#@markdown Product Codes

# re.match('(?:Product Code)(...........)',code.text)

# NOTE(review): `codes` is only assigned in the commented-out line below; on a
# fresh kernel the loop raises NameError unless it is defined elsewhere — confirm.
# codes = [page.response.find_all('div', {'class':'description'}) for page in df.page]

product_codes = []
for code in codes:
    # Direct children of the first element in this page's result.
    child = [c for c in code[0].children]
    child_df = pd.DataFrame(child,columns=['val'])
    # Position of the child right after the first one containing "Product Code:".
    val = [i+1 for i, c in enumerate(child_df['val']) if 'Product Code:' in c][0]
    # That child's text, trimmed and split on commas.
    product_codes.append(child_df['val'][val].strip().split(','))

# Keep the code only when exactly one was listed; otherwise NaN.
df['model'] = pd.Series(map(lambda x: (np.nan, x[0])[len(x)==1], product_codes))

In [None]:
#@markdown package_size
# Digits that precede "-Pack" / "-pack" in each sku, as an int; None otherwise.
# NOTE(review): `sku` is added to df_meta, not df, in the visible cells — confirm df has this column.
pk = []
for text in df.sku:
    try:
        pk.append(int(re.search(r'(\d*)(?:-)[Pp](?:ack)', text).group(1)))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no match; TypeError: sku is not a string (e.g. NaN);
        # ValueError: matched, but with no digits in front.
        pk.append(None)
df['item_package_quantity'] = pk

In [None]:
#@markdown Description
def get_description(page):
    """Return the text of the first ``div[itemprop=description]`` of a page.

    Args:
        page: Object exposing a parsed ``response`` with ``find_all``.

    Returns:
        The element's text with every leading and trailing newline and
        non-breaking space removed, or the empty ``find_all`` result when the
        page has no such element.
    """
    description = page.response.find_all('div', attrs={'itemprop':'description'})
    if not description:
        return description
    # One strip over both characters removes runs of any length and order;
    # alternating single-character strips stop after a fixed number of passes.
    return description[0].text.strip('\n\xa0')

In [None]:
# Description of every page, each value passed through np.squeeze.
description = df.page.map(get_description).map(np.squeeze)

In [None]:
# Display the per-page descriptions; the result is not stored.
df.page.map(get_description)

In [None]:
# index_label=df.page.apply(lambda x: x.url)
# df.set_index(index_label,inplace=True)
# Export the selected columns of df to an Excel workbook.
# NOTE(review): 'id', 'public_title', 'name' and 'sku' are added to df_meta (not df)
# in the visible cells, and 'unit_count' is assigned in a later cell — confirm
# the intended frame and cell order.
df.to_excel('hi-hyperlite_reviewd.xlsx', 
          columns=['title', 'id', 'public_title', 'name', 'sku',
                   'item_package_quantity', 'unit_count','product_description'])

In [None]:
#@markdown unit_count
# Digits that precede "Pcs" / "pcs" in each sku, as an int; None otherwise.
# NOTE(review): `sku` is added to df_meta, not df, in the visible cells — confirm df has this column.
pcs = []
for text in df.sku:
    try:
        pcs.append(int(re.search(r'(\d*)[Pp](?:cs)', text).group(1)))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no match; TypeError: sku is not a string (e.g. NaN);
        # ValueError: matched, but with no digits in front.
        pcs.append(None)
df['unit_count'] = pcs

In [None]:
#@markdown Color
# Tentative color = the piece of the title right after the first " -".
# NOTE(review): df['title'] is only assigned in a commented-out line in the
# visible cells — confirm the column exists before running.
tentative_color = []

for t in df.title:
    try:
        # Index the split result, not the [title, parts] pair, so every row
        # is exactly [title, color].
        tentative_color.append([t, t.split(' -')[1]])
    except (AttributeError, IndexError):
        # AttributeError: title is not a string (e.g. NaN);
        # IndexError: the title has no " -" separator.
        tentative_color.append([t, None])
color = pd.DataFrame(tentative_color,
                     columns=['title', 'color'])
# data_table.DataTable(color)
df['color'] = color.color

#### Metadata

In [None]:
#@markdown Function:    **get_meta** <br>
#@markdown Column:      **metadata**
def get_meta(page):
    """Extract the object assigned to ``var meta`` in a page's scripts.

    Args:
        page: Object exposing a parsed ``response`` with ``find_all``.

    Returns:
        The decoded JSON value that follows ``var meta = `` in the last
        script containing it, or ``None`` when no script has such an
        assignment.

    Raises:
        json.JSONDecodeError: The text after ``var meta = `` is not valid JSON.
    """
    meta = None
    decoder = json.JSONDecoder()
    for s in page.response.find_all('script'):
        if 'var meta = ' not in s.text:
            continue
        payload = s.text.split('var meta = ', 1)[1].lstrip()
        # raw_decode reads exactly one JSON value and ignores what follows,
        # so semicolons inside string values do not truncate the payload.
        meta, _ = decoder.raw_decode(payload)
    return meta

# Parsed `var meta` object of every page.
df['metadata']=df.page.apply(get_meta)

In [None]:
#@markdown Product metadata: df[\'product_meta']
# For each row, the variant in metadata whose id equals the parsed variant;
# np.nan when it cannot be found.
product_meta = []
for i in range(len(df)):
    try:
        wanted_id = df['variaant_parsed'][i]
        product = [candidate for candidate in df.metadata[i]['product']['variants']
                   if candidate['id'] == wanted_id][0]
        product_meta.append(product)
    except (KeyError, IndexError, TypeError):
        # KeyError: missing key or label; IndexError: no variant with that id;
        # TypeError: metadata is not a mapping (e.g. NaN or None).
        product_meta.append(np.nan)

df['product_meta'] = product_meta

## Others

In [None]:
# Show the 'description' value of the first row.
# NOTE(review): needs `description` to hold records with a 'description' key, as
# built by the product-description loop in a later cell — confirm the cell order.
pd.DataFrame(description).loc[0,'description']

In [None]:
# Write the current `description` collection to description.xlsx.
pd.DataFrame(description).to_excel('description.xlsx')

In [None]:
# <div id="ProductPfidComponent_ProductName_3" class="ProductPfidComponent" data-block="ProductName">
# One {'url', 'description'} record per page: the text of the first
# div.product-description, or False when the page has none.
description = []
for page in df.page:
    response = page.response
    attrs = {'class':'product-description'}
    findings = response.find_all('div', attrs=attrs)
    description.append({
        'url' : page.url,
        'description' : findings[0].text.strip('\n') if findings else False,
    })

In [None]:
# Displays the bound method itself; it is referenced here, not called.
response.decode_contents

## json

In [None]:
# URL = data.url[0]
# text = requests.request('GET', URL).text
# base = 'data-product-sku="'
# reg = '(?='+base+')(.*)(?=")'
# found = finder(reg, text)

# found = [i for i in found]
# found = [i+';}' for i in found[1].split(';}')[:-1]]
# found = ''.join([str(n) for n in found])
# json.loads())

In [None]:
# Decode the text between the first "[" and the last "]" of `found` as JSON.
# NOTE(review): `found` is only assigned in the commented-out cell above — confirm where it comes from.
json.loads(found.split('[',1)[1].rsplit(']',1)[0])

In [None]:
#@markdown Find Attributes
# NOTE(review): `s`, `URL`, `driver` and `wd` are not assigned at notebook level
# in the visible cells — confirm they are defined before running this cell.
# finder(...)[0] is the matched "var meta = {...};" text without its last
# character; removing the prefix leaves the object literal for json.loads.
dc = json.loads(finder(r'(?:var meta = )(.*)(?:};)', s.text)[0].replace('var meta = ',''))
# Variant whose id equals the number after "variant=" in URL; when the URL has
# no such parameter, or no variant has that id, use the first variant.
try:
    variant_num = URL.split('variant=')[1]
    variant = [i for i in dc['product']['variants'] if i['id']==int(variant_num)][0]
except IndexError:
    variant = dc['product']['variants'][0]
#@markdown * title
title = driver.find_element_by_class_name("standard-single").text

#@markdown * Public_title  | _has the size values_
public_title = variant['public_title'] 

#@markdown * brand
# Context around the "brand" key: text between the first and second ":" with double quotes removed.
brand = finder(r"(?:\"brand\")(.)", s.text, look_ahead=20)[1].split(":")[1].replace('"','')

#@markdown * manufacturer
# Prefer the parsed metadata; on any failure fall back to searching the raw script text.
try:
    label = dc['product']['label']
except:
    label = finder(r"(?:\"label\")(.)", s.text, look_ahead=20)[1].split(":")[1].replace('"','')

#@markdown * Price
# Element with id "ProductPrice" (the element itself, not its text).
price = wd.find_element_by_id('ProductPrice')

# Phase 2 | Integration

In [None]:
# Index both frames by URL so they can be joined row by row.
# Guarded so the cell can be re-run: once 'url' is the index it is no longer
# a column and set_index would raise KeyError.
if 'url' in data.columns:
    data.set_index('url',inplace=True)
clean_responses = pd.read_excel('response.xlsx',index_col='url')

In [None]:
# Column names present in exactly one of the two frames.
set_cr, set_data = set(clean_responses.columns), set(data.columns)
set_cr ^ set_data

In [None]:
# Rename the scraped columns, in place, to the names used by the `data` columns list.
clean_responses.rename(columns={'price':'list_price',
                        'sku':'upc',
                        'description':'product_description',
                        }, inplace=True)

In [None]:
# Recompute the column sets after the rename, take shared columns from
# clean_responses and the remaining ones from data, then export in data's
# column order.
set_cr = set(clean_responses.columns)
set_data = set(data.columns)
data_only = data.drop(columns=set_cr & set_data)
data_clean = clean_responses.join(data_only)
data_clean[data.columns].to_csv('clean_data.csv')