# About
* **Author**: Adil Rashitov
* **Created at**: 21.06.2021
* **Goal**: perform a test web scraping of [118 direct](http://www.118.direct)
* **Deliverable**: Web scraper for 118 direct

In [1]:
# Imports / Configs / Global vars

# Import of native python tools
import os
import json
from functools import reduce

# Import of base ML stack libs
import numpy as np
import sklearn as sc

# Multiprocessing for Mac / Linux
import platform
platform.system()
if platform.system() == 'Darwin':
    from multiprocess import Pool
else:
    from multiprocessing import Pool

# Visualization libraries
import plotly.express as px

# Logging configuration
import logging
logging.basicConfig(format='[ %(asctime)s ][ %(levelname)s ]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Ipython configs
from IPython.core.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell
display(HTML("<style>.container { width:100% !important; }</style>"))
InteractiveShell.ast_node_interactivity = 'all'

# Pandas configs
import pandas as pd
import geopandas as gpd
pd.options.display.max_rows = 350
pd.options.display.max_columns = 250

# Jupyter configs
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False


import requests
from bs4 import BeautifulSoup
import re
import time
import sqlalchemy


# GLOBAL VARS
from dotenv import load_dotenv
load_dotenv('./.rds_endpoint')

True

# Main

In [57]:
# Configuration for the task 3 pipeline.
# NOTE(review): this URI embeds the database user and password in plain
# text -- consider loading it from the environment instead of the notebook.
postgres_uri = 'postgresql://AdilRashitov:AdilRashitov@localhost:5433/companies_contact_details'
# Table the business categories are selected from.
source_stage_table = 'DIRECT_118_BUSINESS_CATEGORIES'
# Locations that are combined with every business category.
locations = ['Manchester', 'Liverpool']
# Destination table name, passed to task3 as target_stage_table.
target_stage_table = 'DIRECT_118_LOCATION_CATEGORY_URLS'
# Prefix to which Task3.update_url appends the 'what=...&where=...' query.
base_url = 'http://www.118.direct/listing/search?'


class Task3:
    """Pipeline steps that build search URLs per category/location pair.

    Every step is a static method that receives the shared ``args`` dict,
    reads the keys it needs, stores its result under ``args['data']`` (where
    applicable) and returns the same dict so the steps can be chained with
    ``args = step(args)``.
    """

    @staticmethod
    def read_categories(args: dict) -> dict:
        """Load the source table into ``args['data']``.

        Runs ``args['select_statement']`` through
        ``args['postgres_connection']`` with ``pandas.read_sql``. An empty
        result is only logged; it is not treated as an error.

        Returns:
            The same ``args`` dict with ``'data'`` set to the query result.
        """
        import pandas as pd

        args['data'] = pd.read_sql(args['select_statement'],
                                   con=args['postgres_connection'])
        if args['data'].shape[0] == 0:
            logging.info("Nothing to read")
        return args

    @staticmethod
    def cartesian_product_location_and_category(args: dict) -> dict:
        """Replace ``args['data']`` with every (category, location) pair.

        Categories come from the ``business_category`` column of
        ``args['data']``; locations come from ``args['locations']``.

        Returns:
            The same ``args`` dict with ``'data'`` set to a DataFrame with
            the columns ``what`` (category) and ``where`` (location).
        """
        from itertools import product
        # Imported locally, like read_categories does, so the step does not
        # depend on a module-level ``pd`` being defined.
        import pandas as pd

        # 1. Extract arguments
        categories = args['data']['business_category']
        locations = args['locations']

        # 2. Cartesian product
        pairs = list(product(categories, locations))
        args['data'] = pd.DataFrame(pairs, columns=['what', 'where'])

        return args

    @staticmethod
    def update_url(args: dict) -> dict:
        """Add a ``url`` column built from the ``what``/``where`` columns.

        The query string is percent-encoded: category names can contain
        characters such as ``&`` or spaces, which would otherwise split or
        corrupt the ``what``/``where`` parameters.

        Returns:
            The same ``args`` dict; ``args['data']`` gains the ``url`` column.
        """
        from urllib.parse import urlencode

        df = args['data']
        base = args['http_base_url']
        df['url'] = [
            base + urlencode({'what': what, 'where': where})
            for what, where in zip(df['what'], df['where'])
        ]
        args['data'] = df
        return args

    @staticmethod
    def export_table(args: dict) -> dict:
        """Write ``args['data']`` to the table ``args['target_stage_table']``.

        An existing table of that name is replaced. When there are no rows
        nothing is written and an error is logged instead.

        Returns:
            The same ``args`` dict, so the step can be chained like the others.
        """
        if args['data'].shape[0] > 0:
            args['data'].to_sql(args['target_stage_table'],
                                con=args['postgres_connection'],
                                index=False, if_exists='replace')
        else:
            logging.error("Nothing to export")
        # Return the state: the caller rebinds ``args`` to each step's result.
        return args

def task3(postgres_uri: str, source_stage_table: str,
          locations: list, target_stage_table: str,
          http_base_url: str) -> dict:
    """Read the business categories and pair each one with every location.

    Args:
        postgres_uri: Connection URI handed to ``sqlalchemy.create_engine``.
        source_stage_table: Table in the ``public`` schema to select from.
        locations: Locations to combine with every business category.
        target_stage_table: Destination table name; stored in the state dict
            under ``'target_stage_table'`` for ``Task3.export_table``.
        http_base_url: URL prefix stored in the state dict under
            ``'http_base_url'`` for ``Task3.update_url``.

    Returns:
        The state dict after the last step, with the resulting DataFrame
        under ``'data'``.
    """
    import logging
    import sqlalchemy

    args = {
        'postgres_connection': sqlalchemy.create_engine(postgres_uri),
        # The table name is interpolated as a quoted identifier; it comes
        # from notebook configuration, not from user input.
        'select_statement': f'SELECT * FROM public."{source_stage_table}"',
        'locations': locations,
        # Was silently dropped before: Task3.export_table reads this key.
        'target_stage_table': target_stage_table,
        'http_base_url': http_base_url,
    }

    # NOTE: Task3.update_url and Task3.export_table are not part of the run;
    # only the read and cartesian-product steps are executed.
    task_steps = [
        Task3.read_categories,
        Task3.cartesian_product_location_and_category,
    ]

    for step in task_steps:
        logging.info("Start %s", step.__name__)
        args = step(args)
        logging.info("Finish %s\n", step.__name__)
    return args


In [58]:
# Run the task 3 pipeline with the configuration variables assigned earlier
# in the notebook; the returned state dict is kept in `_`.
_ = task3(
    postgres_uri=postgres_uri,
    source_stage_table=source_stage_table,
    locations=locations,
    target_stage_table=target_stage_table,
    http_base_url=base_url
)

[ 08/13/2021 10:15:19 PM ][ INFO ]: Start read_categories
[ 08/13/2021 10:15:19 PM ][ INFO ]: Finish read_categories

[ 08/13/2021 10:15:19 PM ][ INFO ]: Start cartesian_product_location_and_category
[ 08/13/2021 10:15:19 PM ][ INFO ]: Finish cartesian_product_location_and_category



In [56]:
_

{'postgres_connection': Engine(postgresql://AdilRashitov:***@localhost:5433/companies_contact_details),
 'select_statement': 'SELECT * FROM public."DIRECT_118_BUSINESS_CATEGORIES"',
 'locations': ['Manchester', 'Liverpool'],
 'http_base_url': 'http://www.118.direct/listing/search?',
 'data':                              popular_searches  \
 0     http://www.118.direct/popularsearches/a   
 1     http://www.118.direct/popularsearches/a   
 2     http://www.118.direct/popularsearches/a   
 3     http://www.118.direct/popularsearches/a   
 4     http://www.118.direct/popularsearches/a   
 ...                                       ...   
 1092  http://www.118.direct/popularsearches/w   
 1093  http://www.118.direct/popularsearches/w   
 1094  http://www.118.direct/popularsearches/w   
 1095  http://www.118.direct/popularsearches/y   
 1096  http://www.118.direct/popularsearches/z   
 
                       business_category                                  href  
 0                       

In [12]:
# Scratch cell: show the business_category column of `df` as a list
# (`df` is assigned outside this excerpt).
list(df['business_category'])

0                               Abattoirs
1                  Access Control Systems
2                        Access Equipment
3                             Accountants
4       Accounting & Bookkeeping Services
                      ...                
1092                             Worktops
1093      Workwear & Protective Equipment
1094                     Wrought Ironwork
1095                                 Yoga
1096                                 Zoos
Name: business_category, Length: 1097, dtype: object

In [17]:
# Scratch cell: pair every business category with the placeholder values
# 'a' and 'b' to try out itertools.product.
import itertools
list(itertools.product(list(df['business_category']), ['a', 'b']))
 

2194