## Scrapfly Implementation

In [22]:
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
from bs4 import BeautifulSoup
from datetime import date
from time import sleep
import requests, json, configparser, os, boto3, math
import asyncio

### Config files and objects

In [2]:
# Read config file (expected one directory above the current working directory)
cd = os.getcwd()
config_path = f'{cd}/../config.cfg'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check it so a missing file fails here with a
# clear message instead of as a NoSectionError on the first config.get().
if not config.read(config_path):
    raise FileNotFoundError(f'config file not found or unreadable: {config_path}')

# Get api urls to scrape
ACCOUNT_URL = 'https://api.scrapfly.io/account'
FIVERR_URL = 'https://www.fiverr.com'
CATEGORIES_URL = f'{FIVERR_URL}/categories'

# Get the API keys from the [scrapyFly] section of the config file
MAIN_KEY = config.get('scrapyFly', 'api_key')
SEC_KEY  = config.get('scrapyFly', 'api_sec')

### S3 and Scrapfly objects

In [3]:
# Date of this run; get_pages stores it on every scraped record
today = date.today()

# Scrapfly client created with MAIN_KEY
scrapfly = ScrapflyClient(key=MAIN_KEY)

# All S3 settings come from the [S3] section of the config file
bucket_name = config.get('S3', 'bucketName')
s3_client = boto3.client(
    's3',
    region_name=config.get('S3', 'region'),
    aws_access_key_id=config.get('S3', 'accessKeyId'),
    aws_secret_access_key=config.get('S3', 'secretAccessKey'),
)

### Functions

In [4]:
def account_info(key):
    """
    Get account info from Scrapfly API

    Args:
        key: Scrapfly API key, sent as the ``key`` query parameter.

    Returns:
        The decoded JSON body of the response from ACCOUNT_URL.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    # Pass the key through `params` so requests URL-encodes it, and set a
    # timeout so a stalled connection cannot hang the notebook indefinitely.
    response = requests.get(ACCOUNT_URL, params={'key': key}, timeout=30)
    # Fail loudly on HTTP errors instead of treating an error body as account data.
    response.raise_for_status()
    return response.json()

In [5]:
def get_remaining(key):
    """
    Return the remaining scrape count reported for an API key.

    Walks the account info returned by account_info(key) along
    subscription -> usage -> scrape -> remaining.
    """
    node = account_info(key)
    for field in ('subscription', 'usage', 'scrape', 'remaining'):
        node = node.get(field)
    return node

In [46]:
# Report the remaining scrape count for SEC_KEY first, then MAIN_KEY
for api_key in (SEC_KEY, MAIN_KEY):
    print('account remaining requests: ', get_remaining(api_key))

account remaining requests:  647
account remaining requests:  853


In [69]:
def get_soup(url):
    """
    Scrape `url` through the Scrapfly client and parse the returned HTML.

    Returns a BeautifulSoup tree built with the 'html.parser' backend.
    """
    # Named scrape_config so it does not shadow the module-level `config` parser
    scrape_config = ScrapeConfig(url=url)
    result = scrapfly.scrape(scrape_config=scrape_config).scrape_result
    return BeautifulSoup(result['content'], 'html.parser')

In [10]:
def get_data(url):
    """
    Scrape `url` and return the JSON embedded in its
    ``<script id="perseus-initial-props">`` element.

    Args:
        url: page to scrape via get_soup.

    Returns:
        The object decoded from the script's text with json.loads.

    Raises:
        ValueError: if the page has no ``script#perseus-initial-props``
            element, or its text is not valid JSON (json.JSONDecodeError
            is a ValueError subclass).
    """
    print('url: ', url)
    soup = get_soup(url)
    matches = soup.select('script#perseus-initial-props')
    # Without this check a page lacking the script fails with a bare
    # "list index out of range", which hides which URL was the problem.
    if not matches:
        raise ValueError(f'no script#perseus-initial-props element found at {url}')
    return json.loads(matches[0].text)

In [74]:
import concurrent.futures

def get_concurrent_data(url_list, max_workers=5):
    """
    get data collection from url list concurrently

    Args:
        url_list: iterable of URLs, each passed to get_data.
        max_workers: size of the thread pool (default 5, the previous
            hard-coded value).

    Returns:
        List of the results of the successful get_data calls. Results are
        appended in completion order, which may differ from the order of
        `url_list`. URLs whose scrape raised are reported on stdout and
        left out of the result.
    """
    data_collection = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its URL so failures can be reported by URL
        future_data = {executor.submit(get_data, url): url for url in url_list}
        for future in concurrent.futures.as_completed(future_data):
            url = future_data[future]
            try:
                data_collection.append(future.result())
            except Exception as exc:
                # Best effort: one failing URL must not abort the whole batch
                print(f'{url} generated an exception: {exc}')
    return data_collection

In [75]:
# Sample listing pages, all below the online-marketing category
_ONLINE_MARKETING = "https://www.fiverr.com/categories/online-marketing"
url_list = [
    f"{_ONLINE_MARKETING}/{path}"
    for path in (
        "affiliate-marketing/links-promotion",
        "affiliate-marketing/affiliate-program-management",
        "mobile-app-marketing",
        "mobile-app-marketing/app-store-optimization",
        "mobile-app-marketing/app-promotion",
        "music-promotion",
        "music-promotion/organic-music-promotion",
        "music-promotion/paid-advertising",
        "music-promotion/music-streaming-services",
        "music-promotion/music-playlists-placements",
    )
]

In [83]:
# Scrape the sample URLs defined above (an empty list would scrape nothing)
data_collection = get_concurrent_data(url_list)

In [86]:
# Only the last expression of a cell is displayed, so the count must be
# printed explicitly to appear alongside the records.
print(len(data_collection))
for data in data_collection:
    print(data)

In [8]:
def get_category(url):
    """
    Split a site-relative category href into its category parts.

    Args:
        url: href such as
            '/categories/graphics-design/game-art/character-design'.

    Returns:
        dict with keys 'url' (FIVERR_URL + url), 'category', 'subCategory'
        and 'nestedSubCategory'. When the path has no nested sub category,
        the sub category is reused as 'nestedSubCategory'.

    Raises:
        ValueError: if the path has no sub category segment.
    """
    # Split on the path only: drop any query string / fragment and a trailing
    # slash so they cannot leak into (or add an empty) category segment.
    path = url.split('?', 1)[0].split('#', 1)[0].rstrip('/')
    items = path.split('/')
    # Expected shape after the split: ['', 'categories', category, subCategory, ...]
    if len(items) < 4:
        raise ValueError(
            f'expected /categories/<category>/<subCategory>[/<nestedSubCategory>], got {url!r}'
        )
    if len(items) < 5:
        items.append(items[3])
    return {
        'url': FIVERR_URL + url,
        'category': items[2],
        'subCategory': items[3],
        'nestedSubCategory': items[4]
    }

In [9]:
def get_categories():
    """
    Scrape CATEGORIES_URL and return one get_category() dict for every
    'section > ul > li > a' link that has a non-empty href.
    """
    soup = get_soup(CATEGORIES_URL)
    hrefs = (anchor.get('href') for anchor in soup.select('section > ul > li > a'))
    return [get_category(href) for href in hrefs if href]

In [133]:
# Scrape the categories page and keep the parsed category dicts for the cells below
categories = get_categories()
categories

[{'url': 'https://www.fiverr.com/categories/graphics-design/creative-logo-design',
  'category': 'graphics-design',
  'subCategory': 'creative-logo-design',
  'nestedSubCategory': 'creative-logo-design'},
 {'url': 'https://www.fiverr.com/categories/graphics-design/brand-style-guides',
  'category': 'graphics-design',
  'subCategory': 'brand-style-guides',
  'nestedSubCategory': 'brand-style-guides'},
 {'url': 'https://www.fiverr.com/categories/graphics-design/game-art',
  'category': 'graphics-design',
  'subCategory': 'game-art',
  'nestedSubCategory': 'game-art'},
 {'url': 'https://www.fiverr.com/categories/graphics-design/game-art/character-design',
  'category': 'graphics-design',
  'subCategory': 'game-art',
  'nestedSubCategory': 'character-design'},
 {'url': 'https://www.fiverr.com/categories/graphics-design/game-art/props-objects-design',
  'category': 'graphics-design',
  'subCategory': 'game-art',
  'nestedSubCategory': 'props-objects-design'},
 {'url': 'https://www.fiverr.co

In [198]:
def save_json(data, key):
    """
    Upload `data` as a JSON document to the configured S3 bucket.

    Args:
        data: JSON-serializable object. Values json cannot encode natively
            (e.g. the datetime.date stored under 'date' by get_pages) are
            written as their str() form.
        key: S3 object key to write to.
    """
    s3_client.put_object(
        Bucket=bucket_name,
        Key=key,
        # default=str: json.dumps raises TypeError on date objects otherwise
        Body=json.dumps(data, default=str),
        ContentType='application/json'
    )

In [255]:
def get_pagination(data):
    """
    Read data['appData']['pagination'] and return a tuple
    (number of pages, offset), where the number of pages is
    ceil(total / page_size).
    """
    info = data.get('appData').get('pagination')
    page_count = math.ceil(info.get('total') / info.get('page_size'))
    return page_count, info.get('offset')

In [1]:
def get_pages(category, max_pages=2):
    """
    Scrape one category: its listing pages and every gig listed on them,
    saving the results to S3.

    The first listing page is saved under ``fiverr/categories/...`` and each
    gig page under ``fiverr/accounts/...``. Listing pages after the first are
    only used to collect their 'items'; they are not saved.

    Args:
        category: dict as produced by get_category, with keys 'url',
            'category', 'subCategory' and 'nestedSubCategory'.
        max_pages: maximum number of listing pages to read (default 2, the
            previous hard-coded cap). Pass None to read every page.

    Returns:
        The data dict of the last listing page that was fetched.
    """
    base_url = category.get("url")
    ref = 'seller_language%3Aen%7Cseller_location%3AUS'
    # Store the date as text: json.dumps cannot serialize a datetime.date,
    # and these dicts are passed to save_json.
    scrape_date = str(today)

    # get category data (first listing page)
    url = f'{base_url}?source=drop_down_filters&ref={ref}'
    category_data = get_data(url)
    category_data['url'] = url
    category_data['date'] = scrape_date

    # partitions in subcategories
    categoryName = category.get("category")
    subCategory = category.get("subCategory")
    nestedSubCategory = category.get("nestedSubCategory")
    partition = f'{categoryName}/{subCategory}/{nestedSubCategory}'

    # save category data to s3; str() keeps the join working for non-string ids
    categoryIds = category_data.get('categoryIds')
    title = '-'.join(str(value) for value in categoryIds.values())
    save_json(category_data, f'fiverr/categories/{partition}/{title}.json')

    # collect the services of every listing page, up to max_pages
    total, offset = get_pagination(category_data)
    if max_pages is not None:
        total = min(total, max_pages)

    services = []
    if total >= 1:
        # `or []` guards against a page without an 'items' entry
        services.extend(category_data.get('items') or [])
    for page in range(2, total + 1):
        url = f'{base_url}?source=pagination&ref={ref}&page={page}&offset={offset}'
        category_data = get_data(url)
        category_data['url'] = url
        category_data['date'] = scrape_date
        services.extend(category_data.get('items') or [])

    # save services files to s3
    for service in services:
        # get service info
        gig_url = FIVERR_URL + service.get('gig_url')
        gig_data = get_data(gig_url)
        gig_data['date'] = scrape_date
        # save to s3
        gig_id = gig_data.get("general").get("gigId")
        save_json(gig_data, f'fiverr/accounts/{partition}/{gig_id}.json')
        # pause between gig requests
        sleep(1)

    return category_data

### Testing

In [None]:
# Full run: call get_pages for every scraped category,
# pausing one second between categories.
for category in categories:
    get_pages(category)
    sleep(1)

In [233]:
# get_pages takes a single category dict (it calls category.get(...)),
# so index the list instead of slicing it.
pages = get_pages(categories[3])

graphics-design/game-art/character-design
https://www.fiverr.com/zestykale/make-you-a-minecraft-skin-from-scratch-or-reference
48 services


In [234]:
pages

[{'url': 'https://www.fiverr.com/categories/graphics-design/game-art/character-design',
  'category': 'graphics-design',
  'subCategory': 'game-art',
  'nestedSubCategory': 'character-design',
  'services': [{'gigId': 154369492,
    'pos': 0,
    'type': 'fixed_pricing',
    'is_fiverr_choice': False,
    'packages': {'recommended': {'id': 1,
      'extra_fast': False,
      'price': 5,
      'duration': 3,
      'type': 'cheapest'}},
    'sellerId': '87337744',
    'gigQueryParams': {'context_type': 'rating',
     'funnel': 'cf60e7faf5243925277bba1c7fef6d9d',
     'ref': 'seller_language:en|seller_location:US'},
    'impressionId': '89a6be51-f7b5-4ed5-984c-a4b60c59ccee',
    'gig_id': 154369492,
    'category_id': 3,
    'sub_category_id': 365,
    'nested_sub_category_id': 2249,
    'is_pro': False,
    'is_featured': False,
    'cached_slug': 'make-you-a-minecraft-skin-from-scratch-or-reference',
    'title': 'make you a minecraft skin from scratch or reference',
    'seller_name': 

In [178]:
# NOTE(review): this indexes `pages` as a list of dicts carrying a 'services'
# key, whereas get_pages as defined in this notebook returns a single page
# dict — presumably written against an earlier get_pages; confirm.
url = FIVERR_URL + pages[0].get('services')[1].get('gig_url')
url

'https://www.fiverr.com/logoflow/do-professional-and-unique-logo-design'

In [11]:
# Fetch the embedded page JSON for one category listing and display it
data = get_data('https://www.fiverr.com/categories/graphics-design/creative-logo-design')
data

url:  https://www.fiverr.com/categories/graphics-design/creative-logo-design


{'categoryIds': {'categoryId': '3', 'subCategoryId': '49'},
 'translationsService': {'general': {'locale': 'en-US'},
  'filters': {'locale': 'en-US'},
  'categories': {'locale': 'en-US'},
  'seo': {'locale': 'en-US'}},
 'staticFilters': {'active': {},
  'filters': [{'values': [{'anchorText': {'en-US': 'modern logo design'},
      'header': {'en-US': 'modern logo design'},
      'slug': 'categories/graphics-design/buy/creative-logo-design/modern',
      'uid': 'modern',
      'encodedFilters': 'style:modern',
      'type': 'style',
      'id': 'modern',
      'alias': {'en-US': 'modern logo design'},
      'active': False,
      'filtersTypes': []},
     {'anchorText': {'en-US': 'vintage logo design'},
      'header': {'en-US': 'vintage logo design'},
      'slug': 'categories/graphics-design/buy/creative-logo-design/vintage',
      'uid': 'vintage',
      'encodedFilters': 'style:retro',
      'type': 'style',
      'id': 'retro',
      'alias': {'en-US': 'vintage logo design'},
      

In [14]:
# Flatten three sections of the page data into one dict; on duplicate keys
# the later section wins.
new_dict = {}
for section in ('categoryIds', 'displayData', 'facets'):
    new_dict.update(data.get(section))
new_dict


{'categoryId': 3,
 'subCategoryId': '49',
 'currentUser': {},
 'header': {'video': {'id': 'Graphic_Design',
   'name': 'How fiverr works? - general',
   'src': 'https://fiverr-res.cloudinary.com/video/upload/t_fiverr_hd/whuihiqnze1wmjjsjgmt',
   'en_src': 'https://fiverr-res.cloudinary.com/video/upload/t_fiverr_hd_nl/v1/video-attachments/generic_asset/asset/ab0907217c9f9a2c1d2eee677beb7619-1626082923646/how_fiverr_works'},
  'hasVideoLink': False,
  'title': 'Logo Design',
  'subtitle': 'Stand out from the crowd with a logo that fits your brand personality.'},
 'proBanner': {},
 'seo_data': {'title': 'Logo Design',
  'faqs': [{'question': 'What is logo design?',
    'answer': 'Logo design is the art of creating a visually stunning mark for a brand or company. A logo usually consists of a symbol, brandmark, or image that represents or symbolizes the company. a logo should stand out and be easily recognized.'},
   {'question': 'What makes a good logo?',
    'answer': "A good logo should 

In [187]:
# NOTE(review): the output shown for this cell has 'general' / 'outOfOffice'
# keys rather than category-listing keys such as 'categoryIds' — presumably
# `data` was reassigned from a gig page in a cell not shown here; confirm
# before relying on run order.
data

{'general': {'gigId': 23197467,
  'gigStatus': 'approved',
  'categoryId': 3,
  'categoryName': 'Graphics & Design',
  'categorySlug': 'graphics-design',
  'subCategoryId': 49,
  'subCategoryName': 'Logo Design',
  'subCategorySlug': 'creative-logo-design',
  'nestedSubCategoryId': None,
  'nestedSubCategorySlug': None,
  'isOnVacation': False,
  'isBuyerBlocked': False,
  'isPro': False,
  'isHandpicked': False,
  'isStudio': False,
  'gigTitle': 'design 3 modern minimalist logo design',
  'encryptedGigId': 'FG3587DC6C1',
  'sellerId': 21449558,
  'traffiqed': False,
  'isSellerBlocked': False,
  'gigVisibleToSeller': True,
  'gigVisibleToBuyer': True,
  'isTrustedUser': False,
  'includeWorkSample': True},
 'outOfOffice': {'username': 'weperfectionist',
  'sellerId': 21449558,
  'isOnVacation': False,
  'endDate': None,
  'profilePhoto': 'https://fiverr-res.cloudinary.com/t_profile_original,q_auto,f_auto/attachments/profile/photo/ffc13a6326576eab1aa379f00ef48680-1629201228504/8ac708c

In [236]:
# Pick a single category dict to experiment with in the cells below
category = categories[4]
category

{'url': 'https://www.fiverr.com/categories/graphics-design/game-art/props-objects-design',
 'category': 'graphics-design',
 'subCategory': 'game-art',
 'nestedSubCategory': 'props-objects-design'}

In [237]:
# Fetch the listing data for the plain category URL; the filter query
# string is left commented out at the end of the first line.
category_url = category.get('url')# + '?source=drop_down_filters&ref=seller_language%3Aen%7Cseller_location%3AUS'
category_data = get_data(category_url)
category_data

{'categoryIds': {'categoryId': '3',
  'subCategoryId': '365',
  'nestedSubCategoryId': '2252'},
 'translationsService': {'general': {'locale': 'en-US'},
  'filters': {'locale': 'en-US'},
  'categories': {'locale': 'en-US'},
  'seo': {'locale': 'en-US'}},
 'staticFilters': {'active': {}, 'filters': []},
 'activeStaticFilterKeys': [],
 'bianka': {'serviceUrl': 'https://activity.fiverr.com/'},
 'userData': {'isRncUser': False},
 'isBusiness': False,
 'isBusinessUser': False,
 'flow': 'category',
 'activeFilters': {},
 'isV2Flow': False,
 'rollouts': {'cat_report_new_impressions': True,
  'zebras_discover_fib_modal': True,
  'zebras_back_to_fiverr': False,
  'bulls_promoted_gigs_banner_in_listings': True},
 'currency': {'name': 'EUR',
  'rate': 1.026701073,
  'template': '€{{amount}}',
  'forceRound': True,
  'forceRoundFromAmount': 1000,
  'symbol': '€'},
 'displayData': {'currentUser': {},
  'header': {'video': {'id': 'Graphic_Design',
    'name': 'How fiverr works? - general',
    'src'

In [238]:
# Join the category id values with '-' (the same expression get_pages uses
# to build the file title)
categoryIds = category_data.get('categoryIds')
'-'.join(categoryIds.values())

'3-365-2252'

In [226]:
# List the top-level keys of the listing data, then display its 'tracking' section
print(category_data.keys())
category_data.get('tracking')

dict_keys(['categoryIds', 'translationsService', 'staticFilters', 'activeStaticFilterKeys', 'bianka', 'userData', 'isBusiness', 'isBusinessUser', 'flow', 'activeFilters', 'isV2Flow', 'rollouts', 'currency', 'displayData', 'v2', 'dominateSubCategoryId', 'facets', 'priceBucketsSkeleton', 'requestContext', 'topBarSorting', 'listings', 'appData', 'tracking', 'shouldShowExpressDelivery', 'showMediumBucketsModalities', 'knownCrawler', 'userGuid', 'countryCode', 'assumedLanguage', 'subCategoryData', 'seoData', 'items', 'shouldAddLogoMakerBannerEnrichment', 'breadcrumbs', 'appFilters', 'dataLayerObject'])


{'pageName': 'Sub Category',
 'page': 1,
 'sort': 'rating',
 'filters': '',
 'categoryIds': {'categoryId': '3', 'subCategoryId': '363'},
 'numberOfResults': 399,
 'hasFiverrChoiceGigs': True,
 'fiverrChoiceGigPosition': 5,
 'promotedGigsCount': 0,
 'hasPromotedGigs': False,
 'localStorageData': {'entrySuffix': '|||sub_categories|||show|||undefined',
  'shouldStore': True},
 'reportData': {'group': 'algos',
  'type': 'listings_impression',
  'uid': '1659386728012-05d1c8e8-e5a1-4bb0-9ebd-44f9c3cbdab8',
  'listings': {'now_toggle_exists': False,
   'now_toggle_used': False,
   'pro_toggle_exists': False,
   'pro_toggle_used': False,
   'view_type': 'grid',
   'page_initial_language': 'en-US',
   'category_id': 3,
   'sub_category_id': 363,
   'page_ctx_id': '82684086a5af162c03b2c2a62b2679a8',
   'is_pro': False,
   'filter_tab': 'rating',
   'page_number': 1,
   'original_locale': 'en-US',
   'number_of_results': 399,
   'is_autocomplete': False,
   'context': 'sub_categories'},
  'page':

In [215]:
# Inspect the pagination section that get_pagination reads
appData = category_data.get('appData')
pagination = appData.get('pagination')
pagination

{'page': 1, 'offset': -4, 'page_size': 48, 'total': 399}

In [218]:
# Number of listing pages = ceil(total / page_size), the same formula as get_pagination
total = pagination.get('total')
page_size = pagination.get('page_size')
offset = pagination.get('offset')
math.ceil(total / page_size)

9

In [94]:
def chunks(lst, n):
    """Lazily yield consecutive slices of lst, each at most n items long."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [93]:
# Demo of chunks(): split an 11-item list into chunks of 3 and print each chunk
l = [1,2,3,4,5,5,6,0,7,8,8]
l_list = list(chunks(l, 3))
for i in l_list:
    print(i)

[1, 2, 3]
[4, 5, 5]
[6, 0, 7]
[8, 8]
